"""Generate synthetic datasets for AVAP (Advanced Virtual API Programming).

Loads AVAP documentation and code samples (plus the MBPP benchmark), builds
prompts, and asks a Bedrock-hosted Claude model to produce either:
- an MBPP-to-AVAP translated problem set (when ``mbpp`` is True), or
- golden Q&A datasets for code retrieval and documentation RAG evaluation.

Results are written as JSON under ``INTERIM_DIR / "synthethic_datasets"``.
"""
import json
from datasets import load_dataset

import boto3
from botocore.config import Config
from langchain_core.messages import SystemMessage, HumanMessage

from src.utils.llm_factory import create_chat_model
from src.config import settings
from src.config import RAW_DIR, INTERIM_DIR, EXTERNAL_DIR  # EXTERNAL_DIR currently unused; kept for parity

# Toggle between the MBPP-translation pipeline (True) and the
# golden-dataset pipelines for code retrieval + docs RAG (False).
mbpp = False

# --- Create llm instance ----------------------------------------------------

config = Config(
    region_name="us-east-1",
    connect_timeout=10,
    read_timeout=600,  # generous read timeout: dataset-sized completions are slow
)

client = boto3.client("bedrock-runtime", config=config)

llm = create_chat_model(
    provider="bedrock",
    client=client,
    model="global.anthropic.claude-sonnet-4-6",
    temperature=0,  # deterministic generations for a reproducible dataset
)

# --- Load mbpp data ----------------------------------------------------------

dataset_full = load_dataset("mbpp")

# --- Load AVAP data ----------------------------------------------------------

with open(RAW_DIR / "combined.txt", "r", encoding="utf-8") as f:
    avap_docs = f.read()

samples_dir = settings.proj_root / "docs/samples"
avap_samples = []

for avap_file in sorted(samples_dir.glob("*.avap")):
    with open(avap_file, "r", encoding="utf-8") as f:
        code = f.read()

    avap_samples.append({
        "file": avap_file.name,
        "code": code
    })

# Serialize once so the samples can be interpolated into the prompt below.
avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)
print(f"Loaded {len(avap_samples)} AVAP samples")

# --- Random sample mbpp -------------------------------------------------------

# Sample 50 random examples from the test split (seeded for reproducibility).
random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))

# Keep only the fields the prompt needs, keyed by sample index.
test_samples_dict = {
    str(i): {
        "text": sample["text"],
        "code": sample["code"],
    }
    for i, sample in enumerate(random_test_samples)
}

# --- Prompts ------------------------------------------------------------------

PROMPT_MBPP = SystemMessage(
    content=f"""
    You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).

    You will receive a JSON object containing programming problems and their solutions written in Python.

    Your task is to:
    1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.
    2. Replace the Python solution with a correct implementation written in AVAP.

    Requirements:
    - The AVAP solution must follow AVAP syntax.
    - Use AVAP libraries and constructs when appropriate.
    - Functions in AVAP are declared using the `function` keyword.
    - Preserve the original problem intent and difficulty.
    - Do not produce Python code in the final solution.

    Documentation usage rules:
    - You MUST rely exclusively on the AVAP documentation provided below.
    - Do not invent syntax, functions, or libraries that are not described in the documentation.
    - If the documentation does not contain enough information to solve the problem, return exactly:
    "I don't know how to answer this question based on the provided documentation."

    AVAP Documentation:
    {avap_docs}
    """
)

# NOTE: in the example blocks below, braces are doubled ({{ }}) and backslashes
# escaped (\\" \\n) so the f-string renders them literally. Unescaped, the
# `{...}` examples were parsed as f-string replacement fields: a SyntaxError on
# Python 3.11 (backslash in expression part) and an unintended dict-repr
# interpolation on newer interpreters.
GOLDEN_DATASET_CODE_PROMPT = HumanMessage(
    content=f"""
You are an expert AI assistant for code understanding and dataset generation.

Your task is to generate a high-quality golden dataset for AVAP code retrieval and coding-question evaluation.

Input:
You will receive `avap_samples_json`, a JSON array of AVAP code samples. Besides the code, you also have access to `avap_docs`, a markdown document containing comprehensive documentation about AVAP:

AVAP Documentation:
{avap_docs}

AVAP Code Samples:
{avap_samples_json}

Each item contains:
- `file`: the filename
- `code`: the full code content

Goal:
For each code sample, generate 2 realistic developer questions that can be answered ONLY by retrieving that specific code sample.

Output:
Return a valid JSON array.
Each element in the array must contain exactly these fields:
- `id`: a unique string identifier
- `question`: a natural-language developer question
- `ground_truth`: the exact AVAP code snippet that answers the question

Critical requirement:
- The `ground_truth` MUST be the relevant code (not an explanation).
- The code must be copied EXACTLY from the original sample (no modifications, no summarization, no reformatting).
- Do NOT generate pseudocode or partial reconstructions.
- Do NOT explain the code in `ground_truth`.

Ground-truth rules:
- Prefer returning the minimal self-contained snippet that answers the question.
- If the logic depends on multiple lines, include all necessary lines.
- Do not include unrelated parts of the file.
- Preserve formatting, indentation, and syntax exactly as in the input.

Question-writing guidelines:
Generate natural developer-style questions such as:
- "How do you return a value based on a language parameter?"
- "How is a conditional used to set a variable depending on input?"
- "Show an example where a parameter is read and a result is returned"
- "How do you assign a variable conditionally in AVAP?"
- "How is a response built after evaluating an input condition?"

Good questions should:
- Clearly require THIS specific code to answer
- Be specific enough to retrieve the correct snippet
- Focus on behavior, logic, or patterns (not trivial syntax)
- Sound like real developer queries

Avoid:
- Referencing filenames or dataset structure
- Asking generic questions that apply to many files
- Copying code into the question
- Using unnecessary variable names unless essential
- Asking about things not present in the code

Diversity requirements:
For each file, vary the questions:
- one about input handling or parameters
- one about control flow (if, loops, conditions)

Language:
- Use English and Spanish alternately, in order to evaluate both languages
- If the code is clearly Spanish-oriented, you may use Spanish
- Keep question language consistent

ID requirements:
- Generate unique IDs using: `_q1`, `_q2`

Process:
1. Analyze the code sample
2. Identify key behaviors (inputs, conditions, outputs, transformations)
3. Generate 2 realistic developer questions
4. Extract the exact code snippet that answers each question
5. Output the dataset

Example:

Input code:
addParam("lang", l)
if(l, "es", "=")
  addVar(msg, "Hola")
end()
addResult(msg)

Output:
[
  {{
    "id": "example_q1",
    "question": "How do you return a different message based on a language parameter?",
    "ground_truth": "addParam(\\"lang\\", l)\\nif(l, \\"es\\", \\"=\\")\\n  addVar(msg, \\"Hola\\")\\nend()\\naddResult(msg)"
  }},
  {{
    "id": "example_q2",
    "question": "How is a conditional used to assign a variable before returning it?",
    "ground_truth": "if(l, \\"es\\", \\"=\\")\\n  addVar(msg, \\"Hola\\")\\nend()\\naddResult(msg)"
  }}
]

Final constraints:
- Output ONLY the JSON array
- Do not include explanations or markdown
- Ensure valid JSON (escaped strings, proper formatting)
    """
)

# Braces in the example block are doubled for the same reason as above.
GOLDEN_DATASET_DOCS_PROMPT = HumanMessage(
    content=f"""
You are an expert AI assistant for technical documentation understanding and dataset generation.

Your task is to generate a high-quality golden dataset for evaluating RAG systems over AVAP documentation.

Input:
You will receive `avap_docs`, a collection of documentation chunks or sections describing the AVAP language (syntax, semantics, commands, behavior, etc.).

AVAP Documentation:
{avap_docs}

Goal:
For this documentation, generate 100 realistic developer questions that can be answered ONLY using that specific documentation content.

Output:
Return a valid JSON array.
Each element must contain exactly these fields:
- "id": a unique string identifier
- "question": a natural-language developer question
- "ground_truth": a concise textual answer derived strictly from the documentation

Critical requirements:
- This dataset is for **documentation RAG evaluation (Ragas)**.
- Each question must be answerable using ONLY one documentation chunk.
- The `ground_truth` must be a **textual explanation**, not code.
- Do NOT invent information not explicitly present in the documentation.
- Do NOT rely on external knowledge of AVAP.

Ground-truth guidelines:
- Must directly answer the question
- Must be faithful to the documentation
- Should be concise but complete
- Prefer paraphrasing over copying verbatim
- Avoid quoting large blocks unless necessary
- No speculation or assumptions

Question-writing guidelines:
Generate realistic developer questions such as:
- conceptual understanding ("What does X do in AVAP?")
- syntax questions ("How is X defined?")
- behavior questions ("What happens when X is used?")
- usage questions ("When should X be used?")
- constraints ("What are the requirements for X?")
- comparisons (ONLY if explicitly covered in the chunk)

Good questions should:
- sound like real developer queries
- be specific enough to retrieve the correct chunk
- depend on the actual content of the documentation
- not be answerable from general programming knowledge alone

Avoid:
- generic questions like “What is AVAP?”
- questions requiring multiple documents
- referencing “the documentation” or “this section”
- copying text into the question
- trivial or overly obvious questions

Diversity requirements:
For each documentation chunk, vary the questions:
- one about definition or purpose
- one about behavior or rules
- one about usage or constraints (if applicable)

Language:
- Use English and Spanish alternately, in order to evaluate both languages
- If the documentation is clearly in Spanish, generate both question and answer in Spanish
- Keep language consistent within each pair

ID requirements:
- Use format: `_q1`, `_q2`, `_q3`, ...
- Ensure uniqueness across all entries

Process:
1. Read the documentation chunk carefully
2. Identify key concepts, rules, and behaviors
3. Generate 100 realistic developer questions, knowing i have only this chunk to answer them. One chunk with all the documentation content, generate 100 questions that can be answered only with this documentation. Each question must be answerable using only this documentation content.
4. Write precise ground-truth answers based only on that chunk
5. Output the dataset

Example:

Input documentation:
"The addParam function is used to define an input parameter for an endpoint. It takes the parameter name and assigns it to a variable that can be used later in the code."

Output:
[
  {{
    "id": "doc1_q1",
    "question": "What is the purpose of addParam in AVAP?",
    "ground_truth": "The addParam function defines an input parameter for an endpoint and assigns its value to a variable for later use in the code."
  }},
  {{
    "id": "doc1_q2",
    "question": "How does addParam make input values accessible in AVAP code?",
    "ground_truth": "It assigns the value of the input parameter to a variable, which can then be used throughout the code."
  }}
]

Final constraints:
- Output ONLY the JSON array
- Do not include explanations or markdown
- Ensure valid JSON formatting
    """
)

PROMPT_NO_MBPP = HumanMessage(
    content="""
    I gave you all the context for a new programming language called AVAP (Advanced Virtual API Programming). 
    With that context, I'd like you to generate a "golden dataset" of 100 question and answer where the user query is asking code snippets and the response is the full code. 
    I want the questions and answers to cover the entire context provided, and I want there to be questions and answers at all possible levels, from questions with simple code snippets to more complex questions with more complex code. 
    
    You must return a .json file with user_query and response for each question and answer
    """
)

# --- Generate dataset ---------------------------------------------------------

def _parse_model_json(content):
    """Strip an optional ```json fence from a model reply and parse it."""
    return json.loads(content.removeprefix("```json").removesuffix("```").strip())


def _generate_and_save(messages, out_path):
    """Invoke the LLM with `messages`, echo the reply, and save parsed JSON.

    Raises json.JSONDecodeError if the model did not return valid JSON.
    """
    llm_response = llm.invoke(messages)
    print(llm_response.content)
    synthetic_data = _parse_model_json(llm_response.content)
    # ensure_ascii=False keeps the deliberately-Spanish content readable on disk.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(synthetic_data, f, ensure_ascii=False)


# NOTE(review): "synthethic" spelling kept — it matches the existing directory
# and file names this notebook already writes to.
if mbpp:
    # NOTE(review): str(test_samples_dict) sends a Python repr to the model;
    # json.dumps would be cleaner — confirm before changing prompt content.
    _generate_and_save(
        [PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))],
        INTERIM_DIR / 'synthethic_datasets/synthethic_data.json',
    )
else:
    _generate_and_save(
        [SystemMessage(content=avap_docs), GOLDEN_DATASET_CODE_PROMPT],
        INTERIM_DIR / 'synthethic_datasets/synthethic_data_code.json',
    )
    _generate_and_save(
        [SystemMessage(content=avap_docs), GOLDEN_DATASET_DOCS_PROMPT],
        INTERIM_DIR / 'synthethic_datasets/synthethic_data_docs.json',
    )