feat: Add synthetic dataset generation for AVAP using MBPP dataset

- Implemented a new script `translate_mbpp.py` to generate synthetic datasets using various LLM providers.
- Integrated the `get_prompt_mbpp` function in `prompts.py` to create prompts tailored for AVAP language conversion.
2026-03-09 17:43:07 +01:00 · 2026-03-09 17:43:07 +01:00 · a9bf84fa79
parent f6bfba5561
commit a9bf84fa79
5 changed files with 3401 additions and 58 deletions
--- a/docs/avap.txt
+++ b/docs/avap.txt
--- a/scripts/pipelines/flows/dataset_synthetic_generation.py
+++ b/scripts/pipelines/flows/dataset_synthetic_generation.py
@ -1,58 +0,0 @@
-import json
-from datasets import load_dataset
-import boto3
-import typer
-import logging
-from botocore.config import Config
-from langchain_core.messages import SystemMessage, HumanMessage
-from src.utils.llm_factory import create_chat_model
-from src.config import RAW_DIR, INTERIM_DIR
-
-
-logger = logging.getLogger(__name__)
-app = typer.Typer()
-
-config = Config(
-    region_name="us-east-1",
-    connect_timeout=10,     
-    read_timeout=600,        
-)
-
-client = boto3.client("bedrock-runtime", config=config)
-
-llm = create_chat_model(
-    provider="bedrock",
-    client=client,
-    model="global.anthropic.claude-sonnet-4-6",
-    temperature=0,
-)
-
-dataset_full = load_dataset("mbpp")
-
-
-random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))
-
-test_samples_dict = {
-    str(i): {
-        "text": sample["text"],
-        "code": sample["code"],
-    }
-    for i, sample in enumerate(random_test_samples)
-}
-
-llm_response = llm.invoke([PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))])
-
-json_str = llm_response.content.removeprefix("```json").removesuffix("```").strip()
-synthetic_data = json.loads(json_str)
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
-    )
-    try:
-        app()
-    except Exception as exc:
-        logger.exception(exc)
-        raise
--- a/scripts/pipelines/flows/translate_mbpp.py
+++ b/scripts/pipelines/flows/translate_mbpp.py
@ -0,0 +1,137 @@
+import json
+from enum import Enum
+from datasets import load_dataset
+import boto3
+import typer
+from loguru import logger
+from botocore.config import Config
+from pathlib import Path
+from langchain_core.messages import SystemMessage, HumanMessage
+from src.utils.llm_factory import create_chat_model
+from src.config import RAW_DIR, INTERIM_DIR
+from scripts.pipelines.input.prompts import get_prompt_mbpp
+
+
+app = typer.Typer()
+
+
+class Provider(str, Enum):  # LLM backends accepted by the `provider` option; the str mixin makes every member a plain string as well
+    bedrock = "bedrock"  # default provider of generate_synthetic_dataset
+    openai = "openai"  # each value is also used in the output file name: synthetic_data_<value>.json
+    ollama = "ollama"
+
+
+@app.command()
+def generate_synthetic_dataset(
+    provider: Provider = Provider.bedrock,
+    model: str = "global.anthropic.claude-sonnet-4-6",
+    temperature: float = 0.0,
+    num_samples: int = 10,
+    seed: int = 42,
+    context_docs_path: str = "docs/avap.txt",
+    synthetic_output_path: str = "synthetic_datasets",
+) -> None:
+    """Generate synthetic dataset using the specified LLM."""
+    logger.info("🚀 Starting synthetic dataset generation pipeline")
+    logger.info(
+        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
+    )
+
+    try:
+        config = Config(
+            connect_timeout=10,
+            read_timeout=600,
+        )
+        client = boto3.client("bedrock-runtime", config=config)
+        logger.info("✓ Bedrock client initialized successfully")
+
+        # Create LLM instance with specified parameters
+        logger.debug(f"Creating LLM instance with provider: {provider}")
+        llm = create_chat_model(
+            provider=provider,
+            client=client,
+            model=model,
+            temperature=temperature,
+        )
+        logger.info(f"✓ LLM initialized: {model}")
+
+        # Load MBPP dataset
+        logger.debug("Loading MBPP dataset")
+        dataset_full = load_dataset("mbpp")
+        logger.info("✓ MBPP dataset loaded successfully")
+
+        # Select random test samples for synthetic generation
+        logger.debug(f"Selecting {num_samples} random test samples from MBPP dataset")
+        random_test_samples = (
+            dataset_full["test"].shuffle(seed=seed).select(range(num_samples))
+        )
+        logger.info(f"✓ Selected {len(random_test_samples)} test samples")
+
+        # Prepare test samples dictionary
+        logger.debug("Preparing test samples dictionary")
+        test_samples_dict = {
+            str(i): {
+                "text": sample["text"],
+                "code": sample["code"],
+            }
+            for i, sample in enumerate(random_test_samples)
+        }
+        logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")
+
+        # Load AVAP documentation
+
+        logger.debug(f"Loading AVAP documentation from {context_docs_path}")
+        with open(context_docs_path, "r") as f:
+            avap_docs = f.read()
+        logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")
+
+        # Generate prompt with AVAP context
+        logger.debug("Generating prompt with AVAP context")
+        get_prompt_mbpp_func = get_prompt_mbpp(avap_docs=avap_docs)
+        logger.debug("✓ Prompt generated successfully")
+
+        # Invoke LLM to generate synthetic data
+        logger.info("Invoking LLM to generate synthetic dataset...")
+        llm_response = llm.invoke(
+            [get_prompt_mbpp_func, HumanMessage(content=str(test_samples_dict))]
+        )
+        logger.info("✓ LLM response received")
+
+        # Parse JSON response
+        logger.debug("Parsing LLM response as JSON")
+        json_str = (
+            llm_response.content.removeprefix("```json").removesuffix("```").strip()
+        )
+        synthetic_data = json.loads(json_str)
+        logger.info(
+            f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
+        )
+
+        logger.info(
+            f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
+        )
+
+        output_dir = Path(synthetic_output_path)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        output_file = output_dir / f"synthetic_data_{provider.value}.json"
+        with output_file.open("w", encoding="utf-8") as f:
+            json.dump(synthetic_data, f, ensure_ascii=False, indent=2)
+
+        return synthetic_data
+
+    except Exception as exc:
+        logger.error(f"Error during synthetic dataset generation: {exc}")
+        logger.exception("Full exception traceback")
+        raise
+
+
+if __name__ == "__main__":
+    logger.info("=" * 50)
+    logger.info("Dataset Synthetic Generation Pipeline")
+    logger.info("=" * 50)
+    try:
+        app()
+    except Exception as exc:
+        logger.error(f"Pipeline execution failed: {exc}")
+        raise
--- a/scripts/pipelines/input/prompts.py
+++ b/scripts/pipelines/input/prompts.py
@ -0,0 +1,31 @@
+from langchain_core.messages import SystemMessage
+
+
+def get_prompt_mbpp(avap_docs: str) -> SystemMessage:
+    return SystemMessage(
+        content=f"""
+        You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).
+
+        You will receive a JSON object containing programming problems and their solutions written in Python.
+
+        Your task is to:
+        1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.
+        2. Replace the Python solution with a correct implementation written in AVAP.
+
+        Requirements:
+        - The AVAP solution must follow AVAP syntax.
+        - Use AVAP libraries and constructs when appropriate.
+        - Functions in AVAP are declared using the `function` keyword.
+        - Preserve the original problem intent and difficulty.
+        - Do not produce Python code in the final solution.
+
+        Documentation usage rules:
+        - You MUST rely exclusively on the AVAP documentation provided below.
+        - Do not invent syntax, functions, or libraries that are not described in the documentation.
+        - If the documentation does not contain enough information to solve the problem, return exactly:
+        "I don't know how to answer this question based on the provided documentation."
+
+        AVAP Documentation:
+        {avap_docs}
+        """
+    )
--- a/synthetic_datasets/synthetic_data_bedrock.json
+++ b/synthetic_datasets/synthetic_data_bedrock.json
@ -0,0 +1,42 @@
+{
+  "0": {
+    "text": "Write a function to remove the matching tuples from the given two lists.",
+    "code": "function remove_matching_tuple(test_list1, test_list2){\n    // Result list to store non-matching items\n    result = []\n    getListLen(test_list1, len1)\n    getListLen(test_list2, len2)\n    startLoop(i, 0, len1)\n        itemFromList(test_list1, i, current_item)\n        found = False\n        startLoop(j, 0, len2)\n            itemFromList(test_list2, j, check_item)\n            if(current_item, check_item, \"=\")\n                found = True\n            end()\n        endLoop()\n        if(found, False, \"=\")\n            variableToList(current_item, result)\n        end()\n    endLoop()\n    return(result)\n}"
+  },
+  "1": {
+    "text": "Write a function to find the number of elements present in the given list.",
+    "code": "function find_lists(input_list){\n    getListLen(input_list, list_len)\n    return(list_len)\n}"
+  },
+  "2": {
+    "text": "Write a function to find the first natural number whose factorial is divisible by x.",
+    "code": "function first_Factorial_Divisible_Number(x){\n    i = 1\n    fact = 1\n    result = 1\n    startLoop(i, 1, x)\n        fact = fact * i\n        remainder = fact % x\n        if(remainder, 0, \"=\")\n            result = i\n            i = x\n        end()\n    endLoop()\n    return(result)\n}"
+  },
+  "3": {
+    "text": "Write a function to find the largest number that can be formed with the given list of digits.",
+    "code": "function find_Max_Num(arr){\n    // Sort the array in descending order\n    getListLen(arr, n)\n    // Bubble sort descending\n    startLoop(i, 0, n)\n        startLoop(j, 0, n)\n            itemFromList(arr, j, val_j)\n            next_idx = j + 1\n            itemFromList(arr, next_idx, val_next)\n            if(val_j, val_next, \"<\")\n                // Swap\n                arr[j] = val_next\n                arr[next_idx] = val_j\n            end()\n        endLoop()\n    endLoop()\n    // Build the number\n    itemFromList(arr, 0, num)\n    startLoop(k, 1, n)\n        itemFromList(arr, k, digit)\n        num = num * 10 + digit\n    endLoop()\n    return(num)\n}"
+  },
+  "4": {
+    "text": "Write a function to check if the triangle is equilateral or not.",
+    "code": "function check_equilateral(x, y, z){\n    if(x, y, \"=\")\n        if(y, z, \"=\")\n            return(True)\n        end()\n    end()\n    return(False)\n}"
+  },
+  "5": {
+    "text": "Write a function to count the occurrences of the first element of each tuple in the given list of tuples.",
+    "code": "function sort_on_occurence(lst){\n    // Build a JSON object counting occurrences of first elements\n    counts = {}\n    getListLen(lst, lst_len)\n    startLoop(i, 0, lst_len)\n        itemFromList(lst, i, current_tuple)\n        itemFromList(current_tuple, 0, key)\n        variableFromJSON(counts, key, existing)\n        if(existing, None, \"=\")\n            AddvariableToJSON(key, 1, counts)\n        else()\n            new_count = existing + 1\n            AddvariableToJSON(key, new_count, counts)\n        end()\n    endLoop()\n    return(counts)\n}"
+  },
+  "6": {
+    "text": "Write a function to check if a given number is one less than twice its reverse.",
+    "code": "function rev(num){\n    rev_num = 0\n    startLoop(i, 1, num)\n        if(num, 0, \">\")\n            remainder = num % 10\n            rev_num = rev_num * 10 + remainder\n            num = num // 10\n        end()\n    endLoop()\n    return(rev_num)\n}\n\nfunction check(n){\n    reversed_n = rev(n)\n    twice_rev = 2 * reversed_n\n    expected = n + 1\n    if(twice_rev, expected, \"=\")\n        return(True)\n    end()\n    return(False)\n}"
+  },
+  "7": {
+    "text": "Write a function to convert a list of multiple integers into a single integer.",
+    "code": "function multiple_to_single(L){\n    getListLen(L, list_len)\n    result = 0\n    startLoop(i, 0, list_len)\n        itemFromList(L, i, digit)\n        result = result * 10 + digit\n    endLoop()\n    return(result)\n}"
+  },
+  "8": {
+    "text": "Write a function that checks if a word exists at the end of a string, with optional punctuation.",
+    "code": "function text_match_word(text){\n    // Use getRegex to find a word at the end of the string with optional punctuation\n    pattern = \"\\\\w+\\\\S*$\"\n    getRegex(text, pattern, match_result)\n    if(match_result, None, \"!=\")\n        addVar(output, \"Found a match!\")\n    else()\n        addVar(output, \"Not matched!\")\n    end()\n    return(output)\n}"
+  },
+  "9": {
+    "text": "Write a function to find the sum of numbers in a list between the indices of a specified range.",
+    "code": "function sum_range_list(list1, m, n){\n    sum_range = 0\n    end_idx = n + 1\n    startLoop(i, m, end_idx)\n        itemFromList(list1, i, current_val)\n        sum_range = sum_range + current_val\n    endLoop()\n    return(sum_range)\n}"
+  }
+}