feat: Add synthetic dataset generation for AVAP using MBPP dataset
- Implemented a new script `translate_mbpp.py` to generate synthetic datasets using various LLM providers.
- Integrated the `get_prompt_mbpp` function in `prompts.py` to create prompts tailored for AVAP language conversion.
This commit is contained in:
parent
f6bfba5561
commit
a9bf84fa79
File diff suppressed because it is too large
Load Diff
|
|
@ -1,58 +0,0 @@
|
|||
import json
|
||||
from datasets import load_dataset
|
||||
import boto3
|
||||
import typer
|
||||
import logging
|
||||
from botocore.config import Config
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from src.utils.llm_factory import create_chat_model
|
||||
from src.config import RAW_DIR, INTERIM_DIR
|
||||
|
||||
|
||||
# Removed version of the MBPP -> AVAP translation script (deleted side of the diff).
# Everything below runs at import time: the statements are at module level, not
# inside a function or a Typer command.
logger = logging.getLogger(__name__)
# NOTE(review): no command is registered on `app` in the visible lines.
app = typer.Typer()

# botocore settings for the Bedrock client: fixed region plus connect/read timeouts.
config = Config(
    region_name="us-east-1",
    connect_timeout=10,
    read_timeout=600,
)

client = boto3.client("bedrock-runtime", config=config)

# Chat model built by the project factory, hard-wired to Bedrock at temperature 0.
llm = create_chat_model(
    provider="bedrock",
    client=client,
    model="global.anthropic.claude-sonnet-4-6",
    temperature=0,
)

dataset_full = load_dataset("mbpp")

# Fixed seed: the same 50 rows of the "test" split are picked on every run.
random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))

# Keep only the problem statement and reference solution, keyed by position ("0", "1", ...).
test_samples_dict = {
    str(i): {
        "text": sample["text"],
        "code": sample["code"],
    }
    for i, sample in enumerate(random_test_samples)
}

# NOTE(review): PROMPT_MBPP is neither defined nor imported in the visible lines.
# The samples are sent as the Python repr of the dict (str(...)), not as JSON.
llm_response = llm.invoke([PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))])

# Remove a surrounding ```json ... ``` fence, then parse the reply as JSON.
# The prefix/suffix removal happens before strip(), so it only matches when the
# reply starts and ends exactly with the fence markers.
json_str = llm_response.content.removeprefix("```json").removesuffix("```").strip()
synthetic_data = json.loads(json_str)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        app()
    except Exception as exc:
        # Log with traceback, then re-raise so the failure still propagates.
        logger.exception(exc)
        raise
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
import json
|
||||
from enum import Enum
|
||||
from datasets import load_dataset
|
||||
import boto3
|
||||
import typer
|
||||
from loguru import logger
|
||||
from botocore.config import Config
|
||||
from pathlib import Path
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from src.utils.llm_factory import create_chat_model
|
||||
from src.config import RAW_DIR, INTERIM_DIR
|
||||
from scripts.pipelines.input.prompts import get_prompt_mbpp
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
class Provider(str, Enum):
    """LLM backends that can be chosen for synthetic dataset generation.

    The ``str`` mix-in makes each member compare equal to its plain string value.
    """

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"
|
||||
|
||||
|
||||
def _extract_json_payload(raw_response: str) -> str:
    """Return the JSON text of an LLM reply without its Markdown code fence."""
    # Trim first: a leading newline or trailing whitespace would otherwise stop
    # removeprefix/removesuffix from matching the fence markers.
    payload = raw_response.strip()
    payload = payload.removeprefix("```json").removeprefix("```")
    return payload.removesuffix("```").strip()


@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "docs/avap.txt",
    synthetic_output_path: str = "synthetic_datasets",
) -> dict:
    """Generate a synthetic AVAP dataset from MBPP samples using the specified LLM.

    Args:
        provider: LLM backend handed to ``create_chat_model``.
        model: Model identifier handed to ``create_chat_model``.
        temperature: Sampling temperature handed to ``create_chat_model``.
        num_samples: Number of rows taken from the shuffled MBPP "test" split;
            capped at the size of the split.
        seed: Shuffle seed, so the same rows are selected on every run.
        context_docs_path: Text file with the AVAP documentation embedded in the prompt.
        synthetic_output_path: Directory that receives ``synthetic_data_<provider>.json``.

    Returns:
        The parsed JSON produced by the LLM (also written to the output file).

    Raises:
        ValueError: If ``num_samples`` is smaller than 1, or if the LLM reply is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    logger.info("🚀 Starting synthetic dataset generation pipeline")
    # Log the plain value: formatting a str-mixin Enum member differs between
    # Python versions ("bedrock" vs "Provider.bedrock").
    logger.info(
        f"Configuration - Provider: {provider.value}, Model: {model}, "
        f"Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
    )

    try:
        if num_samples < 1:
            raise ValueError(f"num_samples must be at least 1, got {num_samples}")

        # Only Bedrock needs a boto3 client; creating one for the other providers
        # would demand AWS region/credentials that those runs do not use.
        client = None
        if provider is Provider.bedrock:
            config = Config(
                connect_timeout=10,
                read_timeout=600,
            )
            client = boto3.client("bedrock-runtime", config=config)
            logger.info("✓ Bedrock client initialized successfully")

        # Create LLM instance with specified parameters
        logger.debug(f"Creating LLM instance with provider: {provider.value}")
        # NOTE(review): assumes create_chat_model ignores `client` for
        # non-Bedrock providers - confirm against src.utils.llm_factory.
        llm = create_chat_model(
            provider=provider.value,
            client=client,
            model=model,
            temperature=temperature,
        )
        logger.info(f"✓ LLM initialized: {model}")

        # Load MBPP dataset
        logger.debug("Loading MBPP dataset")
        dataset_full = load_dataset("mbpp")
        logger.info("✓ MBPP dataset loaded successfully")

        # Select random test samples for synthetic generation
        test_split = dataset_full["test"]
        sample_count = min(num_samples, len(test_split))
        if sample_count < num_samples:
            logger.warning(
                f"Requested {num_samples} samples but the test split only has "
                f"{len(test_split)}; using {sample_count}"
            )
        logger.debug(f"Selecting {sample_count} random test samples from MBPP dataset")
        random_test_samples = test_split.shuffle(seed=seed).select(range(sample_count))
        logger.info(f"✓ Selected {len(random_test_samples)} test samples")

        # Keep only the problem statement and reference solution, keyed by position.
        logger.debug("Preparing test samples dictionary")
        test_samples_dict = {
            str(i): {
                "text": sample["text"],
                "code": sample["code"],
            }
            for i, sample in enumerate(random_test_samples)
        }
        logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

        # Load AVAP documentation (explicit encoding: the platform default may not be UTF-8)
        logger.debug(f"Loading AVAP documentation from {context_docs_path}")
        with open(context_docs_path, "r", encoding="utf-8") as f:
            avap_docs = f.read()
        logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

        # Generate prompt with AVAP context
        logger.debug("Generating prompt with AVAP context")
        system_prompt = get_prompt_mbpp(avap_docs=avap_docs)
        logger.debug("✓ Prompt generated successfully")

        # The system prompt announces a JSON object, so send real JSON rather than
        # the Python repr that str(dict) would produce.
        samples_json = json.dumps(test_samples_dict, ensure_ascii=False)

        # Invoke LLM to generate synthetic data
        logger.info("Invoking LLM to generate synthetic dataset...")
        llm_response = llm.invoke([system_prompt, HumanMessage(content=samples_json)])
        logger.info("✓ LLM response received")

        # Parse JSON response
        logger.debug("Parsing LLM response as JSON")
        json_str = _extract_json_payload(llm_response.content)
        try:
            synthetic_data = json.loads(json_str)
        except json.JSONDecodeError:
            # The prompt allows a plain-text fallback answer; show what came back.
            logger.error(f"LLM response is not valid JSON; it starts with: {json_str[:200]!r}")
            raise
        logger.info(
            f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
        )

        output_dir = Path(synthetic_output_path)
        output_dir.mkdir(parents=True, exist_ok=True)

        output_file = output_dir / f"synthetic_data_{provider.value}.json"
        with output_file.open("w", encoding="utf-8") as f:
            json.dump(synthetic_data, f, ensure_ascii=False, indent=2)
        logger.info(f"✓ Synthetic dataset written to {output_file}")

        # Report success only once the file has actually been written.
        logger.info(
            f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
        )

        return synthetic_data

    except Exception as exc:
        # logger.exception records the message together with the traceback.
        logger.exception(f"Error during synthetic dataset generation: {exc}")
        raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Print a three-line banner, then hand control to the Typer application.
    banner_rule = "=" * 50
    for banner_line in (banner_rule, "Dataset Synthetic Generation Pipeline", banner_rule):
        logger.info(banner_line)

    try:
        app()
    except Exception as exc:
        # Record the failure and let it propagate to the caller unchanged.
        logger.error(f"Pipeline execution failed: {exc}")
        raise
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
from langchain_core.messages import SystemMessage
|
||||
|
||||
|
||||
def get_prompt_mbpp(avap_docs: str) -> SystemMessage:
    """Build the system prompt for converting MBPP problems into AVAP.

    The prompt tells the model to rewrite each Python problem/solution pair for
    AVAP, to rely only on the supplied documentation, and to answer with a JSON
    object that mirrors the input keys so the reply can be parsed as JSON.

    Args:
        avap_docs: AVAP language documentation, embedded verbatim at the end of
            the prompt.

    Returns:
        A ``SystemMessage`` whose content is the complete prompt text.
    """
    return SystemMessage(
        content=f"""
You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).

You will receive a JSON object containing programming problems and their solutions written in Python.

Your task is to:
1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.
2. Replace the Python solution with a correct implementation written in AVAP.

Requirements:
- The AVAP solution must follow AVAP syntax.
- Use AVAP libraries and constructs when appropriate.
- Functions in AVAP are declared using the `function` keyword.
- Preserve the original problem intent and difficulty.
- Do not produce Python code in the final solution.

Documentation usage rules:
- You MUST rely exclusively on the AVAP documentation provided below.
- Do not invent syntax, functions, or libraries that are not described in the documentation.
- If the documentation does not contain enough information to solve the problem, return exactly:
"I don't know how to answer this question based on the provided documentation."

Output format:
- Return ONLY a valid JSON object, with no explanations before or after it.
- Use the same keys as the input JSON object.
- Each value must be an object with exactly two string fields: "text" (the problem statement) and "code" (the AVAP solution).

AVAP Documentation:
{avap_docs}
"""
    )
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
{
|
||||
"0": {
|
||||
"text": "Write a function to remove the matching tuples from the given two lists.",
|
||||
"code": "function remove_matching_tuple(test_list1, test_list2){\n // Result list to store non-matching items\n result = []\n getListLen(test_list1, len1)\n getListLen(test_list2, len2)\n startLoop(i, 0, len1)\n itemFromList(test_list1, i, current_item)\n found = False\n startLoop(j, 0, len2)\n itemFromList(test_list2, j, check_item)\n if(current_item, check_item, \"=\")\n found = True\n end()\n endLoop()\n if(found, False, \"=\")\n variableToList(current_item, result)\n end()\n endLoop()\n return(result)\n}"
|
||||
},
|
||||
"1": {
|
||||
"text": "Write a function to find the number of elements present in the given list.",
|
||||
"code": "function find_lists(input_list){\n getListLen(input_list, list_len)\n return(list_len)\n}"
|
||||
},
|
||||
"2": {
|
||||
"text": "Write a function to find the first natural number whose factorial is divisible by x.",
|
||||
"code": "function first_Factorial_Divisible_Number(x){\n i = 1\n fact = 1\n result = 1\n startLoop(i, 1, x)\n fact = fact * i\n remainder = fact % x\n if(remainder, 0, \"=\")\n result = i\n i = x\n end()\n endLoop()\n return(result)\n}"
|
||||
},
|
||||
"3": {
|
||||
"text": "Write a function to find the largest number that can be formed with the given list of digits.",
|
||||
"code": "function find_Max_Num(arr){\n // Sort the array in descending order\n getListLen(arr, n)\n // Bubble sort descending\n startLoop(i, 0, n)\n startLoop(j, 0, n)\n itemFromList(arr, j, val_j)\n next_idx = j + 1\n itemFromList(arr, next_idx, val_next)\n if(val_j, val_next, \"<\")\n // Swap\n arr[j] = val_next\n arr[next_idx] = val_j\n end()\n endLoop()\n endLoop()\n // Build the number\n itemFromList(arr, 0, num)\n startLoop(k, 1, n)\n itemFromList(arr, k, digit)\n num = num * 10 + digit\n endLoop()\n return(num)\n}"
|
||||
},
|
||||
"4": {
|
||||
"text": "Write a function to check if the triangle is equilateral or not.",
|
||||
"code": "function check_equilateral(x, y, z){\n if(x, y, \"=\")\n if(y, z, \"=\")\n return(True)\n end()\n end()\n return(False)\n}"
|
||||
},
|
||||
"5": {
|
||||
"text": "Write a function to count the occurrences of the first element of each tuple in the given list of tuples.",
|
||||
"code": "function sort_on_occurence(lst){\n // Build a JSON object counting occurrences of first elements\n counts = {}\n getListLen(lst, lst_len)\n startLoop(i, 0, lst_len)\n itemFromList(lst, i, current_tuple)\n itemFromList(current_tuple, 0, key)\n variableFromJSON(counts, key, existing)\n if(existing, None, \"=\")\n AddvariableToJSON(key, 1, counts)\n else()\n new_count = existing + 1\n AddvariableToJSON(key, new_count, counts)\n end()\n endLoop()\n return(counts)\n}"
|
||||
},
|
||||
"6": {
|
||||
"text": "Write a function to check if a given number is one less than twice its reverse.",
|
||||
"code": "function rev(num){\n rev_num = 0\n startLoop(i, 1, num)\n if(num, 0, \">\")\n remainder = num % 10\n rev_num = rev_num * 10 + remainder\n num = num // 10\n end()\n endLoop()\n return(rev_num)\n}\n\nfunction check(n){\n reversed_n = rev(n)\n twice_rev = 2 * reversed_n\n expected = n + 1\n if(twice_rev, expected, \"=\")\n return(True)\n end()\n return(False)\n}"
|
||||
},
|
||||
"7": {
|
||||
"text": "Write a function to convert a list of multiple integers into a single integer.",
|
||||
"code": "function multiple_to_single(L){\n getListLen(L, list_len)\n result = 0\n startLoop(i, 0, list_len)\n itemFromList(L, i, digit)\n result = result * 10 + digit\n endLoop()\n return(result)\n}"
|
||||
},
|
||||
"8": {
|
||||
"text": "Write a function that checks if a word exists at the end of a string, with optional punctuation.",
|
||||
"code": "function text_match_word(text){\n // Use getRegex to find a word at the end of the string with optional punctuation\n pattern = \"\\\\w+\\\\S*$\"\n getRegex(text, pattern, match_result)\n if(match_result, None, \"!=\")\n addVar(output, \"Found a match!\")\n else()\n addVar(output, \"Not matched!\")\n end()\n return(output)\n}"
|
||||
},
|
||||
"9": {
|
||||
"text": "Write a function to find the sum of numbers in a list between the indices of a specified range.",
|
||||
"code": "function sum_range_list(list1, m, n){\n sum_range = 0\n end_idx = n + 1\n startLoop(i, m, end_idx)\n itemFromList(list1, i, current_val)\n sum_range = sum_range + current_val\n endLoop()\n return(sum_range)\n}"
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue