Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev

This commit is contained in:
acano 2026-03-10 14:36:17 +01:00
commit 745ce07805
5 changed files with 3401 additions and 58 deletions

3191
docs/avap.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,58 +0,0 @@
import json
from datasets import load_dataset
import boto3
import typer
import logging
from botocore.config import Config
from langchain_core.messages import SystemMessage, HumanMessage
from src.utils.llm_factory import create_chat_model
from src.config import RAW_DIR, INTERIM_DIR
# Legacy flat pipeline: every statement below is a top-level statement, so the
# Bedrock client, the dataset download and the LLM call all run as soon as the
# module is executed or imported.
logger = logging.getLogger(__name__)
# NOTE(review): no @app.command() is registered anywhere in the visible lines,
# so this Typer app has nothing to dispatch to — confirm whether app() can run.
app = typer.Typer()
# Bedrock runtime client pinned to us-east-1; the read timeout is far larger
# than the connect timeout.
config = Config(
region_name="us-east-1",
connect_timeout=10,
read_timeout=600,
)
client = boto3.client("bedrock-runtime", config=config)
# Chat model built by the project factory with temperature 0.
llm = create_chat_model(
provider="bedrock",
client=client,
model="global.anthropic.claude-sonnet-4-6",
temperature=0,
)
# Load MBPP, shuffle its "test" split with a fixed seed and keep the first 50 rows.
dataset_full = load_dataset("mbpp")
random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))
# Map the stringified position of each sample to its "text" and "code" fields.
test_samples_dict = {
str(i): {
"text": sample["text"],
"code": sample["code"],
}
for i, sample in enumerate(random_test_samples)
}
# NOTE(review): PROMPT_MBPP is neither defined nor imported in the visible
# lines; as shown this statement would raise NameError — confirm its origin.
# The samples are sent as the str() of the dict, not as JSON text.
llm_response = llm.invoke([PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))])
# NOTE(review): .strip() runs after removeprefix/removesuffix, so a reply with
# leading or trailing whitespace around the ```json fence keeps its fence.
json_str = llm_response.content.removeprefix("```json").removesuffix("```").strip()
synthetic_data = json.loads(json_str)
if __name__ == "__main__":
# Configure root logging only when run as a script.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
try:
app()
except Exception as exc:
# Log with traceback, then re-raise so the failure still propagates.
logger.exception(exc)
raise

View File

@ -0,0 +1,137 @@
import json
from enum import Enum
from datasets import load_dataset
import boto3
import typer
from loguru import logger
from botocore.config import Config
from pathlib import Path
from langchain_core.messages import SystemMessage, HumanMessage
from src.utils.llm_factory import create_chat_model
from src.config import RAW_DIR, INTERIM_DIR
from scripts.pipelines.input.prompts import get_prompt_mbpp
app = typer.Typer()
class Provider(str, Enum):
    """Supported LLM provider identifiers.

    Mixing in ``str`` makes every member compare equal to its plain-string
    value, so a member can be used wherever the provider name is expected.
    """

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"
def _strip_json_fence(raw: str) -> str:
    """Return *raw* without an enclosing Markdown code fence (```json or ```)."""
    # Trim first: a leading newline or trailing whitespace would otherwise
    # prevent removeprefix/removesuffix from matching the fence markers.
    text = raw.strip()
    text = text.removeprefix("```json").removeprefix("```")
    return text.removesuffix("```").strip()


# Parameters (documented here rather than in the docstring, because Typer
# uses the function docstring as the command's --help text):
#   provider              LLM backend; a Provider member or its string value.
#   model                 Model identifier forwarded to create_chat_model.
#   temperature           Sampling temperature forwarded to create_chat_model.
#   num_samples           Number of rows taken from the shuffled MBPP test split.
#   seed                  Shuffle seed, so the selected rows are reproducible.
#   context_docs_path     Text file with the AVAP documentation for the prompt.
#   synthetic_output_path Directory receiving synthetic_data_<provider>.json.
# Returns the parsed JSON produced by the LLM (also written to disk).
# Any exception is logged with its traceback and re-raised.
@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "docs/avap.txt",
    synthetic_output_path: str = "synthetic_datasets",
) -> dict:
    """Generate synthetic dataset using the specified LLM."""
    logger.info("🚀 Starting synthetic dataset generation pipeline")
    logger.info(
        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
    )
    try:
        # Accept plain strings from programmatic callers as well as enum
        # members; an unknown provider fails here with a clear ValueError.
        provider = Provider(provider)

        llm_kwargs = {
            "provider": provider,
            "model": model,
            "temperature": temperature,
        }
        # Only Bedrock needs an AWS client. Building one for other providers
        # would demand AWS region/credentials that those users may not have.
        # NOTE(review): assumes create_chat_model treats `client` as optional
        # for non-Bedrock providers — confirm against src.utils.llm_factory.
        if provider is Provider.bedrock:
            config = Config(
                connect_timeout=10,
                read_timeout=600,
            )
            llm_kwargs["client"] = boto3.client("bedrock-runtime", config=config)
            logger.info("✓ Bedrock client initialized successfully")

        # Create LLM instance with specified parameters
        logger.debug(f"Creating LLM instance with provider: {provider}")
        llm = create_chat_model(**llm_kwargs)
        logger.info(f"✓ LLM initialized: {model}")

        # Load MBPP dataset
        logger.debug("Loading MBPP dataset")
        dataset_full = load_dataset("mbpp")
        logger.info("✓ MBPP dataset loaded successfully")

        # Select random test samples for synthetic generation
        logger.debug(f"Selecting {num_samples} random test samples from MBPP dataset")
        random_test_samples = (
            dataset_full["test"].shuffle(seed=seed).select(range(num_samples))
        )
        logger.info(f"✓ Selected {len(random_test_samples)} test samples")

        # Key every sample by its stringified position in the selection.
        logger.debug("Preparing test samples dictionary")
        test_samples_dict = {
            str(i): {
                "text": sample["text"],
                "code": sample["code"],
            }
            for i, sample in enumerate(random_test_samples)
        }
        logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

        # Load AVAP documentation; the encoding is explicit so the result does
        # not depend on the platform's locale default.
        logger.debug(f"Loading AVAP documentation from {context_docs_path}")
        with open(context_docs_path, "r", encoding="utf-8") as f:
            avap_docs = f.read()
        logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

        # Generate prompt with AVAP context
        logger.debug("Generating prompt with AVAP context")
        system_prompt = get_prompt_mbpp(avap_docs=avap_docs)
        logger.debug("✓ Prompt generated successfully")

        # Invoke LLM to generate synthetic data
        logger.info("Invoking LLM to generate synthetic dataset...")
        llm_response = llm.invoke(
            [system_prompt, HumanMessage(content=str(test_samples_dict))]
        )
        logger.info("✓ LLM response received")

        # Parse JSON response
        logger.debug("Parsing LLM response as JSON")
        synthetic_data = json.loads(_strip_json_fence(llm_response.content))
        logger.info(
            f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
        )

        # Persist before announcing success, so the log cannot claim
        # completion when the write fails.
        output_dir = Path(synthetic_output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"synthetic_data_{provider.value}.json"
        with output_file.open("w", encoding="utf-8") as f:
            json.dump(synthetic_data, f, ensure_ascii=False, indent=2)
        logger.info(f"✓ Synthetic dataset written to {output_file}")

        logger.info(
            f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
        )
        return synthetic_data
    except Exception as exc:
        # A single call records both the message and the traceback.
        logger.exception(f"Error during synthetic dataset generation: {exc}")
        raise
if __name__ == "__main__":
    # Print a three-line banner, then hand control to the Typer app.
    separator = "=" * 50
    for banner_line in (separator, "Dataset Synthetic Generation Pipeline", separator):
        logger.info(banner_line)

    try:
        app()
    except Exception as exc:
        # Record the failure, then re-raise so it still propagates.
        logger.error(f"Pipeline execution failed: {exc}")
        raise

View File

@ -0,0 +1,31 @@
from langchain_core.messages import SystemMessage
def get_prompt_mbpp(avap_docs: str) -> SystemMessage:
    """Build the system prompt for translating Python problems into AVAP.

    The prompt states the output contract explicitly (one JSON object that
    mirrors the input keys), so the reply can be parsed as JSON. A problem the
    documentation cannot solve is reported inside that problem's "code" field
    instead of replacing the whole reply, which would make the reply
    unparseable.

    Args:
        avap_docs: Text of the AVAP documentation; it is embedded verbatim at
            the end of the prompt.

    Returns:
        A ``SystemMessage`` whose content holds the task description, the
        requirements, the output format and the documentation.
    """
    return SystemMessage(
        content=f"""
You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).
You will receive a JSON object containing programming problems and their solutions written in Python.

Your task is to:
1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.
2. Replace the Python solution with a correct implementation written in AVAP.

Requirements:
- The AVAP solution must follow AVAP syntax.
- Use AVAP libraries and constructs when appropriate.
- Functions in AVAP are declared using the `function` keyword.
- Preserve the original problem intent and difficulty.
- Do not produce Python code in the final solution.

Output format:
- Return ONLY a single valid JSON object, optionally wrapped in a ```json code fence, with no text before or after it.
- Use exactly the same keys as the input object.
- Each value must be an object with two string fields: "text" (the problem statement framed for AVAP) and "code" (the AVAP solution).

Documentation usage rules:
- You MUST rely exclusively on the AVAP documentation provided below.
- Do not invent syntax, functions, or libraries that are not described in the documentation.
- If the documentation does not contain enough information to solve a problem, set that problem's "code" field to exactly:
"I don't know how to answer this question based on the provided documentation."

AVAP Documentation:
{avap_docs}
"""
    )

View File

@ -0,0 +1,42 @@
{
"0": {
"text": "Write a function to remove the matching tuples from the given two lists.",
"code": "function remove_matching_tuple(test_list1, test_list2){\n // Result list to store non-matching items\n result = []\n getListLen(test_list1, len1)\n getListLen(test_list2, len2)\n startLoop(i, 0, len1)\n itemFromList(test_list1, i, current_item)\n found = False\n startLoop(j, 0, len2)\n itemFromList(test_list2, j, check_item)\n if(current_item, check_item, \"=\")\n found = True\n end()\n endLoop()\n if(found, False, \"=\")\n variableToList(current_item, result)\n end()\n endLoop()\n return(result)\n}"
},
"1": {
"text": "Write a function to find the number of elements present in the given list.",
"code": "function find_lists(input_list){\n getListLen(input_list, list_len)\n return(list_len)\n}"
},
"2": {
"text": "Write a function to find the first natural number whose factorial is divisible by x.",
"code": "function first_Factorial_Divisible_Number(x){\n i = 1\n fact = 1\n result = 1\n startLoop(i, 1, x)\n fact = fact * i\n remainder = fact % x\n if(remainder, 0, \"=\")\n result = i\n i = x\n end()\n endLoop()\n return(result)\n}"
},
"3": {
"text": "Write a function to find the largest number that can be formed with the given list of digits.",
"code": "function find_Max_Num(arr){\n // Sort the array in descending order\n getListLen(arr, n)\n // Bubble sort descending\n startLoop(i, 0, n)\n startLoop(j, 0, n)\n itemFromList(arr, j, val_j)\n next_idx = j + 1\n itemFromList(arr, next_idx, val_next)\n if(val_j, val_next, \"<\")\n // Swap\n arr[j] = val_next\n arr[next_idx] = val_j\n end()\n endLoop()\n endLoop()\n // Build the number\n itemFromList(arr, 0, num)\n startLoop(k, 1, n)\n itemFromList(arr, k, digit)\n num = num * 10 + digit\n endLoop()\n return(num)\n}"
},
"4": {
"text": "Write a function to check if the triangle is equilateral or not.",
"code": "function check_equilateral(x, y, z){\n if(x, y, \"=\")\n if(y, z, \"=\")\n return(True)\n end()\n end()\n return(False)\n}"
},
"5": {
"text": "Write a function to count the occurrences of the first element of each tuple in the given list of tuples.",
"code": "function sort_on_occurence(lst){\n // Build a JSON object counting occurrences of first elements\n counts = {}\n getListLen(lst, lst_len)\n startLoop(i, 0, lst_len)\n itemFromList(lst, i, current_tuple)\n itemFromList(current_tuple, 0, key)\n variableFromJSON(counts, key, existing)\n if(existing, None, \"=\")\n AddvariableToJSON(key, 1, counts)\n else()\n new_count = existing + 1\n AddvariableToJSON(key, new_count, counts)\n end()\n endLoop()\n return(counts)\n}"
},
"6": {
"text": "Write a function to check if a given number is one less than twice its reverse.",
"code": "function rev(num){\n rev_num = 0\n startLoop(i, 1, num)\n if(num, 0, \">\")\n remainder = num % 10\n rev_num = rev_num * 10 + remainder\n num = num // 10\n end()\n endLoop()\n return(rev_num)\n}\n\nfunction check(n){\n reversed_n = rev(n)\n twice_rev = 2 * reversed_n\n expected = n + 1\n if(twice_rev, expected, \"=\")\n return(True)\n end()\n return(False)\n}"
},
"7": {
"text": "Write a function to convert a list of multiple integers into a single integer.",
"code": "function multiple_to_single(L){\n getListLen(L, list_len)\n result = 0\n startLoop(i, 0, list_len)\n itemFromList(L, i, digit)\n result = result * 10 + digit\n endLoop()\n return(result)\n}"
},
"8": {
"text": "Write a function that checks if a word exists at the end of a string, with optional punctuation.",
"code": "function text_match_word(text){\n // Use getRegex to find a word at the end of the string with optional punctuation\n pattern = \"\\\\w+\\\\S*$\"\n getRegex(text, pattern, match_result)\n if(match_result, None, \"!=\")\n addVar(output, \"Found a match!\")\n else()\n addVar(output, \"Not matched!\")\n end()\n return(output)\n}"
},
"9": {
"text": "Write a function to find the sum of numbers in a list between the indices of a specified range.",
"code": "function sum_range_list(list1, m, n){\n sum_range = 0\n end_idx = n + 1\n startLoop(i, m, end_idx)\n itemFromList(list1, i, current_val)\n sum_range = sum_range + current_val\n endLoop()\n return(sum_range)\n}"
}
}