working on synthetic dataset
This commit is contained in:
parent
8501988619
commit
c7adab24a6
|
|
@ -0,0 +1,16 @@
|
|||
addParam("txt", txt)
|
||||
result = False
|
||||
if(None, None, 'txt == "" or txt == None')
|
||||
result = False
|
||||
else()
|
||||
replace(txt, ",", " ", txt_clean)
|
||||
last_word = txt_clean.split()[-1] if txt_clean.split() else ""
|
||||
getListLen(last_word, last_len)
|
||||
if(None, None, 'last_len == 1 and last_word.lower() >= "a" and last_word.lower() <= "z"')
|
||||
result = True
|
||||
else()
|
||||
result = False
|
||||
end()
|
||||
end()
|
||||
addResult(result)
|
||||
_status = 200
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c46228bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"code=\"addParam(\\\"txt\\\", txt)\\nresult = False\\nif(None, None, `txt == \\\"\\\" or txt == None`)\\n result = False\\nelse()\\n replace(txt, \\\",\\\", \\\" \\\", txt_clean)\\n last_word = txt_clean.split()[-1] if txt_clean.split() else \\\"\\\"\\n getListLen(last_word, last_len)\\n if(None, None, `last_len == 1 and last_word.lower() >= \\\"a\\\" and last_word.lower() <= \\\"z\\\"`)\\n result = True\\n else()\\n result = False\\n end()\\nend()\\naddResult(result)\\n_status = 200\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "91c20032",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"addParam(\"txt\", txt)\n",
|
||||
"result = False\n",
|
||||
"if(None, None, `txt == \"\" or txt == None`)\n",
|
||||
" result = False\n",
|
||||
"else()\n",
|
||||
" replace(txt, \",\", \" \", txt_clean)\n",
|
||||
" last_word = txt_clean.split()[-1] if txt_clean.split() else \"\"\n",
|
||||
" getListLen(last_word, last_len)\n",
|
||||
" if(None, None, `last_len == 1 and last_word.lower() >= \"a\" and last_word.lower() <= \"z\"`)\n",
|
||||
" result = True\n",
|
||||
" else()\n",
|
||||
" result = False\n",
|
||||
" end()\n",
|
||||
"end()\n",
|
||||
"addResult(result)\n",
|
||||
"_status = 200\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(code)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "64d5e9d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "assistance-engine",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "475ac5f0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "108f20c8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import load_dataset\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43491a6a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Load Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "76c78147",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datasets import DatasetDict, load_dataset\n",
|
||||
"from evidently import Dataset\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"database_he = load_dataset(\"openai_humaneval\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "e76eefff",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"DatasetDict({\n",
|
||||
" test: Dataset({\n",
|
||||
" features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],\n",
|
||||
" num_rows: 164\n",
|
||||
" })\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"database_he"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "386395f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"database_he_df = database_he[\"test\"].to_pandas()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae12c1f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"database_he_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "edfd2f52",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mbpp = load_dataset(\"mbpp\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "5c3d8120",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mbpp_df = mbpp[\"train\"].to_pandas()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d59904c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mbpp_df"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "assistance-engine",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
import json
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
from datasets import load_dataset
|
||||
import boto3
|
||||
import typer
|
||||
from loguru import logger
|
||||
from botocore.config import Config
|
||||
from pathlib import Path
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from src.utils.llm_factory import create_chat_model
|
||||
from scripts.pipelines.tasks.prompts import (
|
||||
get_prompt_mbpp,
|
||||
get_prompt_human_eval,
|
||||
get_prompt_generation,
|
||||
)
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
class Provider(str, Enum):
    """LLM provider backends selectable from the CLI (`--provider`)."""

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"
|
||||
|
||||
|
||||
class Dataset(str, Enum):
    """Source datasets that can be translated to AVAP (`--dataset`).

    The enum value is the Hugging Face dataset name passed to load_dataset.
    """

    mbpp = "mbpp"
    human_eval = "openai_humaneval"
|
||||
|
||||
|
||||
@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "docs/LRM/avap.md",
    synthetic_output_path: str = "synthetic_datasets",
    dataset: Optional[Dataset] = None,
    problems_per_category: int = 10,
) -> None:
    """
    Generate synthetic AVAP dataset.

    Modes:
    - With --dataset {mbpp|human_eval}: Translate existing dataset to AVAP
    - Without --dataset: Generate new problems from scratch using the prompt

    Args:
        provider: LLM provider backend to use.
        model: Model identifier passed to the provider.
        temperature: Sampling temperature for generation.
        num_samples: Samples to translate (translation mode) or problems to
            generate (generation mode).
        seed: Shuffle seed for reproducible sample selection (translation mode).
        context_docs_path: Path to the AVAP documentation injected into prompts.
        synthetic_output_path: Directory where the output JSON is written.
        dataset: Source dataset to translate; None selects generation mode.
        problems_per_category: Problems requested per category (generation mode).
    """
    logger.info("🚀 Starting synthetic dataset generation pipeline")
    logger.info(
        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, "
        f"Samples: {num_samples}, Seed: {seed}, Dataset: {dataset or 'generation mode'}"
    )

    # Generous read timeout: a single LLM call can take several minutes.
    config = Config(
        connect_timeout=10,
        read_timeout=600,
    )
    # NOTE(review): the Bedrock client is created unconditionally, even when
    # provider is openai/ollama — confirm create_chat_model tolerates this.
    client = boto3.client("bedrock-runtime", config=config)
    logger.info("✓ Bedrock client initialized successfully")

    # Create LLM instance with specified parameters
    logger.debug(f"Creating LLM instance with provider: {provider}")
    llm = create_chat_model(
        provider=provider,
        client=client,
        model=model,
        temperature=temperature,
    )
    logger.info(f"✓ LLM initialized: {model}")

    # Load AVAP documentation
    logger.debug(f"Loading AVAP documentation from {context_docs_path}")
    with open(context_docs_path, "r") as f:
        avap_docs = f.read()
    logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

    # Choose mode: translation or generation
    if dataset:
        logger.info(f"🔄 Translation mode: Converting {dataset.value} dataset to AVAP")
        _generate_from_dataset(
            llm=llm,
            avap_docs=avap_docs,
            dataset_name=dataset.value,
            num_samples=num_samples,
            seed=seed,
            output_path=synthetic_output_path,
            provider=provider.value,
        )
    else:
        logger.info("✨ Generation mode: Creating new problems from prompt")
        _generate_from_prompt(
            llm=llm,
            avap_docs=avap_docs,
            num_samples=num_samples,
            output_path=synthetic_output_path,
            provider=provider.value,
            problems_per_category=problems_per_category,
        )
|
||||
|
||||
|
||||
def _generate_from_dataset(
    llm,
    avap_docs: str,
    dataset_name: str,
    num_samples: int,
    seed: int,
    output_path: str,
    provider: str,
) -> None:
    """Generate synthetic AVAP data by translating an existing dataset.

    Args:
        llm: Chat model used for translation (must expose ``invoke``).
        avap_docs: Full AVAP documentation text injected into the prompt.
        dataset_name: Either ``"mbpp"`` or ``"openai_humaneval"``.
        num_samples: Maximum number of test samples to translate.
        seed: Shuffle seed for reproducible sample selection.
        output_path: Directory where the resulting JSON file is written.
        provider: Provider name, used only in the output filename.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset.
        json.JSONDecodeError: If the LLM response is not valid JSON.
    """
    # Load dataset
    logger.debug(f"Loading {dataset_name} dataset")
    dataset_full = load_dataset(dataset_name)
    logger.info(f"✓ {dataset_name} dataset loaded successfully")

    # Select random test samples, capped at the size of the split.
    logger.debug(f"Selecting {num_samples} random samples from {dataset_name}")
    random_test_samples = dataset_full["test"].shuffle(seed=seed).select(
        range(min(num_samples, len(dataset_full["test"])))
    )
    logger.info(f"✓ Selected {len(random_test_samples)} samples")

    # Prepare samples dictionary (key scheme differs per dataset).
    logger.debug("Preparing samples dictionary")
    if dataset_name == "mbpp":
        # MBPP rows have no stable id here, so use the enumeration index as key.
        test_samples_dict = {
            str(i): {"text": sample["text"], "code": sample["code"]}
            for i, sample in enumerate(random_test_samples)
        }
        prompt_func = get_prompt_mbpp(avap_docs=avap_docs)
        output_suffix = "mbpp"
    elif dataset_name == "openai_humaneval":
        # HumanEval rows carry their own task_id, so no enumeration index is needed.
        test_samples_dict = {
            str(sample["task_id"]): {
                "task_id": sample["task_id"],
                "prompt": sample["prompt"],
                "canonical_solution": sample["canonical_solution"],
                "test": sample["test"],
                "entry_point": sample["entry_point"],
            }
            for sample in random_test_samples
        }
        prompt_func = get_prompt_human_eval(avap_docs=avap_docs)
        output_suffix = "human_eval"
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

    # Invoke LLM
    logger.info("Invoking LLM to generate synthetic dataset...")
    llm_response = llm.invoke(
        [prompt_func, HumanMessage(content=str(test_samples_dict))]
    )
    logger.info("✓ LLM response received")
    logger.debug(f"LLM Response: {llm_response.content}")

    # Parse JSON response, stripping an optional markdown code fence.
    # Handles both ```json ... ``` and bare ``` ... ``` fences, matching
    # the behaviour of _generate_from_prompt.
    logger.debug("Parsing LLM response as JSON")
    json_str = (
        llm_response.content.removeprefix("```json")
        .removeprefix("```")
        .removesuffix("```")
        .strip()
    )
    logger.debug(f"JSON string: {json_str}")
    synthetic_data = json.loads(json_str)
    logger.info(
        f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
    )

    # Save output
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_file = output_dir / f"synthetic_data_{output_suffix}_{provider}.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(synthetic_data, f, ensure_ascii=False, indent=2)

    logger.info(f"✓ Output saved to {output_file}")
    logger.info(
        f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
    )
|
||||
|
||||
|
||||
def _generate_from_prompt(
    llm,
    avap_docs: str,
    num_samples: int,
    output_path: str,
    provider: str,
    problems_per_category: int = 10,
) -> None:
    """Generate brand-new AVAP problems from scratch via the generation prompt.

    Args:
        llm: Chat model used for generation (must expose ``invoke``).
        avap_docs: Full AVAP documentation text injected into the prompt.
        num_samples: Total number of problems requested from the model.
        output_path: Directory where the resulting JSON file is written.
        provider: Provider name, used only in the output filename.
        problems_per_category: Requested number of problems per category.
    """
    # Build the system prompt carrying the AVAP documentation.
    logger.debug("Generating prompt for problem generation")
    system_prompt = get_prompt_generation(
        avap_docs=avap_docs,
        num_problems=num_samples,
        problems_per_category=problems_per_category,
    )
    logger.debug("✓ Prompt generated successfully")

    # Ask the model to produce the dataset.
    logger.info("Invoking LLM to generate new problems...")
    response = llm.invoke(
        [system_prompt, HumanMessage(content="Generate the synthetic dataset now.")]
    )
    logger.info("✓ LLM response received")
    logger.debug(f"LLM Response: {response.content}")

    # Strip an optional markdown fence (```json ... ``` or ``` ... ```), then parse.
    logger.debug("Parsing LLM response as JSON")
    payload = response.content.removeprefix("```json")
    payload = payload.removeprefix("```").removesuffix("```").strip()
    logger.debug(f"JSON string: {payload}")
    synthetic_data = json.loads(payload)
    logger.info(
        f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
    )

    # Persist the parsed problems alongside the other synthetic datasets.
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"synthetic_data_generated_{provider}.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(synthetic_data, f, ensure_ascii=False, indent=2)

    logger.info(f"✓ Output saved to {output_file}")
    logger.info(
        f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger.info("=" * 50)
|
||||
logger.info("Synthetic Dataset Generation Pipeline")
|
||||
logger.info("=" * 50)
|
||||
|
||||
app()
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
import json
|
||||
from enum import Enum
|
||||
from datasets import load_dataset
|
||||
import boto3
|
||||
import typer
|
||||
from loguru import logger
|
||||
from botocore.config import Config
|
||||
from pathlib import Path
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from src.utils.llm_factory import create_chat_model
|
||||
from scripts.pipelines.tasks.prompts import get_prompt_human_eval
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
class Provider(str, Enum):
    """LLM provider backends selectable from the CLI (`--provider`)."""

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"
|
||||
|
||||
|
||||
@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "data/avap.txt",
    synthetic_output_path: str = "synthetic_datasets",
) -> dict:
    """Generate synthetic dataset using the specified LLM with Human Eval dataset.

    Args:
        provider: LLM provider backend to use.
        model: Model identifier passed to the provider.
        temperature: Sampling temperature for generation.
        num_samples: Maximum number of HumanEval test samples to translate.
        seed: Shuffle seed for reproducible sample selection.
        context_docs_path: Path to the AVAP documentation injected into the prompt.
        synthetic_output_path: Directory where the output JSON is written.

    Returns:
        The parsed synthetic dataset (also written to disk).
        Note: the original annotation said ``None`` while the body returned the
        data; the annotation is corrected here without changing behavior.

    Raises:
        json.JSONDecodeError: If the LLM response is not valid JSON.
    """
    logger.info("🚀 Starting synthetic dataset generation pipeline (Human Eval)")
    logger.info(
        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
    )

    # Generous read timeout: a single LLM call can take several minutes.
    config = Config(
        connect_timeout=10,
        read_timeout=600,
    )
    client = boto3.client("bedrock-runtime", config=config)
    logger.info("✓ Bedrock client initialized successfully")

    # Create LLM instance with specified parameters
    logger.debug(f"Creating LLM instance with provider: {provider}")
    llm = create_chat_model(
        provider=provider,
        client=client,
        model=model,
        temperature=temperature,
    )
    logger.info(f"✓ LLM initialized: {model}")

    # Load Human Eval dataset
    logger.debug("Loading OpenAI Human Eval dataset")
    dataset_full = load_dataset("openai_humaneval")
    logger.info("✓ Human Eval dataset loaded successfully")

    # Select random test samples, capped at the size of the test split.
    logger.debug(f"Selecting {num_samples} random test samples from Human Eval dataset")
    random_test_samples = (
        dataset_full["test"].shuffle(seed=seed).select(range(min(num_samples, len(dataset_full["test"]))))
    )
    logger.info(f"✓ Selected {len(random_test_samples)} test samples")

    # Prepare test samples dictionary keyed by the dataset's own task_id
    # (rows carry task_id, so no enumeration index is needed).
    logger.debug("Preparing test samples dictionary")
    test_samples_dict = {
        str(sample["task_id"]): {
            "task_id": sample["task_id"],
            "prompt": sample["prompt"],
            "canonical_solution": sample["canonical_solution"],
            "test": sample["test"],
            "entry_point": sample["entry_point"],
        }
        for sample in random_test_samples
    }
    logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

    # Load AVAP documentation
    logger.debug(f"Loading AVAP documentation from {context_docs_path}")
    with open(context_docs_path, "r") as f:
        avap_docs = f.read()
    logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

    # Generate prompt with AVAP context
    logger.debug("Generating prompt with AVAP context")
    get_prompt_human_eval_func = get_prompt_human_eval(avap_docs=avap_docs)
    logger.debug("✓ Prompt generated successfully")

    # Invoke LLM to generate synthetic data
    logger.info("Invoking LLM to generate synthetic dataset...")
    llm_response = llm.invoke(
        [get_prompt_human_eval_func, HumanMessage(content=str(test_samples_dict))]
    )
    logger.info("✓ LLM response received")
    logger.info(f"LLM Response: {llm_response.content}")

    # Parse JSON response, stripping an optional ```json fence.
    logger.debug("Parsing LLM response as JSON")
    json_str = (
        llm_response.content.removeprefix("```json").removesuffix("```").strip()
    )
    logger.debug(f"JSON string: {json_str}")
    synthetic_data = json.loads(json_str)
    logger.info(
        f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
    )

    logger.info(
        f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
    )

    # Save output
    output_dir = Path(synthetic_output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_file = output_dir / f"synthetic_data_human_eval_{provider.value}.json"
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(synthetic_data, f, ensure_ascii=False, indent=2)

    return synthetic_data
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger.info("=" * 50)
|
||||
logger.info("Human Eval Synthetic Generation Pipeline")
|
||||
logger.info("=" * 50)
|
||||
|
||||
app()
|
||||
|
|
@ -27,7 +27,7 @@ def generate_synthetic_dataset(
|
|||
temperature: float = 0.0,
|
||||
num_samples: int = 10,
|
||||
seed: int = 42,
|
||||
context_docs_path: str = "docs/avap.txt",
|
||||
context_docs_path: str = "docs/LRM/avap.md",
|
||||
synthetic_output_path: str = "synthetic_datasets",
|
||||
) -> None:
|
||||
"""Generate synthetic dataset using the specified LLM."""
|
||||
|
|
@ -129,8 +129,5 @@ if __name__ == "__main__":
|
|||
logger.info("=" * 50)
|
||||
logger.info("Dataset Synthetic Generation Pipeline")
|
||||
logger.info("=" * 50)
|
||||
try:
|
||||
|
||||
app()
|
||||
except Exception as exc:
|
||||
logger.error(f"Pipeline execution failed: {exc}")
|
||||
raise
|
||||
|
|
|
|||
|
|
@ -4,28 +4,153 @@ from langchain_core.messages import SystemMessage
|
|||
def get_prompt_mbpp(avap_docs: str) -> SystemMessage:
|
||||
return SystemMessage(
|
||||
content=f"""
|
||||
You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).
|
||||
Eres un agente de IA responsable de generar un conjunto de datos sintético para un lenguaje de programación llamado AVAP (Advanced Virtual API Programming).
|
||||
|
||||
You will receive a JSON object containing programming problems and their solutions written in Python.
|
||||
Recibirás un objeto JSON que contiene problemas de programación y sus soluciones escritas en Python.
|
||||
|
||||
Your task is to:
|
||||
1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.
|
||||
2. Replace the Python solution with a correct implementation written in AVAP.
|
||||
Tu tarea es:
|
||||
1. Convertir cada problema de programación en Python en un problema equivalente formulado para el lenguaje AVAP.
|
||||
2. Reemplazar la solución en Python con una implementación correcta escrita en AVAP.
|
||||
|
||||
Requirements:
|
||||
- The AVAP solution must follow AVAP syntax.
|
||||
- Use AVAP libraries and constructs when appropriate.
|
||||
- Functions in AVAP are declared using the `function` keyword.
|
||||
- Preserve the original problem intent and difficulty.
|
||||
- Do not produce Python code in the final solution.
|
||||
Requisitos:
|
||||
- La solución en AVAP debe seguir la sintaxis de AVAP.
|
||||
- Usa bibliotecas y construcciones de AVAP cuando sea apropiado.
|
||||
- Las funciones en AVAP se declaran usando la palabra clave `function`.
|
||||
- Preserva la intención y dificultad del problema original.
|
||||
- No produzcas código Python en la solución final.
|
||||
|
||||
Documentation usage rules:
|
||||
- You MUST rely exclusively on the AVAP documentation provided below.
|
||||
- Do not invent syntax, functions, or libraries that are not described in the documentation.
|
||||
- If the documentation does not contain enough information to solve the problem, return exactly:
|
||||
"I don't know how to answer this question based on the provided documentation."
|
||||
Reglas de uso de documentación:
|
||||
- DEBES confiar exclusivamente en la documentación de AVAP proporcionada a continuación.
|
||||
- No inventes sintaxis, funciones o bibliotecas que no estén descritas en la documentación.
|
||||
- Si la documentación no contiene suficiente información para resolver el problema, devuelve exactamente:
|
||||
"No sé cómo responder esta pregunta basándome en la documentación proporcionada."
|
||||
|
||||
AVAP Documentation:
|
||||
Documentación AVAP:
|
||||
{avap_docs}
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def get_prompt_human_eval(avap_docs: str) -> SystemMessage:
    """Build the system prompt that translates OpenAI HumanEval problems to AVAP.

    Fixes vs. original: Spanish typo "describidas" -> "descritas" in the strict
    constraints section, and PEP 8 keyword-argument spacing (``content=``).

    Args:
        avap_docs: Full AVAP documentation text embedded into the prompt.

    Returns:
        A SystemMessage instructing the model to emit only a JSON array of
        API-endpoint-style AVAP problems.
    """
    return SystemMessage(
        content=f"""
Eres un agente de IA responsable de generar un conjunto de datos sintético para un lenguaje de programación llamado AVAP (Advanced Virtual API Programming).

AVAP es un lenguaje cuyo propósito principal es crear APIs. Por lo tanto, cada problema de programación debe transformarse en una tarea orientada a API, no en una función independiente.

Recibirás como entrada un objeto JSON que contiene problemas de programación del conjunto de datos OpenAI HumanEval y sus soluciones de referencia escritas en Python.

Tu tarea es transformar cada ejemplo de entrada en un elemento del conjunto de datos AVAP haciendo lo siguiente:
1. Reescribe la tarea de programación original como una especificación de endpoint API.
2. Reemplaza la solución basada en funciones de Python con una implementación válida de AVAP centrada en la lógica del endpoint.
3. Adapta las pruebas para que validen el comportamiento esperado de la API en AVAP.
4. Preserva la intención del problema original tanto como sea posible, pero exprésalo a través de interacciones de tipo solicitud/respuesta.

Requisitos de salida:
- Tu respuesta DEBE ser un JSON válido.
- Tu respuesta DEBE ser un array JSON.
- Cada elemento del array DEBE seguir exactamente esta estructura:

[
  {{
    "task_id": 1,
    "text": "Crear un endpoint que reciba un parámetro 'message' y devuelva un saludo personalizado. Si no se proporciona el parámetro, debe devolver un saludo genérico con código de estado 200.",
    "code": "addParam(\\"message\\", message)\\nif(message, None, \\"=\\")\\n    greeting = \\"Hello, World!\\"\\nelse()\\n    greeting = \\"Hello, \\" + message + \\"!\\"\\nend()\\naddResult(greeting)\\n_status = 200",
    "test_inputs": {{
      "message": "Alice"
    }},
    "test_list": [
      "re.search(r'Hello, Alice!', greeting)",
      "re.match(r'^200$', str(_status))"
    ]
  }}
]

Significado de los campos:
- "task_id": identificador entero de la tarea original.
- "text": declaración del problema reescrita en español, expresada como una tarea de endpoint API.
- "code": código AVAP válido que implementa el comportamiento del endpoint.
- "test_inputs": objeto con los parámetros de solicitud o datos de carga necesarios para probar el endpoint.
- "test_list": lista de expresiones de validación como cadenas que verifican el resultado de la API, valores devueltos y estado cuando sea apropiado.

Reglas de transformación:
- Reinterpreta cada problema de HumanEval como algo que haría un endpoint API.
- Prefiere entradas como parámetros de solicitud, parámetros de consulta, campos de cuerpo o valores de ruta, según lo que mejor se adapte a la tarea original.
- Prefiere salidas como respuestas API, resultados o campos de estado.
- Si la tarea original de Python devuelve un valor calculado, la versión de AVAP debe exponer ese valor como una respuesta de endpoint.
- Si es útil, incluye manejo de estado como `_status = 200`, pero solo si es compatible con la documentación.
- El endpoint debe reflejar la intención algorítmica original, no solo un envoltorio superficial de tipo saludo.

Restricciones estrictas:
- Usa solo sintaxis de AVAP, construcciones, operadores y bibliotecas descritas explícitamente en la documentación a continuación.
- No inventes características no documentadas.
- No produzcas código Python en la solución.
- No incluyas explicaciones, markdown o ningún texto fuera del array JSON.
- Devuelve solo el array JSON final.

Regla de fracaso:
- Si la documentación de AVAP no contiene suficiente información para producir una solución correcta basada en endpoints, devuelve exactamente:
"No sé cómo responder esta pregunta basándome en la documentación proporcionada."

Documentación AVAP:
{avap_docs}
"""
    )
|
||||
|
||||
|
||||
def get_prompt_generation(
    avap_docs: str, num_problems: int = 10, problems_per_category: int = 10
) -> SystemMessage:
    """Build the system prompt for generation mode (new AVAP problems from scratch).

    Args:
        avap_docs: Full AVAP documentation text embedded into the prompt.
        num_problems: Exact number of problems the model is asked to produce.
        problems_per_category: Requested problems per category.
            NOTE(review): this parameter is accepted but never interpolated
            into the prompt text below — confirm whether it should be.

    Returns:
        A SystemMessage instructing the model to emit only a JSON array.
    """
    return SystemMessage(
        content=f"""
Eres un agente de IA responsable de generar un conjunto de datos sintético para un lenguaje de programación llamado AVAP (Advanced Virtual API Programming).

Tu tarea es generar exactamente {num_problems} nuevos problemas de programación en AVAP que demuestren diferentes características del lenguaje.

Requisitos:
- Genera problemas realistas de endpoints API que podrían usarse para evaluación.
- Cubre características y construcciones diversas de AVAP.
- Cada problema debe ser independiente y autónomo.
- Varía el nivel de dificultad: problemas simples, intermedios y avanzados.
- Sigue la sintaxis y semántica de AVAP exactamente como se describe en la documentación.

Formato de salida:
DEBES responder SOLO con un array JSON válido. Sin markdown, sin explicaciones, sin texto fuera del array.

Cada elemento DEBE seguir esta estructura exacta:
{{
  "task_id": <entero>,
  "text": "<descripción del problema en español, describiendo lo que el endpoint debe hacer>",
  "code": "<código AVAP válido implementando el endpoint, con \\n para saltos de línea>",
  "test_inputs": {{}},
  "test_list": ["<expresión re.match(r'...', var)>", ...]
}}

Descripción de los campos:
- "task_id": entero único comenzando desde 1, consecutivo para cada problema.
- "text": descripción de lo que un endpoint API debe implementar (en español).
- "code": código AVAP completo y válido que implementa el endpoint. Debe usar:
  - addParam() para parámetros de entrada
  - addResult() para resultados de salida
  - _status para códigos de estado HTTP cuando sea apropiado
  - Sintaxis válida de AVAP (if/else, bucles, variables, etc.)
- "test_inputs": típicamente vacío {{}} ya que el código generado contiene parámetros en línea, o contiene valores de prueba fijos si es necesario.
- "test_list": lista de expresiones de validación con regex usando formato re.match(r'patrón', variable).

Reglas estrictas para la generación de código:
- Usa SOLO características y sintaxis documentadas en el manual de AVAP a continuación.
- NO inventes sintaxis o funciones no documentadas.
- Cada línea de código representa una instrucción.
- Las variables deben declararse antes de usarlas.
- El estado HTTP debe usar _status = <código>.
- Los valores devueltos deben usar addResult().
- Solo código AVAP puro, nada de Python.

Si no puedes generar un problema válido con la documentación proporcionada, aún genera un problema pero indica en la test_list:
["<documentación insuficiente para validación completa>"]

Documentación AVAP:
{avap_docs}

Ahora genera {num_problems} problemas. Devuelve SOLO el array JSON.
"""
    )
|
||||
|
|
@ -1,42 +1,42 @@
|
|||
{
|
||||
"0": {
|
||||
"text": "Write a function to remove the matching tuples from the given two lists.",
|
||||
"code": "function remove_matching_tuple(test_list1, test_list2){\n // Result list to store non-matching items\n result = []\n getListLen(test_list1, len1)\n getListLen(test_list2, len2)\n startLoop(i, 0, len1)\n itemFromList(test_list1, i, current_item)\n found = False\n startLoop(j, 0, len2)\n itemFromList(test_list2, j, check_item)\n if(current_item, check_item, \"=\")\n found = True\n end()\n endLoop()\n if(found, False, \"=\")\n variableToList(current_item, result)\n end()\n endLoop()\n return(result)\n}"
|
||||
"text": "Write a function in AVAP to remove the matching tuples from the given two lists.",
|
||||
"code": "function remove_matching_tuple(test_list1, test_list2) {\n result = [sub for sub in test_list1 if sub not in test_list2]\n return(result)\n}"
|
||||
},
|
||||
"1": {
|
||||
"text": "Write a function to find the number of elements present in the given list.",
|
||||
"code": "function find_lists(input_list){\n getListLen(input_list, list_len)\n return(list_len)\n}"
|
||||
"text": "Write a function in AVAP to find the number of lists present in the given tuple (list).",
|
||||
"code": "function find_lists(Input) {\n getListLen(Input, length)\n return(length)\n}"
|
||||
},
|
||||
"2": {
|
||||
"text": "Write a function to find the first natural number whose factorial is divisible by x.",
|
||||
"code": "function first_Factorial_Divisible_Number(x){\n i = 1\n fact = 1\n result = 1\n startLoop(i, 1, x)\n fact = fact * i\n remainder = fact % x\n if(remainder, 0, \"=\")\n result = i\n i = x\n end()\n endLoop()\n return(result)\n}"
|
||||
"text": "Write an AVAP function to find the first natural number whose factorial is divisible by x.",
|
||||
"code": "function first_Factorial_Divisible_Number(x) {\n i = 1\n fact = 1\n startLoop(i, 1, x)\n fact = fact * i\n remainder = fact % x\n if(remainder, 0, \"==\")\n return(i)\n end()\n endLoop()\n return(i)\n}"
|
||||
},
|
||||
"3": {
|
||||
"text": "Write a function to find the largest number that can be formed with the given list of digits.",
|
||||
"code": "function find_Max_Num(arr){\n // Sort the array in descending order\n getListLen(arr, n)\n // Bubble sort descending\n startLoop(i, 0, n)\n startLoop(j, 0, n)\n itemFromList(arr, j, val_j)\n next_idx = j + 1\n itemFromList(arr, next_idx, val_next)\n if(val_j, val_next, \"<\")\n // Swap\n arr[j] = val_next\n arr[next_idx] = val_j\n end()\n endLoop()\n endLoop()\n // Build the number\n itemFromList(arr, 0, num)\n startLoop(k, 1, n)\n itemFromList(arr, k, digit)\n num = num * 10 + digit\n endLoop()\n return(num)\n}"
|
||||
"text": "Write an AVAP function to find the largest number that can be formed with the given list of digits.",
|
||||
"code": "function find_Max_Num(arr, n) {\n sorted_arr = arr.sort(reverse=True)\n itemFromList(sorted_arr, 0, num)\n i = 1\n startLoop(i, 1, n)\n itemFromList(sorted_arr, i, digit)\n num = num * 10 + digit\n endLoop()\n return(num)\n}"
|
||||
},
|
||||
"4": {
|
||||
"text": "Write a function to check if the triangle is equilateral or not.",
|
||||
"code": "function check_equilateral(x, y, z){\n if(x, y, \"=\")\n if(y, z, \"=\")\n return(True)\n end()\n end()\n return(False)\n}"
|
||||
"text": "Write an AVAP function to check if a triangle is equilateral or not.",
|
||||
"code": "function check_equilateral(x, y, z) {\n if(None, None, `x == y and y == z`)\n return(True)\n else()\n return(False)\n end()\n}"
|
||||
},
|
||||
"5": {
|
||||
"text": "Write a function to count the occurrences of the first element of each tuple in the given list of tuples.",
|
||||
"code": "function sort_on_occurence(lst){\n // Build a JSON object counting occurrences of first elements\n counts = {}\n getListLen(lst, lst_len)\n startLoop(i, 0, lst_len)\n itemFromList(lst, i, current_tuple)\n itemFromList(current_tuple, 0, key)\n variableFromJSON(counts, key, existing)\n if(existing, None, \"=\")\n AddvariableToJSON(key, 1, counts)\n else()\n new_count = existing + 1\n AddvariableToJSON(key, new_count, counts)\n end()\n endLoop()\n return(counts)\n}"
|
||||
"text": "Write an AVAP function to sort the given list based on the occurrence of the first element of tuples.",
|
||||
"code": "function sort_on_occurence(lst) {\n getListLen(lst, lst_len)\n result = {}\n i = 0\n startLoop(i, 0, lst_len)\n itemFromList(lst, i, current_tuple)\n itemFromList(current_tuple, 0, key)\n itemFromList(current_tuple, 1, val)\n AddVariableToJSON(key, val, result)\n endLoop()\n return(result)\n}"
|
||||
},
|
||||
"6": {
|
||||
"text": "Write a function to check if a given number is one less than twice its reverse.",
|
||||
"code": "function rev(num){\n rev_num = 0\n startLoop(i, 1, num)\n if(num, 0, \">\")\n remainder = num % 10\n rev_num = rev_num * 10 + remainder\n num = num // 10\n end()\n endLoop()\n return(rev_num)\n}\n\nfunction check(n){\n reversed_n = rev(n)\n twice_rev = 2 * reversed_n\n expected = n + 1\n if(twice_rev, expected, \"=\")\n return(True)\n end()\n return(False)\n}"
|
||||
"text": "Write an AVAP function to check if a given number is one less than twice its reverse.",
|
||||
"code": "function rev(num) {\n rev_num = 0\n startLoop(i, 0, num)\n if(num, 0, \">\")\n last_digit = num % 10\n rev_num = rev_num * 10 + last_digit\n num = int(num / 10)\n end()\n endLoop()\n return(rev_num)\n}\n\nfunction check(n) {\n reversed_n = rev(n)\n twice_rev = 2 * reversed_n\n expected = n + 1\n if(twice_rev, expected, \"==\")\n return(True)\n else()\n return(False)\n end()\n}"
|
||||
},
|
||||
"7": {
|
||||
"text": "Write a function to convert a list of multiple integers into a single integer.",
|
||||
"code": "function multiple_to_single(L){\n getListLen(L, list_len)\n result = 0\n startLoop(i, 0, list_len)\n itemFromList(L, i, digit)\n result = result * 10 + digit\n endLoop()\n return(result)\n}"
|
||||
"text": "Write an AVAP function to convert a list of multiple integers into a single integer.",
|
||||
"code": "function multiple_to_single(L) {\n getListLen(L, length)\n result_str = \"\"\n i = 0\n startLoop(i, 0, length)\n itemFromList(L, i, digit)\n digit_str = str(digit)\n result_str = result_str + digit_str\n endLoop()\n result = int(result_str)\n return(result)\n}"
|
||||
},
|
||||
"8": {
|
||||
"text": "Write a function that checks if a word exists at the end of a string, with optional punctuation.",
|
||||
"code": "function text_match_word(text){\n // Use getRegex to find a word at the end of the string with optional punctuation\n pattern = \"\\\\w+\\\\S*$\"\n getRegex(text, pattern, match_result)\n if(match_result, None, \"!=\")\n addVar(output, \"Found a match!\")\n else()\n addVar(output, \"Not matched!\")\n end()\n return(output)\n}"
|
||||
"text": "Write an AVAP function that matches a word at the end of a string, with optional punctuation.",
|
||||
"code": "function text_match_word(text) {\n getRegex(text, \"\\\\w+\\\\S*$\", match_result)\n if(match_result, None, \"!=\")\n return(\"Found a match!\")\n else()\n return(\"Not matched!\")\n end()\n}"
|
||||
},
|
||||
"9": {
|
||||
"text": "Write a function to find the sum of numbers in a list between the indices of a specified range.",
|
||||
"code": "function sum_range_list(list1, m, n){\n sum_range = 0\n end_idx = n + 1\n startLoop(i, m, end_idx)\n itemFromList(list1, i, current_val)\n sum_range = sum_range + current_val\n endLoop()\n return(sum_range)\n}"
|
||||
"text": "Write an AVAP function to find the sum of numbers in a list between the indices of a specified range.",
|
||||
"code": "function sum_range_list(list1, m, n) {\n sum_range = 0\n end_index = n + 1\n startLoop(i, m, end_index)\n itemFromList(list1, i, current_val)\n sum_range = sum_range + current_val\n endLoop()\n return(sum_range)\n}"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
[
|
||||
{
|
||||
"task_id": 52,
|
||||
"text": "Crear un endpoint que reciba una lista de números 'numbers' y un umbral 'threshold'. El endpoint debe devolver True si todos los números de la lista son estrictamente menores que el umbral, y False en caso contrario. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"numbers\", numbers)\naddParam(\"threshold\", threshold)\nresult = True\ngetListLen(numbers, list_len)\ni = 0\nstartLoop(i, 0, list_len)\n itemFromList(numbers, i, current_item)\n current_val = int(current_item)\n thresh_val = int(threshold)\n if(None, None, `current_val >= thresh_val`)\n result = False\n end()\nendLoop()\naddResult(result)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"numbers": [
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
10
|
||||
],
|
||||
"threshold": 100
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'True', str(result))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 134,
|
||||
"text": "Crear un endpoint que reciba un parámetro 'txt' (cadena de texto) y devuelva True si el último carácter de la cadena es una letra alfabética y no forma parte de una palabra (es decir, el último token separado por espacios tiene longitud 1 y es una letra). En caso contrario, devuelve False. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"txt\", txt)\nresult = False\nif(None, None, `txt == \"\" or txt == None`)\n result = False\nelse()\n replace(txt, \",\", \" \", txt_clean)\n last_word = txt_clean.split()[-1] if txt_clean.split() else \"\"\n getListLen(last_word, last_len)\n if(None, None, `last_len == 1 and last_word.lower() >= \"a\" and last_word.lower() <= \"z\"`)\n result = True\n else()\n result = False\n end()\nend()\naddResult(result)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"txt": "apple pi e"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'True', str(result))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 51,
|
||||
"text": "Crear un endpoint que reciba un parámetro 'text' (cadena de texto) y devuelva la misma cadena pero con todas las vocales (a, e, i, o, u, tanto mayúsculas como minúsculas) eliminadas. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"text\", text)\nresult = [s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]]\nresult_str = \"\".join(result)\naddResult(result_str)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"text": "abcdef"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'bcdf', str(result_str))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 66,
|
||||
"text": "Crear un endpoint que reciba un parámetro 's' (cadena de texto) y devuelva la suma de los códigos ASCII de todos los caracteres en mayúscula presentes en la cadena. Si la cadena está vacía, debe devolver 0. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"s\", s)\nif(None, None, `s == \"\" or s == None`)\n digit_sum = 0\nelse()\n digit_sum = sum(ord(char) if char.isupper() else 0 for char in s)\nend()\naddResult(digit_sum)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"s": "abAB"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'131', str(digit_sum))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 147,
|
||||
"text": "Crear un endpoint que reciba un parámetro entero 'n' y construya un array a de longitud n donde a[i] = i*i - i + 1 (para i desde 1 hasta n). El endpoint debe devolver el número de tripletas (a[i], a[j], a[k]) con i < j < k tal que la suma a[i] + a[j] + a[k] sea múltiplo de 3. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"n\", n)\nn_val = int(n)\nA = [i*i - i + 1 for i in range(1, n_val + 1)]\ncount = 0\ngetListLen(A, len_a)\ni = 0\nstartLoop(i, 0, len_a)\n j = i + 1\n startLoop(j, i + 1, len_a)\n k = j + 1\n startLoop(k, j + 1, len_a)\n itemFromList(A, i, ai)\n itemFromList(A, j, aj)\n itemFromList(A, k, ak)\n triple_sum = int(ai) + int(aj) + int(ak)\n if(None, None, `triple_sum % 3 == 0`)\n count = count + 1\n end()\n endLoop()\n endLoop()\nendLoop()\naddResult(count)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"n": 5
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'^1$', str(count))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 101,
|
||||
"text": "Crear un endpoint que reciba un parámetro 's' (cadena de texto con palabras separadas por comas o espacios) y devuelva un array con las palabras individuales extraídas de la cadena. Si la cadena está vacía, debe devolver una lista vacía. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"s\", s)\nif(None, None, `s == \"\" or s == None`)\n words = []\nelse()\n replace(s, \",\", \" \", s_clean)\n words = s_clean.split()\nend()\naddResult(words)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"s": "Hi, my name is John"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'Hi', str(words))",
|
||||
"re.search(r'John', str(words))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 24,
|
||||
"text": "Crear un endpoint que reciba un parámetro entero 'n' y devuelva el mayor divisor de n que sea estrictamente menor que n. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"n\", n)\nn_val = int(n)\nlargest_div = 1\ni = n_val - 1\nstartLoop(i, 1, n_val)\n candidate = n_val - i\n if(None, None, `n_val % candidate == 0`)\n largest_div = candidate\n return(largest_div)\n end()\nendLoop()\naddResult(largest_div)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"n": 15
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'^5$', str(largest_div))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 125,
|
||||
"text": "Crear un endpoint que reciba un parámetro 'txt' (cadena de texto) y devuelva: una lista de palabras separadas por espacios si hay espacios en el texto; una lista de palabras separadas por comas si no hay espacios pero sí comas; o el número de letras minúsculas cuya posición en el alfabeto es impar (posición de 'a'=0, de 'b'=1, ...) si no hay ni espacios ni comas. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"txt\", txt)\nif(None, None, `\" \" in txt`)\n split_result = txt.split()\nelse()\n if(None, None, `\",\" in txt`)\n replace(txt, \",\", \" \", txt_replaced)\n split_result = txt_replaced.split()\n else()\n split_result = len([i for i in txt if i.islower() and ord(i) % 2 == 0])\n end()\nend()\naddResult(split_result)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"txt": "Hello world!"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'Hello', str(split_result))",
|
||||
"re.search(r'world!', str(split_result))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 145,
|
||||
"text": "Crear un endpoint que reciba una lista de enteros 'nums' y la devuelva ordenada de forma ascendente según la suma de sus dígitos. Si varios elementos tienen la misma suma de dígitos, se mantiene su orden relativo original. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"nums\", nums)\nfunction digits_sum(n) {\n neg = 1\n if(None, None, `n < 0`)\n n = -1 * n\n neg = -1\n end()\n digits = [int(d) for d in str(n)]\n digits[0] = digits[0] * neg\n return(sum(digits))\n}\nsorted_nums = sorted(nums, key=digits_sum)\naddResult(sorted_nums)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"nums": [
|
||||
1,
|
||||
11,
|
||||
-1,
|
||||
-11,
|
||||
-12
|
||||
]
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'-1', str(sorted_nums))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"task_id": 50,
|
||||
"text": "Crear un endpoint que reciba un parámetro 's' (cadena de texto codificada mediante un desplazamiento de 5 posiciones en el alfabeto) y devuelva la cadena decodificada original, revirtiendo el desplazamiento. El resultado debe exponerse como respuesta de la API con código de estado 200.",
|
||||
"code": "addParam(\"s\", s)\ndecoded = \"\".join([chr(((ord(ch) - 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])\naddResult(decoded)\n_status = 200",
|
||||
"test_inputs": {
|
||||
"s": "fghij"
|
||||
},
|
||||
"test_list": [
|
||||
"re.search(r'[a-z]+', str(decoded))",
|
||||
"re.match(r'^200$', str(_status))"
|
||||
]
|
||||
}
|
||||
]
|
||||
Loading…
Reference in New Issue