feat: add MBPP-style dataset generator and evaluation docs

2026-03-10 13:37:19 -07:00 · 2026-03-10 13:37:19 -07:00 · 35ca56118d
parent a08f754e25
commit 35ca56118d
4 changed files with 1446 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -46,6 +46,12 @@ graph TD
 ├── README.md                     # System documentation & Dev guide
 ├── changelog                     # Version tracking and release history
 ├── pyproject.toml                # Python project configuration
 ├── docs/
 |   ├── AVAP Language: ...        # AVAP DSL Documentation
 |   |   └── AVAP.md
 │   ├── developer.avapfr...       # Documents on developer web page
 |   └── LRM/                      # AVAP LRM documentation
 |       └── avap.md 
 ├── Docker/
 │   ├── Dockerfile                # Container definition for the Engine
 │   ├── docker-compose.yaml       # Local orchestration for dev environment
@ -64,7 +70,10 @@ graph TD
 │   └── kubeconfig.yaml           # Kubernetes cluster configuration
 ├── scripts/
 │   └── pipelines/
 |       ├── samples_generator/    # AVAP Sample generator
 |       |   └─ generate_mbap.py   
 │       └── flows/                # Data processing flows
 |           └─ elasticsearch_ingestion.py 
 └── src/
    ├── __init__.py
    └── utils/
@ -202,6 +211,53 @@ python -m grpc_tools.protoc -I./protos --python_out=./src --grpc_python_out=./sr
 ---
 ## Dataset Generation & Evaluation
 The engine includes a specialized benchmarking suite to evaluate the model's proficiency in **AVAP syntax**. This is achieved through a synthetic data generator that creates problems in the MBPP (Mostly Basic Python Problems) style, but tailored for the AVAP Language Reference Manual (LRM).
 ### 1. Synthetic Data Generator
 The script `scripts/pipelines/samples_generator/generate_mbap.py` uses Claude Sonnet 4 (`claude-sonnet-4-20250514`) through the Anthropic API to produce high-quality, executable code examples and validation tests.
 **Key Features:**
 * **LRM Grounding:** Uses the provided `avap.md` as the source of truth for syntax and logic.
 * **Validation Logic:** Generates `test_list` with Python regex assertions to verify the state of the AVAP stack after execution.
 * **Balanced Categories:** Covers 14 domains including ORM, Concurrency (`go/gather`), HTTP handling, and Cryptography.
 ### 2. Usage
 Ensure you have the `anthropic` library installed and your API key configured:
 ```bash
 pip install anthropic
 export ANTHROPIC_API_KEY="your-sk-ant-key"
 ```
 Run the generator specifying the path to your LRM and the desired output:
 ```bash
 python scripts/pipelines/samples_generator/generate_mbap.py \
  --lrm docs/LRM/avap.md \
  --output evaluation/mbpp_avap.json \
  --problems 300
 ```
 ### 3. Dataset Schema
 The generated JSON follows this structure:
 | Field | Type | Description |
 | :--- | :--- | :--- |
 | `task_id` | Integer | Unique identifier for the benchmark. |
 | `text` | String | Natural language description of the problem (Spanish). |
 | `code` | String | The reference AVAP implementation. |
 | `test_list` | Array | Python `re.match` / `re.search` expressions to validate execution results. |
 ### 4. Integration in RAG
 These generated examples are used to:
 1.  **Fine-tune** the local models (`qwen2.5:1.5b`) or others via the MrHouston pipeline.
 2.  **Evaluate** the "Zero-Shot" performance of the engine before deployment.
 3.  **Provide Few-Shot examples** in the RAG prompt orchestration (`src/prompts.py`).
 ---
 ## Repository Standards & Architecture
 ### Docker & Build Context
--- a/15
+++ b/15
@ -4,6 +4,21 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
 ---
 ## [1.4.0] - 2026-03-10
 ### Added
 - **Dataset Generation Suite**: Added `scripts/pipelines/samples_generator/generate_mbap.py` to automate the creation of synthetic AVAP training data.
 - **MBPP-style Benchmarking**: Support for generating structured JSON datasets with code solutions and Python-based validation tests (`test_list`).
 - **LRM Integration**: The generator now performs grounded synthesis using the `avap.md` Language Reference Manual.
 - **Anthropic Claude Sonnet 4 Integration**: Orchestration logic for high-fidelity code generation via the Anthropic API (`claude-sonnet-4-20250514`).
 ### Changed
 - **README.md**: Added comprehensive documentation for the Evaluation & Dataset Generation pipeline.
 - **Project Structure**: Integrated `evaluation/` directory for synthetic dataset storage.
 ### Security
 - Added explicit policy to avoid committing real Anthropic API keys, enforcing the use of environment variables.
 ## [1.3.0] - 2026-03-05
 ### Added
--- a/docs/LRM/avap.md
+++ b/docs/LRM/avap.md
--- a/scripts/pipelines/samples_generator/generate_mbap.py
+++ b/scripts/pipelines/samples_generator/generate_mbap.py
@ -0,0 +1,329 @@
 #!/usr/bin/env python3
 """
 Generate an MBPP-style benchmark dataset for the AVAP language from its LRM.
 Usage:
    python generate_mbap.py
    python generate_mbap.py --lrm path/to/avap.md
    python generate_mbap.py --lrm avap.md --output output/mbpp_avap.json --problems 300
 Requirements:
    pip install anthropic
    export ANTHROPIC_API_KEY=sk-ant-...
 """
 import argparse
 import json
 import os
 import sys
 import time
 from pathlib import Path
 import anthropic
 # Benchmark categories as (label, weight) pairs. The label is passed to the
 # prompt builder as the category text; the weight is the number of problems
 # planned for that category before scale_categories() rescales it to the
 # total requested on the command line.
 CATEGORIES = [
    ("HTTP params / addParam / addResult / _status",10),
    ("Variables y strings / addVar / replace / randomString",10),
    ("Condicionales / if() Modo 1 y Modo 2 / else() / end()",10),
    ("Bucles y listas / startLoop / itemFromList / getListLen",10),
    ("JSON / variableFromJSON / AddVariableToJSON",10),
    ("ORM / ormAccessSelect / ormAccessInsert / ormAccessUpdate",10),
    ("Criptografía / encodeSHA256 / encodeMD5",10),
    ("Fechas / getTimeStamp / getDateTime / stampToDatetime",10),
    ("Conectores externos / avapConnector + métodos dinámicos",10),
    ("Concurrencia / go + gather",10),
    ("Funciones y scope / function / return()",10),
    ("Manejo de errores / try() / exception()",10),
    ("HTTP externo / RequestGet / RequestPost",10),
    ("Modularidad / import / include + casos de uso complejos",10),
 ]
 # Sum of all category weights, i.e. the size of the unscaled dataset.
 TOTAL_PROBLEMS    = sum(n for _, n in CATEGORIES)
 # Upper bound on the number of problems requested in a single API call.
 PROBLEMS_PER_CALL = 10
 # System prompt sent with every generation request (see call_api). It is
 # written in Spanish and fixes the AVAP coding rules, the direct-execution
 # mode, the JSON output schema and the regex-only format of test_list.
 # NOTE(review): the doubled backslashes below render as single ones in the
 # prompt text; inside a JSON string a lone backslash before "d" is not a valid
 # escape, so json.loads() may reject answers that copy the regex examples
 # verbatim — confirm against real model output.
 SYSTEM_PROMPT = """Eres un experto en el lenguaje AVAP.
 Se te proporciona el Language Reference Manual (LRM) completo de AVAP.
 Tu tarea es generar problemas de benchmark estilo MBPP para evaluar
 modelos de lenguaje en su capacidad de generar código AVAP correcto.
 REGLAS ESTRICTAS para el código AVAP generado:
 1. Una instrucción por línea. EOL es el terminador absoluto.
 2. Sin indentación significativa (es solo decorativa).
 3. Bloques de control: if()...else()...end(), startLoop()...endLoop(), try()...exception()...end()
 4. Funciones: function name(args) { ... return(val) }
 5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
   — los argumentos NO pueden ser expresiones de acceso como dict['key'];
     hay que extraer el valor a una variable propia primero.
 6. if() Modo 2: if(None, None, "expresion_completa_como_string")
 7. _status se asigna con: addVar(_status, 404)
 8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
   — selector puede ser cadena vacía.
 9. Acceso a campos de dict: val = dict['campo']  (línea propia, luego se usa val).
 10. Genera ÚNICAMENTE código AVAP válido según el LRM. Sin Python, sin pseudocódigo.
 MODO DE EJECUCIÓN — MUY IMPORTANTE:
 - El código se ejecuta DIRECTAMENTE, línea a línea, sin servidor ni registro de endpoints.
 - NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo salvo que queramos probar la funcionalidad de funciones.
 - El código correcto es simplemente las instrucciones en línea, por ejemplo:
    result = "Hello World"
    addResult(result)
 - Si el problema requiere una función auxiliar reutilizable, defínela con function...{} y llámala directamente después:
    function double(n) {
        return(n * 2)
    }
    addParam("n", n)
    result = double(n)
    addResult(result)
 - NUNCA termines el código con registerEndpoint ni con ninguna llamada de registro.
 FORMATO DE SALIDA: responde ÚNICAMENTE con un array JSON válido.
 Sin texto adicional, sin bloques de código markdown, sin explicaciones.
 Estructura exacta de cada elemento:
 {
  "task_id": <número entero>,
  "text": "<enunciado del problema en español>",
  "code": "<código AVAP con saltos de línea como \\n>",
  "test_list": ["<expr_python_1>", "<expr_python_2>"]
 }
 FORMATO DE test_list — MUY IMPORTANTE:
 Cada aserción debe ser una expresión Python con re.match() o re.search()
 evaluable directamente sobre las variables del stack AVAP (disponibles como
 variables Python locales). El módulo 're' está siempre disponible.
 La expresión debe devolver un match object (truthy) si el test pasa.
 Reglas estrictas:
 - USA ÚNICAMENTE re.match(r'<patrón>', <variable>) o re.search(r'<patrón>', str(<variable>))
 - Convierte a string si es necesario: re.match(r'^\\d+$', str(result))
 - Puedes encadenar con 'and': re.match(r'^[a-zA-Z0-9]{32}$', token) and re.match(r'.{32}', token)
 - Las variables referenciadas deben existir en el stack tras ejecutar el código.
 - NUNCA uses comparaciones directas (==, !=, >, <).
 - NUNCA uses isinstance(), len(), assert, ni texto descriptivo.
 - NUNCA uses nada que no sea re.match() o re.search().
 Ejemplos correctos de test_list:
  "re.match(r'^[a-f0-9]{64}$', hashed)"
  "re.match(r'^[a-zA-Z0-9]{32}$', token)"
  "re.match(r'^\\d{4}-\\d{2}-\\d{2}$', date_str)"
  "re.search(r'Hello', result)"
  "re.match(r'^-?\\d+(\\.\\d+)?$', str(result))"
  "re.match(r'^(par|impar)$', result)"
  "re.match(r'^40[134]$', str(_status))"
  "re.match(r'^\\d+$', str(length))"
 """
 def build_user_prompt(lrm: str, category: str, count: int, start_id: int):
    """Assemble the user message for one generation request.

    The message embeds the full LRM text first, then the task statement:
    how many problems to produce, for which category, and the first
    task_id to use.

    Args:
        lrm: Full text of the AVAP Language Reference Manual.
        category: Category label the problems must belong to.
        count: Exact number of problems to request.
        start_id: task_id of the first problem; the rest are consecutive.

    Returns:
        The rendered prompt as a single string.
    """
    rendered = f"""# LRM AVAP — Language Reference Manual
 {lrm}
 ---
 # TAREA
 Genera exactamente {count} problemas de benchmark MBPP-AVAP para la categoría:
 **{category}**
 Requisitos:
 - Los task_id deben comenzar en {start_id} y ser consecutivos.
 - Cada problema debe cubrir un aspecto distinto de la categoría.
 - Dificultad variada: algunos simples, algunos intermedios, alguno avanzado.
 - El código debe ser realista como endpoint de microservicio HTTP en AVAP.
 - Incluye 2-3 aserciones descriptivas en test_list por problema.
 Responde ÚNICAMENTE con el array JSON. Sin texto antes ni después.
 """
    return rendered
 def parse_response(raw: str):
    """Parse the raw model answer into a validated list of problem dicts.

    A surrounding markdown code fence is stripped if present. If the text
    is still not valid JSON, the span between the first "[" and the last
    "]" is tried, so stray prose around the array does not cost a retry.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded list; every element is a dict containing the keys
        "task_id", "text", "code" and "test_list".

    Raises:
        json.JSONDecodeError: if no valid JSON payload can be decoded.
        ValueError: if the payload is not a list, an element is not an
            object, or a required field is missing.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (with or without a language tag) and
        # the closing fence when there is one.
        lines = text.splitlines()
        inner = lines[1:]
        if inner and inner[-1].strip() == "```":
            inner = inner[:-1]
        text = "\n".join(inner).strip()
    try:
        problems = json.loads(text)
    except json.JSONDecodeError:
        # Tolerate text before/after the array; re-raise the original error
        # when there is no bracketed span to fall back on.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        problems = json.loads(text[start:end + 1])
    if not isinstance(problems, list):
        raise ValueError("answer is not a JSON array.")
    required = ("task_id", "text", "code", "test_list")
    for index, problem in enumerate(problems):
        # A non-dict element would otherwise fail with TypeError (or do a
        # substring test on a string), which callers do not treat as a
        # parse error.
        if not isinstance(problem, dict):
            raise ValueError(f"problem #{index} is not a JSON object.")
        for field in required:
            if field not in problem:
                raise ValueError(f"field '{field}' not found in a problem.")
    return problems
 def call_api(
    client: anthropic.Anthropic,
    lrm: str,
    category: str,
    count: int,
    start_id: int,
    retries: int = 3,
    model: str = "claude-sonnet-4-20250514",
    max_tokens: int = 8000,
 ):
    """Request one batch of problems from the model, retrying on failure.

    Args:
        client: Anthropic client used to send the request.
        lrm: Full LRM text embedded in the user prompt.
        category: Category label for this batch.
        count: Number of problems requested; extra items are discarded.
        start_id: task_id given to the first returned problem.
        retries: Maximum number of attempts.
        model: Model identifier passed to the Messages API.
        max_tokens: Output token budget passed to the Messages API.

    Returns:
        A non-empty list of at most `count` problem dicts whose task_id
        values are rewritten to start_id, start_id + 1, ...

    Raises:
        RuntimeError: when every attempt failed; the last underlying error
            is attached as the cause.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            message = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=SYSTEM_PROMPT,
                messages=[
                    {
                        "role": "user",
                        "content": build_user_prompt(lrm, category, count, start_id),
                    }
                ],
            )
            raw = message.content[0].text
            problems = parse_response(raw)
            # An empty batch would make the caller's "remaining" counter
            # stall forever, so treat it as a failed attempt.
            if not problems:
                raise ValueError("model returned an empty problem list.")
            # Never hand back more than was asked for.
            problems = problems[:count]
            # Model-chosen ids are not trusted; renumber consecutively.
            for offset, problem in enumerate(problems):
                problem["task_id"] = start_id + offset
            return problems
        except (
            json.JSONDecodeError,
            ValueError,
            KeyError,
            IndexError,
            AttributeError,
            TypeError,
        ) as e:
            # IndexError/AttributeError: empty or non-text content block.
            # TypeError: malformed items that slipped past validation.
            last_error = e
            print(f"\n   Attempt {attempt}/{retries} — parser error: {e}")
            if attempt < retries:
                time.sleep(2 ** attempt)
        except anthropic.RateLimitError as e:
            last_error = e
            wait = 30 * attempt
            if attempt < retries:
                print(f"\n   Rate limit. waiting {wait}s...")
                time.sleep(wait)
            else:
                # No point sleeping when there is no attempt left.
                print("\n   Rate limit. No attempts left.")
        except anthropic.APIError as e:
            last_error = e
            print(f"\n   API error at attempt {attempt}: {e}")
            if attempt < retries:
                time.sleep(5)
    raise RuntimeError(
        f"Cant generate problems '{category}' since {retries} trys."
    ) from last_error
 def scale_categories(target: int, categories: "list[tuple[str, int]] | None" = None):
    """Distribute `target` problems across categories in proportion to weight.

    Uses largest-remainder apportionment: each category first gets the
    floor of its proportional share, then the units still missing go one
    each to the categories with the largest fractional remainder (ties
    resolved in list order). The returned counts always add up to exactly
    `target`, and the rounding error is spread out instead of being dumped
    on the last category.

    Args:
        target: Total number of problems wanted; must be >= 1.
        categories: Optional (label, weight) pairs. Defaults to CATEGORIES.

    Returns:
        A list of (label, count) pairs in the original order. Categories
        whose share rounds down to zero are omitted (this only happens
        when `target` is smaller than the number of categories).

    Raises:
        ValueError: if `target` is below 1 or the weights do not sum to a
            positive number.
    """
    if target < 1:
        raise ValueError("target must be a positive integer.")
    cats = CATEGORIES if categories is None else list(categories)
    base = sum(weight for _, weight in cats)
    if base <= 0:
        raise ValueError("category weights must sum to a positive number.")
    # Integer arithmetic keeps the split exact (no float rounding).
    counts = [weight * target // base for _, weight in cats]
    remainders = [weight * target % base for _, weight in cats]
    leftover = target - sum(counts)
    # sorted() is stable, so equal remainders keep their list order.
    by_remainder = sorted(range(len(cats)), key=lambda i: -remainders[i])
    for i in by_remainder[:leftover]:
        counts[i] += 1
    return [(label, n) for (label, _), n in zip(cats, counts) if n > 0]
 def main():
    """Command-line entry point: generate the dataset and write it as JSON.

    Reads the LRM file, scales the category plan to --problems, requests
    the problems in batches of at most PROBLEMS_PER_CALL, and saves the
    accumulated list after every category. When a batch cannot be
    generated, whatever was collected so far is written to a ".partial"
    file and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Create a bunch of samples of code from an LRM."
    )
    parser.add_argument(
        "--lrm",
        default="avap.md",
        help="Path to AVAP LRM (default: avap.md)",
    )
    parser.add_argument(
        "--output",
        default="output/mbpp_avap.json",
        help="Output JSON file (default: output/mbpp_avap.json)",
    )
    parser.add_argument(
        "--problems",
        type=int,
        default=300,
        help="Total problems number to generate (default: 300)",
    )
    parser.add_argument(
        "--api-key",
        default=None,
        help="Anthropic API key",
    )
    args = parser.parse_args()
    # Zero or negative totals have no sensible category split.
    if args.problems < 1:
        parser.error("--problems must be a positive integer")
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        sys.exit(
            "ERROR: API key not found.\n"
            "  Export variable:  export ANTHROPIC_API_KEY=sk-ant-...\n"
            "  Or indicate with:         --api-key sk-ant-..."
        )
    lrm_path = Path(args.lrm)
    if not lrm_path.exists():
        sys.exit(
            f"ERROR: file '{lrm_path}' not found.\n"
            "  Put avap.md in actual directory or use --lrm <path>."
        )
    lrm = lrm_path.read_text(encoding="utf-8")
    print(f" Source LRM: {lrm_path} ")
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    categories = scale_categories(args.problems)
    # Planned number of requests (ceil division per category); used only
    # for the progress display.
    total_calls = sum((n + PROBLEMS_PER_CALL - 1) // PROBLEMS_PER_CALL for _, n in categories)
    print(f" Problems    : {args.problems}")
    print(f" Output file : {output_path}\n")
    print("────────────────────────────────────────────────────────────")
    client       = anthropic.Anthropic(api_key=api_key)
    all_problems: list[dict] = []
    task_id      = 1
    call_count   = 0
    for cat_idx, (category, total_cat) in enumerate(categories, 1):
        print(f"\n[{cat_idx:02d}/{len(categories)}] {category}  ({total_cat} problems)")
        remaining = total_cat
        batch_num = 0
        while remaining > 0:
            batch_size = min(PROBLEMS_PER_CALL, remaining)
            batch_num += 1
            call_count += 1
            print(
                f"  Batch {batch_num}  |  task_id {task_id}–{task_id + batch_size - 1}  "
                f"|  API Call {call_count}/{total_calls} ... ",
                end="",
                flush=True,
            )
            try:
                batch = call_api(client, lrm, category, batch_size, task_id)
            except RuntimeError as e:
                print(f"\n {e}")
                if all_problems:
                    _save(all_problems, output_path, partial=True)
                sys.exit(1)
            # Keep at most what was requested so the category total (and
            # the task_id sequence) cannot overshoot.
            batch = batch[:batch_size]
            if not batch:
                # Without progress "remaining" never decreases; stop rather
                # than issue API calls forever.
                print("\n Empty batch received; aborting.")
                if all_problems:
                    _save(all_problems, output_path, partial=True)
                sys.exit(1)
            all_problems.extend(batch)
            task_id   += len(batch)
            remaining -= len(batch)
            print(f"  {len(batch)} generated (total: {len(all_problems)})")
            if remaining > 0:
                # Short pause between consecutive requests.
                time.sleep(1.5)
        # Checkpoint after each category so finished work survives a later
        # failure.
        _save(all_problems, output_path, partial=False)
        print("  '- Save actual results.")
    print("\n" + "────────────────────────────────────────────────────────────")
    print(" Process completed")
    print(f" Problems generated : {len(all_problems)}")
    if all_problems:
        print(f" task_id range      : {all_problems[0]['task_id']} – {all_problems[-1]['task_id']}")
    print(f" Output file        : {output_path}")
 def _save(problems: list[dict], path: Path, partial: bool = False):
    suffix = ".partial" if partial else ""
    target = path.with_suffix(suffix + path.suffix) if partial else path
    with open(target, "w", encoding="utf-8") as f:
        json.dump(problems, f, ensure_ascii=False, indent=2)
 # Run the generator only when this file is executed as a script.
 if __name__ == "__main__":
    main()