assistance-engine/scripts/pipelines/flows/generate_mbap.py

#!/usr/bin/env python3
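"""
Generate an MBPP-style benchmark of AVAP problems from the AVAP Language
Reference Manual (LRM) using the Anthropic API, and write it to a JSON file.

Usage:
    python generate_mbap.py
    python generate_mbap.py --lrm path/to/avap.md
    python generate_mbap.py --lrm avap.md --output output/mbpp_avap.json --problems 300

Requirements:
    pip install anthropic
    export ANTHROPIC_API_KEY=sk-ant-...
"""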

import argparse
import json
import os
import sys
import time
from pathlib import Path

import anthropic

# Benchmark categories as (label, weight) pairs. The label is inserted verbatim
# into the user prompt; the weight is the category's share of the unscaled
# benchmark and is rescaled to the requested total by scale_categories().
CATEGORIES: list[tuple[str, int]] = [
    ("HTTP params / addParam / addResult / _status", 10),
    ("Variables y strings / addVar / replace / randomString", 10),
    ("Condicionales / if() Modo 1 y Modo 2 / else() / end()", 10),
    ("Bucles y listas / startLoop / itemFromList / getListLen", 10),
    ("JSON / variableFromJSON / AddVariableToJSON", 10),
    ("ORM / ormAccessSelect / ormAccessInsert / ormAccessUpdate", 10),
    ("Criptografía / encodeSHA256 / encodeMD5", 10),
    ("Fechas / getTimeStamp / getDateTime / stampToDatetime", 10),
    ("Conectores externos / avapConnector + métodos dinámicos", 10),
    ("Concurrencia / go + gather", 10),
    ("Funciones y scope / function / return()", 10),
    ("Manejo de errores / try() / exception()", 10),
    ("HTTP externo / RequestGet / RequestPost", 10),
    ("Modularidad / import / include + casos de uso complejos", 10),
]

# Sum of all category weights (the size of the unscaled benchmark).
TOTAL_PROBLEMS = sum(weight for _label, weight in CATEGORIES)

# Largest number of problems requested from the model in a single API call.
PROBLEMS_PER_CALL = 10


# System prompt sent with every generation request (passed as `system=` in
# call_api). It is written in Spanish and is runtime text: any edit changes
# what the model is asked to do.
#
# This is a regular (non-raw) triple-quoted string, so the doubled backslashes
# below (`\\n`, `\\d`, `\\.`) reach the model as the two-character sequences
# `\n`, `\d`, `\.`.
#
# NOTE(review): under "Reglas estrictas" the prompt first says each assertion
# must be a single re.match(...) and must not combine several, and two lines
# later says expressions may be chained with 'and'. These look contradictory;
# confirm which rule the evaluator expects and drop the other.
SYSTEM_PROMPT = """Eres un experto en el lenguaje AVAP.
Se te proporciona el Language Reference Manual (LRM) completo de AVAP.
Tu tarea es generar problemas de benchmark estilo MBPP para evaluar
modelos de lenguaje en su capacidad de generar código AVAP correcto.

REGLAS ESTRICTAS para el código AVAP generado:
1. Una instrucción por línea. EOL es el terminador absoluto.
2. Sin indentación significativa (es solo decorativa).
3. Bloques de control: if()...else()...end(), startLoop()...endLoop(), try()...exception()...end()
4. Funciones: function name(args) { ... return(val) }
5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
   — los argumentos NO pueden ser expresiones de acceso como dict['key'];
     hay que extraer el valor a una variable propia primero.
6. if() Modo 2: if(None, None, `expresion_completa_como_string`)
7. _status se asigna con: addVar(_status, 404)
8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
   — selector puede ser cadena vacía.
9. Acceso a campos de dict: val = dict['campo']  (línea propia, luego se usa val).
10. Genera ÚNICAMENTE código AVAP válido según el LRM. Sin Python, sin pseudocódigo.

MODO DE EJECUCIÓN — MUY IMPORTANTE:
- El código se ejecuta DIRECTAMENTE, línea a línea, sin servidor ni registro de endpoints.
- NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo.
- El código correcto es simplemente las instrucciones en línea, por ejemplo:
    result = "Hello World"
    addResult(result)
- Si el problema requiere una función auxiliar reutilizable, defínela con function...{} y llámala directamente después:
    function double(n) {
        return(n * 2)
    }
    addParam("n", n)
    result = double(n)
    addResult(result)
- NUNCA termines el código con registerEndpoint ni con ninguna llamada de registro.

FORMATO DE SALIDA: responde ÚNICAMENTE con un array JSON válido.
Sin texto adicional, sin bloques de código markdown, sin explicaciones.
Estructura exacta de cada elemento:
{
  "task_id": <número entero>,
  "text": "<enunciado del problema en español>",
  "code": "<código AVAP con saltos de línea como \\n>",
  "test_inputs": { "<param1>": <valor1>, "<param2>": <valor2> },
  "test_list": ["<expr_python_1>", "<expr_python_2>"]
}

FORMATO DE test_inputs — MUY IMPORTANTE:
- Es un objeto JSON con un valor fijo para cada variable que el código recibe via addParam().
- Los nombres de las claves deben coincidir EXACTAMENTE con el nombre de variable usado en addParam().
- Los valores deben ser concretos y representativos del problema (no genéricos como "test" o 123).
- Si el código no tiene ningún addParam(), el campo test_inputs debe ser un objeto vacío: {}
- Estos valores son los que el evaluador inyectará en el stack antes de ejecutar el código,
  de modo que las aserciones de test_list puedan validar las variables de salida resultantes.

Ejemplo con addParam:
  código:       addParam("password", password)\\nencodeSHA256(password, hashed)\\naddResult(hashed)
  test_inputs:  { "password": "secret123" }
  test_list:    ["re.match(r'^[a-f0-9]{64}$', hashed)"]

Ejemplo sin addParam:
  código:       randomString(16, token)\\naddResult(token)
  test_inputs:  {}
  test_list:    ["re.match(r'^[a-zA-Z0-9]{16}$', token)"]

FORMATO DE test_list — MUY IMPORTANTE:
Cada aserción debe ser una expresión Python con re.match()
evaluable directamente sobre las variables del stack AVAP (disponibles como
variables Python locales). El módulo 're' está siempre disponible.
La expresión debe devolver un match object (truthy) si el test pasa.

Reglas estrictas:
- USA ÚNICAMENTE re.match(r'<patrón>', <variable>)
- NO combines expresiones re.match en una aserción, cada asercion tiene que ser un unico re.match(r'<patrón>', <variable>)
- Convierte a string si es necesario: re.match(r'^\\d+$', str(result))
- Puedes encadenar con 'and': re.match(r'^[a-zA-Z0-9]{32}$', token) and re.match(r'.{32}', token)
- Las variables referenciadas deben existir en el stack tras ejecutar el código.
- NUNCA uses comparaciones directas (==, !=, >, <).
- NUNCA uses isinstance(), len(), assert, ni texto descriptivo.
- NUNCA uses nada que no sea re.match().

Ejemplos correctos de test_list:
  "re.match(r'^[a-f0-9]{64}$', hashed)"
  "re.match(r'^[a-zA-Z0-9]{32}$', token)"
  "re.match(r'^\\d{4}-\\d{2}-\\d{2}$', date_str)"
  "re.match(r'^-?\\d+(\\.\\d+)?$', str(result))"
  "re.match(r'^(par|impar)$', result)"
  "re.match(r'^40[134]$', str(_status))"
  "re.match(r'^\\d+$', str(length))"
"""


def build_user_prompt(lrm: str, category: str, count: int, start_id: int):
    """Assemble the user message for one generation request.

    The message is the full LRM text followed by a task section that names
    the category, the number of problems wanted and the first task_id.

    Args:
        lrm: Full text of the Language Reference Manual, embedded verbatim.
        category: Category label the problems must belong to.
        count: Number of problems to ask for.
        start_id: task_id the model is told to start numbering from.

    Returns:
        The prompt as a single newline-terminated string.
    """
    manual_section = [
        "# LRM AVAP — Language Reference Manual",
        "",
        f"{lrm}",
        "",
        "---",
        "",
    ]
    task_section = [
        "# TAREA",
        "",
        f"Genera exactamente {count} problemas de benchmark MBPP-AVAP para la categoría:",
        "",
        f"**{category}**",
        "",
        "Requisitos:",
        f"- Los task_id deben comenzar en {start_id} y ser consecutivos.",
        "- Cada problema debe cubrir un aspecto distinto de la categoría.",
        "- Dificultad variada: algunos simples, algunos intermedios, alguno avanzado.",
        "- El código debe ser realista como endpoint de microservicio HTTP en AVAP.",
        "- Incluye 2-3 aserciones descriptivas en test_list por problema.",
        "",
        "Responde ÚNICAMENTE con el array JSON. Sin texto antes ni después.",
    ]
    # The trailing empty element yields the final newline of the prompt.
    return "\n".join(manual_section + task_section + [""])


def parse_response(raw: str) -> list[dict]:
    """Parse and validate the model's reply as a list of problem objects.

    A surrounding Markdown code fence (``` or ```json) is stripped before
    decoding. Every element must be a JSON object containing ``task_id``,
    ``text``, ``code`` and ``test_list``; ``test_inputs`` is optional and
    defaults to an empty object.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded problems, each guaranteed to have a dict ``test_inputs``.

    Raises:
        ValueError: If the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass), is not an array, or an element fails
            validation.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, when present, the closing fence.
        lines = text.splitlines()
        inner = lines[1:]
        if inner and inner[-1].strip() == "```":
            inner = inner[:-1]
        text = "\n".join(inner).strip()

    problems = json.loads(text)

    if not isinstance(problems, list):
        raise ValueError("response is not an JSON array")

    for index, p in enumerate(problems):
        # Reject non-object elements here with ValueError; otherwise the
        # checks below fail with AttributeError/TypeError, which callers that
        # retry on ValueError would not handle.
        if not isinstance(p, dict):
            raise ValueError(
                f"Element {index} of the response is not a JSON object "
                f"(got {type(p).__name__})."
            )
        for field in ("task_id", "text", "code", "test_list"):
            if field not in p:
                raise ValueError(f"Field missing '{field}' in task_id={p.get('task_id','?')}.")
        if not isinstance(p["test_list"], list):
            raise ValueError(f"'test_list' must be a JSON array (task_id={p.get('task_id','?')}).")
        if "test_inputs" not in p:
            p["test_inputs"] = {}
        if not isinstance(p["test_inputs"], dict):
            raise ValueError(f"'test_inputs' must by a JSON Object (task_id={p.get('task_id','?')}).")

    return problems


def call_api(
    client: anthropic.Anthropic,
    lrm: str,
    category: str,
    count: int,
    start_id: int,
    retries: int = 3,
) -> list[dict]:
    """Request one batch of problems for *category*, retrying on failure.

    Args:
        client: Anthropic client used to send the request.
        lrm: Full LRM text embedded in the user prompt.
        category: Category label the problems must belong to.
        count: Number of problems requested; surplus problems are discarded.
        start_id: task_id assigned to the first returned problem.
        retries: Maximum number of attempts.

    Returns:
        Between 1 and ``count`` problems whose ``task_id`` values are
        ``start_id, start_id + 1, ...`` (the ids chosen by the model are
        overwritten).

    Raises:
        RuntimeError: If no attempt produced a usable, non-empty batch. The
            last underlying error is attached as ``__cause__``.
    """
    last_error = None

    for attempt in range(1, retries + 1):
        try:
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                system=SYSTEM_PROMPT,
                messages=[
                    {
                        "role": "user",
                        "content": build_user_prompt(lrm, category, count, start_id),
                    }
                ],
            )
            # The reply may have no content blocks, or a first block that
            # carries no text; treat both as a malformed response and retry.
            first_block = message.content[0] if message.content else None
            raw = getattr(first_block, "text", None)
            if not isinstance(raw, str):
                raise ValueError("response has no text content")

            problems = parse_response(raw)
            # An empty array would make the caller loop without progress.
            if not problems:
                raise ValueError("response contains no problems")

            # Keep at most `count` so the caller's quota and ids stay aligned.
            problems = problems[:count]
            for i, problem in enumerate(problems):
                problem["task_id"] = start_id + i

            return problems

        except (json.JSONDecodeError, ValueError, KeyError, TypeError, AttributeError) as e:
            last_error = e
            print(f"\n   Attempt {attempt}/{retries} — parser error: {e}")
            if attempt < retries:
                time.sleep(2 ** attempt)

        except anthropic.RateLimitError as e:
            # Must precede the generic APIError handler below.
            last_error = e
            if attempt < retries:
                wait = 30 * attempt
                print(f"\n   Rate limit. waiting {wait}s...")
                time.sleep(wait)
            else:
                # No attempt left: waiting would only delay the failure.
                print("\n   Rate limit. no attempts left.")

        except anthropic.APIError as e:
            last_error = e
            print(f"\n   API error at attempt {attempt}: {e}")
            if attempt < retries:
                time.sleep(5)

    raise RuntimeError(
        f"Cant generate problems '{category}' since {retries} trys."
    ) from last_error


def scale_categories(target: int, categories=None) -> list[tuple[str, int]]:
    """Split *target* problems across categories in proportion to their weights.

    Uses largest-remainder apportionment: every category first receives the
    integer part of its proportional share, then the remaining problems go one
    each to the categories with the largest fractional parts (ties resolved in
    list order). The counts therefore always sum to exactly ``target`` and
    the rounding difference is spread out instead of landing on one category.

    Args:
        target: Total number of problems wanted; must be at least 1.
        categories: Optional list of ``(label, weight)`` pairs. Defaults to
            the module-level ``CATEGORIES``.

    Returns:
        ``(label, count)`` pairs in the original order. Categories whose
        share is zero (possible when ``target`` is smaller than the number of
        categories) are omitted.

    Raises:
        ValueError: If ``target`` is below 1 or the total weight is not
            positive.
    """
    if categories is None:
        categories = CATEGORIES
    if target < 1:
        raise ValueError(f"target must be a positive integer, got {target}")

    base = sum(weight for _, weight in categories)
    if base <= 0:
        raise ValueError("categories must have a positive total weight")

    # Integer arithmetic avoids float rounding: quotient is the guaranteed
    # share, remainder ranks who gets the leftover problems.
    shares = [divmod(weight * target, base) for _, weight in categories]
    counts = [quotient for quotient, _ in shares]
    leftover = target - sum(counts)

    # sorted() is stable even with reverse=True, so ties keep list order.
    by_remainder = sorted(
        range(len(shares)), key=lambda i: shares[i][1], reverse=True
    )
    for i in by_remainder[:leftover]:
        counts[i] += 1

    return [
        (label, n)
        for (label, _), n in zip(categories, counts)
        if n > 0
    ]


def main():
    """Command-line entry point: generate the benchmark and write it to JSON.

    Reads the LRM file, scales the per-category quotas to ``--problems``,
    requests the problems in batches of at most ``PROBLEMS_PER_CALL`` and
    rewrites the output file after every completed category. If a batch
    cannot be obtained, whatever was generated so far is written to a
    ``.partial`` sibling of the output file and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser(
        description="Create a bunch of samples of code from an LRM."
    )
    parser.add_argument(
        "--lrm",
        default="avap.md",
        help="Path to AVAP LRM (default: avap.md)",
    )
    parser.add_argument(
        "--output",
        default="output/mbpp_avap.json",
        help="Output JSON file (default: output/mbpp_avap.json)",
    )
    parser.add_argument(
        "--problems",
        type=int,
        default=300,
        help="Total problems number to generate (default: 300)",
    )
    parser.add_argument(
        "--api-key",
        default=None,
        help="Anthropic API key",
    )
    args = parser.parse_args()

    # A zero or negative total cannot be scheduled; fail before any API call.
    if args.problems < 1:
        parser.error("--problems must be a positive integer")

    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        sys.exit(
            "ERROR: API key not found.\n"
            "  Export variable:  export ANTHROPIC_API_KEY=sk-ant-...\n"
            "  Or indicate with:         --api-key sk-ant-..."
        )

    lrm_path = Path(args.lrm)
    if not lrm_path.exists():
        sys.exit(
            f"ERROR: file '{lrm_path}' not found.\n"
            f"  Put avap.md in actual directory or use --lrm <path>."
        )
    lrm = lrm_path.read_text(encoding="utf-8")
    print(f" Source LRM: {lrm_path} ")

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    categories = scale_categories(args.problems)
    # Expected number of requests if every call returns a full batch.
    total_calls = sum((n + PROBLEMS_PER_CALL - 1) // PROBLEMS_PER_CALL for _, n in categories)

    print(f" Problems    : {args.problems}")
    print(f" Output file : {output_path}\n")
    print("────────────────────────────────────────────────────────────")

    client       = anthropic.Anthropic(api_key=api_key)
    all_problems: list[dict] = []
    task_id      = 1
    call_count   = 0

    for cat_idx, (category, total_cat) in enumerate(categories, 1):
        print(f"\n[{cat_idx:02d}/{len(categories)}] {category}  ({total_cat} problems)")

        remaining = total_cat
        batch_num = 0

        while remaining > 0:
            batch_size = min(PROBLEMS_PER_CALL, remaining)
            batch_num += 1
            call_count += 1

            print(
                f"  Batch {batch_num}  |  task_id {task_id}–{task_id + batch_size - 1}  "
                f"|  API Call {call_count}/{total_calls} ... ",
                end="",
                flush=True,
            )

            try:
                batch = call_api(client, lrm, category, batch_size, task_id)
            except RuntimeError as e:
                print(f"\n {e}")
                if all_problems:
                    _save(all_problems, output_path, partial=True)
                sys.exit(1)

            # An empty batch leaves `remaining` unchanged, so this loop would
            # keep calling the API forever; stop and keep what we have.
            if not batch:
                print("\n API returned an empty batch; aborting.")
                if all_problems:
                    _save(all_problems, output_path, partial=True)
                sys.exit(1)

            # Discard surplus problems so the category quota is not exceeded.
            # task_ids are assigned sequentially from `task_id`, so the kept
            # prefix stays consecutive.
            batch = batch[:batch_size]

            all_problems.extend(batch)
            task_id   += len(batch)
            remaining -= len(batch)
            print(f"  {len(batch)} generated (total: {len(all_problems)})")

            if remaining > 0:
                # Short pause between consecutive requests.
                time.sleep(1.5)

        # Checkpoint after each category so a later failure loses little work.
        _save(all_problems, output_path, partial=False)
        print(f"  '- Save actual results.")

    print("\n" + "────────────────────────────────────────────────────────────")
    print(" Process completed")
    print(f" Problems generated : {len(all_problems)}")
    if all_problems:
        print(f" task_id range      : {all_problems[0]['task_id']} – {all_problems[-1]['task_id']}")
    print(f" Output file        : {output_path}")


def _save(problems: list[dict], path: Path, partial: bool = False):
    """Write *problems* to disk as indented, non-ASCII-preserving JSON.

    Args:
        problems: Problem objects to serialize.
        path: Destination of a complete result.
        partial: When true, write to a sibling file whose extension is
            prefixed with ``.partial`` (``out.json`` -> ``out.partial.json``)
            and leave ``path`` itself untouched.
    """
    destination = path
    if partial:
        destination = path.with_suffix(".partial" + path.suffix)

    with destination.open("w", encoding="utf-8") as handle:
        json.dump(problems, handle, ensure_ascii=False, indent=2)


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()