ADR0008 Finished and ADR0009 finalizing

This commit is contained in:
rafa-ruiz 2026-04-13 20:27:06 -07:00
parent 0b9c19d61f
commit 6af0a84f4c
19 changed files with 630 additions and 85 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -2,4 +2,4 @@
# Do not delete.
folderID: wmoge-xmh3x
created: 2026-04-12T14:02:49-07:00
created: 2026-04-12T11:12:38-07:00

BIN
Docker/.DS_Store vendored

Binary file not shown.

View File

@ -33,7 +33,7 @@ services:
CLASSIFIER_SEED_DATASET: ${CLASSIFIER_SEED_DATASET}
CLASSIFIER_MIN_CV_ACCURACY: ${CLASSIFIER_MIN_CV_ACCURACY}
CLASSIFIER_HELD_OUT_RATIO: ${CLASSIFIER_HELD_OUT_RATIO}
PROXY_THREAD_WORKERS: 10
extra_hosts:
- "host.docker.internal:host-gateway"

View File

@ -1,3 +1,4 @@
import json as _json
import logging
import os
import re as _re
@ -28,6 +29,7 @@ from prompts import (
GENERATE_PROMPT,
PLATFORM_PROMPT,
REFORMULATE_PROMPT,
TEST_GENERATION_PROMPT,
)
from state import AgentState, ClassifyEntry
@ -36,7 +38,7 @@ logger = logging.getLogger(__name__)
# ── AVAP Parser client — ADR-0009 (PTVL) ──────────────────────────────────────
_PARSER_URL = os.getenv("AVAP_PARSER_URL", "")
_PARSER_URL = os.getenv("AVAP_PARSER_URL", "http://45.77.193.144:8888")
_PARSER_TIMEOUT = int(os.getenv("AVAP_PARSER_TIMEOUT", "2"))
_CB_THRESHOLD = int(os.getenv("PARSER_CB_THRESHOLD", "3"))
_CB_COOLDOWN = int(os.getenv("PARSER_CB_COOLDOWN", "30"))
@ -87,23 +89,39 @@ class _CircuitBreaker:
_parser_cb = _CircuitBreaker(_CB_THRESHOLD, _CB_COOLDOWN)
def _strip_thinking(text: str) -> str:
"""Remove qwen3 <think>...</think> blocks from LLM output."""
text = _re.sub(r"<think>.*?</think>", "", text, flags=_re.DOTALL)
# Also strip a lone closing tag if the model omitted the opening one
text = _re.sub(r"</think>", "", text)
return text.strip()
def _extract_avap_code(text: str) -> str:
    """Pull the first fenced AVAP code block out of an LLM response.

    Thinking tags are stripped first. Fence styles are tried from most to
    least specific; when no fence matches, the cleaned text is returned
    as-is.
    """
    cleaned = _strip_thinking(text)
    fence_patterns = (
        r'```avap\s*\n(.*?)```',
        r'```\s*\n(.*?)```',
        r'```(.*?)```',
    )
    for fence in fence_patterns:
        if (match := _re.search(fence, cleaned, _re.DOTALL)) is not None:
            return match.group(1).strip()
    return cleaned
def _call_parser(text: str) -> tuple:
def _call_parser(text: str, test_inputs: dict = None, test_list: list = None) -> tuple:
"""Call AVAP Parser REST API.
Tries /api/v1/execute first (executes the code, catches runtime errors).
If test_inputs/test_list are provided, also validates assertions.
Falls back to /parse (AST-only validation) if /api/v1/execute is not available.
Returns:
(True, "") code valid
(False, trace) code invalid, trace contains the error
(True, "") code valid and executed successfully (assertions passed if provided)
(False, trace) code invalid, runtime error, or failed assertions
(None, "") parser unavailable or circuit open
"""
if not _PARSER_URL or _PARSER_TIMEOUT == 0:
return None, ""
@ -111,27 +129,120 @@ def _call_parser(text: str) -> tuple:
return None, ""
code = _extract_avap_code(text)
logger.info(f"[ptvl] extracted code ({len(code)} chars): {repr(code[:120])}")
if not code.strip():
return None, ""
base_url = _PARSER_URL.rstrip('/')
try:
payload = {
"code": code,
"test_inputs": test_inputs or {},
"test_list": test_list or [],
}
resp = _requests.post(
f"{_PARSER_URL.rstrip('/')}/parse",
json={"code": code},
f"{base_url}/api/v1/run",
json=payload,
timeout=_PARSER_TIMEOUT,
)
if resp.status_code == 404:
# /api/v1/run not deployed yet — fall back to /parse
logger.info("[ptvl] /api/v1/run not available, falling back to /parse")
return _call_parser_parse(base_url, code)
data = resp.json()
if data.get("valid", False):
_parser_cb.success()
return True, ""
_parser_cb.success() # parser responded — it is healthy
return False, data.get("error", "parse error")
_parser_cb.success()
logger.info(f"[ptvl] parser response: {data}")
if not data.get("success", False):
error = data.get("error", "")
if not error:
failed_logs = [l for l in data.get("logs", []) if not l.get("success")]
if failed_logs:
error = failed_logs[0].get("error", "runtime error")
return False, error or "runtime error"
# Execution succeeded — check assertion result if assertions were provided
if test_list and not data.get("assertion_result", True):
return False, "assertion failed: the code ran but did not produce the expected output"
return True, ""
except Exception as exc:
_parser_cb.failure()
logger.warning(f"[ptvl] parser call failed: {exc}")
return None, ""
def _call_parser_parse(base_url: str, code: str) -> tuple:
    """Validate *code* against the AST-only ``/parse`` endpoint.

    Fallback used when the execution endpoint is unavailable. Follows the
    same tuple contract as ``_call_parser``: ``(True, "")`` when the code
    parses, ``(False, error)`` when it does not, and ``(None, "")`` when
    the parser itself cannot be reached.
    """
    try:
        response = _requests.post(
            f"{base_url}/parse",
            json={"code": code},
            timeout=_PARSER_TIMEOUT,
        )
        body = response.json()
        # Any well-formed response counts as a healthy parser.
        _parser_cb.success()
        logger.info(f"[ptvl] parser response (/parse fallback): {body}")
        if body.get("valid", False):
            return True, ""
        return False, body.get("error", "parse error")
    except Exception as exc:
        _parser_cb.failure()
        logger.warning(f"[ptvl] /parse fallback failed: {exc}")
        return None, ""
# ── Test generation helper — used by both build_graph and AskAgentStream ───────
def _run_generate_tests(user_request: str, generated_code: str, llm) -> tuple:
    """Generate test_inputs + test_list for the given AVAP code.

    Asks the LLM (via TEST_GENERATION_PROMPT) for a JSON object containing
    ``test_inputs`` (request-parameter values injected when the parser
    executes the code) and ``test_list`` (regex assertion strings evaluated
    after execution).

    Returns:
        (test_inputs: dict, test_list: list). Never raises — falls back to
        ({}, []) on timeout or any LLM/parse error so the caller can still
        validate the code without assertions.
    """
    import concurrent.futures

    def _run():
        # /no_think suppresses qwen3 thinking mode; _strip_thinking below
        # removes any tags that leak through anyway.
        prompt = "/no_think\n\n" + TEST_GENERATION_PROMPT.format(
            user_request=user_request,
            generated_code=generated_code,
        )
        resp = llm.invoke([SystemMessage(content=prompt)])
        raw = _strip_thinking(resp.content)
        logger.info(f"[generate_tests] raw output: {repr(raw[:200])}")
        # Tolerate a markdown code fence around the JSON despite the
        # "raw JSON only" instruction in the prompt.
        if raw.startswith("```"):
            raw = _re.sub(r"^```[a-z]*\n?", "", raw)
            raw = raw.rstrip("`").strip()
        # Keep only the outermost {...} span in case the model added prose.
        m = _re.search(r'\{.*\}', raw, _re.DOTALL)
        if m:
            raw = m.group(0)
        data = _json.loads(raw)
        if not isinstance(data, dict):
            raise ValueError(f"expected JSON object, got {type(data).__name__}: {repr(raw[:80])}")
        return data.get("test_inputs", {}), data.get("test_list", [])

    # BUGFIX: the previous version used `with ThreadPoolExecutor(...) as ex:`.
    # When future.result() timed out, exiting the `with` block still called
    # shutdown(wait=True), which blocks until the hung LLM call returns —
    # silently defeating the 15 s timeout. Manage the executor explicitly and
    # shut it down without waiting.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(_run)
    try:
        test_inputs, test_list = future.result(timeout=15)
        logger.info(f"[generate_tests] {len(test_list)} assertions generated")
    except concurrent.futures.TimeoutError:
        logger.warning("[generate_tests] timed out after 15s — skipping assertions")
        test_inputs, test_list = {}, []
    except Exception as exc:
        logger.warning(f"[generate_tests] skipped ({type(exc).__name__}): {exc}")
        test_inputs, test_list = {}, []
    finally:
        # Return immediately; an in-flight LLM call is abandoned rather than
        # awaited. cancel_futures drops the task if it never started (3.9+).
        executor.shutdown(wait=False, cancel_futures=True)
    return test_inputs, test_list
# ── Session stores ─────────────────────────────────────────────────────────────
session_store: dict[str, list] = defaultdict(list)
@ -593,9 +704,18 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
)
resp = llm.invoke([prompt] + state["messages"])
logger.info(f"[generate_code] {len(resp.content)} chars")
#logger.info(resp.content)
_persist(state, resp)
return {"messages": [resp]}
def generate_tests(state: AgentState) -> AgentState:
    """Derive runtime test data from the conversation's last exchange.

    The second-to-last message is taken as the user's request and the last
    message as the generated code; with fewer than two messages the last
    message serves both roles. Results are stored in state for the
    validate_code node.
    """
    messages = state["messages"]
    generated = messages[-1]
    user_msg = messages[-2] if len(messages) >= 2 else generated

    def _text(message):
        # LangChain messages expose .content; fall back to str() for others.
        return getattr(message, "content", str(message))

    test_inputs, test_list = _run_generate_tests(_text(user_msg), _text(generated), llm)
    return {"test_inputs": test_inputs, "test_list": test_list}
def respond_conversational(state):
extra_context = state.get("extra_context", "")
if extra_context:
@ -629,7 +749,11 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
def validate_code(state: AgentState) -> AgentState:
last_msg = state["messages"][-1]
content = getattr(last_msg, "content", str(last_msg))
valid, trace = _call_parser(content)
valid, trace = _call_parser(
content,
test_inputs=state.get("test_inputs") or {},
test_list=state.get("test_list") or [],
)
if valid is None:
logger.warning("[ptvl] parser unavailable — returning unvalidated")
return {"validation_status": "PARSER_UNAVAILABLE", "parser_trace": ""}
@ -645,9 +769,15 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
feedback = (
"\n\n<parser_feedback>\n"
"The previous attempt produced invalid AVAP code. Specific failures:\n\n"
"The previous attempt failed when the AVAP code was executed. "
"The execution engine reported the following error:\n\n"
f"{parser_trace}\n\n"
"Correct these errors. Do not repeat the same constructs.\n"
"Rules to fix it:\n"
"- Read the error carefully — it identifies the exact command or line that failed.\n"
"- Do NOT repeat the same construct that caused the error.\n"
"- Only use commands from <avap_syntax_reminder> or <context>.\n"
"- If the error mentions an unknown command, replace it with the correct AVAP equivalent.\n"
"- If the error mentions a variable, make sure it is declared before use.\n"
"</parser_feedback>"
) if parser_trace else ""
@ -738,6 +868,7 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
graph_builder.add_node("retrieve", retrieve)
graph_builder.add_node("generate", generate)
graph_builder.add_node("generate_code", generate_code)
graph_builder.add_node("generate_tests", generate_tests)
graph_builder.add_node("validate_code", validate_code)
graph_builder.add_node("generate_code_retry", generate_code_retry)
graph_builder.add_node("validate_code_after_retry",validate_code_after_retry)
@ -771,8 +902,9 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
}
)
# CODE_GENERATION path: generate → validate → (retry if invalid) → END
graph_builder.add_edge("generate_code", "validate_code")
# CODE_GENERATION path: generate → generate_tests → validate → (retry if invalid) → END
graph_builder.add_edge("generate_code", "generate_tests")
graph_builder.add_edge("generate_tests", "validate_code")
graph_builder.add_conditional_edges(
"validate_code",
route_after_validate,

View File

@ -189,11 +189,12 @@ CODE_GENERATION_PROMPT = SystemMessage(
"4. Write the MINIMUM code needed. No extra connectors, no unrelated variables.\n"
"5. Add brief inline comments explaining each part.\n"
"6. Answer in the same language the user used.\n"
"7. Do NOT use registerEndpoint unless the user explicitly asks to configure, "
"register, or set up an endpoint, API route, or HTTP handler. "
"For all other requests, write the logic directly without endpoint registration.\n"
"</critical_rules>\n\n"
"<avap_syntax_reminder>\n"
"// Register an HTTP endpoint\n"
"registerEndpoint(\"GET\", \"/path\", [], \"scope\", handlerFn, \"\")\n\n"
"// Declare a function — uses curly braces, NOT end()\n"
"function handlerFn() {{\n"
" msg = \"Hello World\"\n"
@ -220,7 +221,10 @@ CODE_GENERATION_PROMPT = SystemMessage(
" // ...\n"
"exception(errVar)\n"
" // handle\n"
"end()\n"
"end()\n\n"
"// Register an HTTP endpoint — USE ONLY when the user explicitly asks to\n"
"// configure, register, or set up an endpoint, API route, or HTTP handler.\n"
"registerEndpoint(\"GET\", \"/path\", [], \"scope\", handlerFn, \"\")\n"
"</avap_syntax_reminder>\n\n"
"<task>\n"
@ -237,6 +241,51 @@ CODE_GENERATION_PROMPT = SystemMessage(
)
)
# Prompt for the generate_tests node (ADR-0009). Filled via str.format() with
# user_request and generated_code, so every literal brace in the JSON example
# is escaped as {{ / }} — only the two placeholders below are real fields.
# The model must answer with a raw JSON object:
#   {"test_inputs": {...}, "test_list": ["re.match(r'...', str(var))", ...]}
TEST_GENERATION_PROMPT = (
    "<role>\n"
    "You are a test case generator for AVAP code. "
    "Given a user request and the AVAP code that was generated, "
    "produce minimal test inputs and assertions to verify the code behaves correctly.\n"
    "</role>\n\n"
    "<avap_variables_rule>\n"
    "In AVAP, variables assigned during execution are available after execution.\n"
    "Two distinct naming roles exist — do NOT confuse them:\n\n"
    "1. addParam(\"request_param_name\", avap_variable_name)\n"
    " - \"request_param_name\" (first arg, a string literal) is the HTTP request parameter. "
    "Use it as the KEY in test_inputs.\n"
    " - avap_variable_name (second arg, an identifier) is the AVAP variable that receives "
    "the value. Use it (unquoted) in assertions.\n\n"
    " Example: addParam(\"client_id\", id_interno)\n"
    " → test_inputs key: \"client_id\"\n"
    " → assertion variable: id_interno\n\n"
    "2. Direct assignments (e.g. msg = \"Hello\", result = a + b) — use the left-hand "
    "variable name (unquoted) in assertions. These variables need no test_inputs entry.\n"
    "</avap_variables_rule>\n\n"
    "<assertion_format>\n"
    "Each assertion must be a QUOTED JSON STRING in this exact format:\n"
    " \"re.match(r'<regex_pattern>', str(<avap_variable_name>))\"\n"
    "Where:\n"
    "- The entire expression is wrapped in double quotes — it is a JSON string.\n"
    "- <regex_pattern> is a regex that matches the expected value.\n"
    "- <avap_variable_name> is the AVAP variable identifier (NOT the request param name, NOT quoted).\n"
    "</assertion_format>\n\n"
    "<output_rule>\n"
    "Output ONLY a valid JSON object with exactly two keys. "
    "Every item in test_list MUST be a quoted string. Raw JSON only — "
    "no explanation, no markdown, no code block.\n\n"
    "Example — code uses addParam(\"client_id\", id_interno):\n"
    "{{\"test_inputs\": {{\"client_id\": \"12345\"}}, "
    "\"test_list\": [\"re.match(r'^\\\\d+$', str(id_interno))\"]}}\n"
    "Note: test_inputs key is \"client_id\" (request param), assertion uses id_interno (AVAP variable).\n"
    "</output_rule>\n\n"
    "<user_request>{user_request}</user_request>\n\n"
    "<generated_code>{generated_code}</generated_code>"
)
CONVERSATIONAL_PROMPT = SystemMessage(
content=(
"<role>\n"

View File

@ -14,7 +14,7 @@ from langchain_core.messages import AIMessage, SystemMessage
from utils.llm_factory import create_chat_model
from utils.emb_factory import create_embedding_model
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store, _load_layer2_model, _call_parser, _extract_avap_code
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store, _load_layer2_model, _call_parser, _extract_avap_code, _run_generate_tests
from utils.classifier_export import maybe_export, force_export
from evaluate import run_evaluation
@ -302,7 +302,11 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
complete_block = code_buffer[:close_pos + 3]
rest = code_buffer[close_pos + 3:]
valid, trace = _call_parser(complete_block)
# Generate tests before validation so the parser can
# execute the code with real inputs and check assertions.
_avap_code = _extract_avap_code(complete_block)
_ti, _tl = _run_generate_tests(query, _avap_code, active_llm)
valid, trace = _call_parser(complete_block, test_inputs=_ti, test_list=_tl)
if valid is False:
# Ask LLM to fix only the code block
@ -318,7 +322,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
fixed_code = _extract_avap_code(fix_resp.content)
fixed_block = f"{fence_open}\n{fixed_code}\n```"
valid2, _ = _call_parser(fixed_block)
valid2, _ = _call_parser(fixed_block, test_inputs=_ti, test_list=_tl)
if valid2 is False:
to_yield = fixed_block
validation_status = "INVALID_UNRESOLVED"

View File

@ -28,4 +28,7 @@ class AgentState(TypedDict):
# -- PTVL (ADR-0009)
parser_trace: str # raw parser error trace from first validation (empty if valid)
validation_status: str # "" | "INVALID_UNRESOLVED" | "PARSER_UNAVAILABLE"
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
# -- TEST GENERATION
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables

View File

@ -62,7 +62,8 @@ graph TD
│ │ ├── golden_dataset.json # Ground-truth Q&A dataset for EvaluateRAG
│ │ └── utils/
│ │ ├── emb_factory.py # Provider-agnostic embedding model factory
│ │ └── llm_factory.py # Provider-agnostic LLM factory
│ │ ├── llm_factory.py # Provider-agnostic LLM factory
│ │ └── classifier_export.py # Exports classify_history to JSONL; triggers retraining
│ ├── tests/
│ │ └── test_prd_0002.py # Unit tests — editor context, classifier, proxy parsing
│ ├── Dockerfile # Multi-stage container build
@ -82,7 +83,12 @@ graph TD
│ │ ├── ADR-0002-two-phase-streaming.md
│ │ ├── ADR-0003-hybrid-retrieval-rrf.md
│ │ ├── ADR-0004-claude-eval-judge.md
│ │ └── ADR-0005-embedding-model-selection.md
│ │ ├── ADR-0005-embedding-model-selection.md
│ │ ├── ADR-0006-reward-algorithm-dataset-synthesis.md
│ │ ├── ADR-0007-mandatory-syntactic-validation-layer.md
│ │ ├── ADR-0008-adaptive-query-routing-intent-history.md
│ │ ├── ADR-0009-per-type-response-validation.md
│ │ └── ADR-0010-classifier-continuous-retraining.md
│ └── product/ # Product Requirements Documents
│ ├── PRD-0001-openai-compatible-proxy.md
│ └── PRD-0002-editor-context-injection.md
@ -113,6 +119,11 @@ graph TD
│ │ ├── embeddings.py # OllamaEmbeddings adapter (Chonkie-compatible)
│ │ └── prompts.py # Prompt templates for pipeline LLM calls
│ │
│ ├── classifier/ # [PIPELINE C] Classifier retraining pipeline
│ │ ├── retrain_pipeline.py # Champion/Challenger training, evaluation & promotion
│ │ ├── seed_classifier_dataset.jsonl # 204 hand-crafted bilingual seed examples
│ │ └── README.md # Classifier pipeline reference
│ │
│ └── ingestion/ # [PIPELINE B] AVAP-native classic ingestion
│ ├── avap_chunker.py # Custom AVAP lexer + chunker (MinHash dedup, overlaps)
│ ├── avap_ingestor.py # Async ES ingestor with DLQ (producer/consumer pattern)
@ -343,6 +354,7 @@ HF_TOKEN=hf_...
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
ANTHROPIC_API_KEY=sk-ant-...
ANTHROPIC_MODEL=claude-sonnet-4-20250514
AVAP_PARSER_URL=http://host.docker.internal:8888
```
| Variable | Required | Description | Example |
@ -366,6 +378,7 @@ ANTHROPIC_MODEL=claude-sonnet-4-20250514
| `HF_EMB_MODEL_NAME` | Yes | HuggingFace embeddings model name | `Qwen/Qwen3-Embedding-0.6B` |
| `ANTHROPIC_API_KEY` | Yes* | Anthropic API key — required for the `EvaluateRAG` endpoint | `sk-ant-...` |
| `ANTHROPIC_MODEL` | No | Claude model used by the RAG evaluation suite | `claude-sonnet-4-20250514` |
| `AVAP_PARSER_URL` | No | AVAP parser REST API base URL — used by PTVL for code execution and assertion validation | `http://host.docker.internal:8888` |
> Never commit real secret values. Use placeholder values when sharing configuration examples.
@ -663,7 +676,7 @@ For the full set of contribution standards, see [CONTRIBUTING.md](./CONTRIBUTING
| [docs/API_REFERENCE.md](./docs/API_REFERENCE.md) | Complete gRPC API contract, message types, client examples |
| [docs/RUNBOOK.md](./docs/RUNBOOK.md) | Operational playbooks, health checks, incident response |
| [docs/AVAP_CHUNKER_CONFIG.md](./docs/AVAP_CHUNKER_CONFIG.md) | `avap_config.json` reference — blocks, statements, semantic tags, how to extend |
| [docs/ADR/](./docs/ADR/) | Architecture Decision Records |
| [docs/ADR/](./docs/ADR/) | Architecture Decision Records (ADR-0001 through ADR-0010) |
| [docs/product/](./docs/product/) | Product Requirements Documents |
| [research/](./research/) | Experiment results, benchmarks, and datasets |

View File

@ -3,6 +3,33 @@
All notable changes to the **Brunix Assistance Engine** will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
---
## [1.7.0] - 2026-04-13
### Added
- ENGINE: Added `generate_tests` LangGraph node — after `generate_code`, invokes the LLM with `TEST_GENERATION_PROMPT` to produce `test_inputs` (variable dict) and `test_list` (regex assertion array) used to validate generated code at runtime. Runs under a 15-second `ThreadPoolExecutor` timeout to prevent blocking.
- ENGINE: Added `TEST_GENERATION_PROMPT` to `prompts.py` — generates structured JSON with `test_inputs` and `test_list` from the user request and the generated AVAP code. Assertions use `re.match(r'<pattern>', str(<var>))` format evaluated after code execution.
- ENGINE: Added `_strip_thinking()` utility in `graph.py` — strips `<think>...</think>` blocks and orphaned `</think>` tags from qwen3 thinking-mode output before code extraction or test parsing.
- ENGINE: Upgraded AVAP parser integration from `/parse` (AST-only) to `/api/v1/run` (execution + assertion validation). Payload now includes `test_inputs` and `test_list` from state. Falls back to `/parse` on HTTP 404.
- ENGINE: Added parser response logging (`[ptvl] parser response: ...`) for observability of execution and assertion outcomes.
- DATA: Expanded classifier seed dataset (`seed_classifier_dataset.jsonl`) from 95 to 204 examples. Added 100 Spanish-language examples covering all four intent categories, with emphasis on interrogative `CODE_GENERATION` patterns (`como seria`, `como haria`, `puedes escribir`, `muéstrame`, `necesito`).
### Changed
- ENGINE: `CODE_GENERATION_PROMPT` — added rule 7 suppressing `registerEndpoint` unless the user explicitly asks to configure, register, or set up an endpoint. Moved `registerEndpoint` syntax reference to end of `<avap_syntax_reminder>` with conditional comment.
- ENGINE: `AgentState` — added `test_inputs: dict` and `test_list: list` fields to carry generated test data between `generate_tests` and `validate_code` nodes.
- ENGINE: LangGraph `build_graph` wiring updated: `generate_code → generate_tests → validate_code` (was `generate_code → validate_code`).
- ENGINE: `_call_parser()` signature extended — accepts `test_inputs` and `test_list` params, passes them as JSON payload to `/api/v1/run`. Parser payload key changed from `variables` to `test_inputs`.
- ENGINE: `generate_code_retry` feedback message updated to reference runtime execution errors (not just syntax errors).
- DOCS: Updated `docs/ADR/ADR-0009-per-type-response-validation.md` — full rewrite of Decision section with three-level validation flow, `generate_tests` node documentation, parser protocol (primary + fallback), `_strip_thinking()` utility, updated `AgentState` fields, and updated consequences.
- DOCS: Updated `docs/ARCHITECTURE.md` — version 1.7.x, related ADRs, component inventory with `generate_tests`, updated `build_graph` flowchart, RC-07/RC-08 routing contract entries, all PTVL `AgentState` fields.
### Fixed
- ENGINE: Fixed qwen3 thinking mode leaking `<think>...</think>` and `</think>` tags into generated code and test output — resolved by `_strip_thinking()` applied before code extraction and JSON parsing.
- ENGINE: Fixed `KeyError: 'test_inputs'` — parser payload was using key `variables`; updated to `test_inputs` to match the `/api/v1/run` contract.
- ENGINE: Fixed `generate_tests` silently skipping — `TEST_GENERATION_PROMPT` contained literal `{` and `}` in example JSON which Python `.format()` interpreted as placeholders. Fixed by escaping all literal braces as `{{` and `}}`.
- ENGINE: Fixed classifier misclassifying Spanish interrogative code requests (`como seria un API...`) as `RETRIEVAL` — root cause was English-only seed dataset with no interrogative training examples. Fixed by expanding seed dataset with bilingual examples.
---
## [1.6.2] - 2026-03-26
### Changed
- RESEARCH: updated `embeddings/Embedding model selection.pdf`.

BIN
docs/.DS_Store vendored

Binary file not shown.

View File

@ -112,7 +112,9 @@ Establish the **Mandatory Syntactic Validation Layer (MSVL)** as a non-optional
### 1. Parser integration in `EvaluateRAG`
Every code block in a generated response must be submitted to the AVAP Parser via gRPC before RAGAS scoring. The parser returns a binary result: `VALID` or `INVALID` with a failure category (`unknown_token`, `unexpected_construct`, `foreign_keyword`, `syntax_error`).
Every code block in a generated response must be submitted to the AVAP Parser before RAGAS scoring. The parser returns a binary result: `VALID` or `INVALID`.
**Implementation note (2026-04-12):** the AVAP Parser exposes a **REST HTTP API** (Tornado, port 8888), not gRPC as originally anticipated. The call contract is `POST /parse` with body `{"code": "..."}`, returning `{"valid": true/false, "error": "..."}`. The production PTVL (ADR-0009) uses this REST interface. The evaluation pipeline integration should use the same interface.
### 2. `syntactic_validity` as an independent metric
@ -132,7 +134,7 @@ final_answer_relevancy(entry) =
### 3. Parser SLA and fallback policy
The AVAP Parser gRPC service must respond within 2 seconds per call. If the parser is unreachable or times out, the evaluation run is **aborted** with an explicit error. Silent fallback to RAGAS-only scoring is prohibited.
The AVAP Parser REST service must respond within 2 seconds per call. If the parser is unreachable or times out, the evaluation run is **aborted** with an explicit error. Silent fallback to RAGAS-only scoring is prohibited.
```python
if parser_status == UNAVAILABLE:

View File

@ -1,7 +1,8 @@
# ADR-0009: Per-Type Response Validation Layer
**Date:** 2026-04-10
**Status:** Accepted
**Last updated:** 2026-04-13
**Status:** Implemented
**Deciders:** Rafael Ruiz (CTO)
**Related ADRs:** ADR-0007 (MSVL for RAG Evaluation), ADR-0008 (Adaptive Query Routing), ADR-0003 (Hybrid Retrieval RRF)
@ -47,14 +48,20 @@ Add a **Per-Type Response Validation Layer (PTVL)** to the production LangGraph
| Type | When | What | Mechanism |
|---|---|---|---|
| `CODE_GENERATION` | Post-generation | Syntactic validity of generated AVAP code | AVAP Parser gRPC — deterministic |
| `CODE_GENERATION` | Post-generation | Syntax + execution + assertion correctness | AVAP Parser REST HTTP — `/api/v1/execute` with `/parse` fallback |
| `RETRIEVAL` | Pre-generation | Relevance of retrieved context to the query | LLM relevance check — `CONFIDENCE_PROMPT_TEMPLATE` |
| `CONVERSATIONAL` | None | — | No retrieval, no code generated |
| `PLATFORM` | None | — | No retrieval, no code generated |
---
### Decision 1 — CODE_GENERATION: parser validation with trace-guided retry
### Decision 1 — CODE_GENERATION: three-level validation with trace-guided retry
Validation operates at three levels in order of depth:
1. **Syntax** — can the code be parsed into a valid AST?
2. **Execution** — does the code run without runtime errors?
3. **Assertions** — does the code produce the expected output?
#### Flow
@ -62,46 +69,141 @@ Add a **Per-Type Response Validation Layer (PTVL)** to the production LangGraph
generate_code node
[V1] AVAP Parser gRPC
generate_tests node
[LLM generates test_inputs + test_list from code + user request]
├── VALID ──────────────────────────────► return response
validate_code node
└── INVALID + line-by-line trace
[inject trace into retry prompt]
generate_code_retry node (1 attempt only)
[V2] AVAP Parser gRPC
├── VALID ──────────────────────► return response
└── INVALID ────────────────────► return response + validation_status flag
[V1] AVAP Parser POST /api/v1/run
{"code": "...", "test_inputs": {...}, "test_list": [...]}
├── success=true + assertion_result=true ──► return response
├── success=false (runtime error) ──────────┐
│ │
└── assertion_result=false ─────────────────┤
[inject error into retry prompt]
generate_code_retry node (1 attempt only)
validate_code_after_retry
[V2] AVAP Parser /api/v1/run
├── VALID ──────► return response
└── INVALID ────► return response + validation_status flag
```
**Fallback path:** If `/api/v1/run` returns 404 (endpoint not yet deployed), validation falls back to `POST /parse` for AST-only syntax checking. The fallback is transparent — no configuration change required.
#### generate_tests node
Before validation, a dedicated LLM call generates test cases from the user's original request and the generated code:
```python
# Input to TEST_GENERATION_PROMPT
user_request: "como seria una api que reciba un parametro y lo devuelva?"
generated_code: "addParam(\"client_id\", id_interno)\naddResult(id_interno)"
# Output
{
"test_inputs": {"client_id": "12345"},
"test_list": ["re.match(r'^\\d{5}$', str(id_interno))"]
}
```
`test_inputs` are injected as request variables when the parser executes the code. `test_list` items are regex assertions evaluated against the output variables after execution.
If the LLM call fails or times out (15s hard limit), `generate_tests` returns empty `test_inputs` and `test_list` — validation continues using execution-only (no assertions). This keeps the node non-blocking.
#### Trace-guided retry
The parser trace is injected into the generation prompt as a structured correction context:
The parser error is injected into the retry prompt as a structured correction context:
```
<parser_feedback>
The previous attempt produced invalid AVAP code. Specific failures:
The previous attempt failed when the AVAP code was executed.
The execution engine reported the following error:
Line 3: unknown command 'getSHA256' — expected known identifier
Line 7: unexpected construct 'for i in range(...)' — AVAP loop syntax required
[error from /api/v1/execute logs]
Correct these errors. Do not repeat the same constructs.
Rules to fix it:
- Read the error carefully — it identifies the exact command or line that failed.
- Do NOT repeat the same construct that caused the error.
- Only use commands from <avap_syntax_reminder> or <context>.
- If the error mentions an unknown command, replace it with the correct AVAP equivalent.
- If the error mentions a variable, make sure it is declared before use.
</parser_feedback>
```
This is not a blind retry. The LLM receives the exact failure points and can target its corrections. ADR-0007 documented the mapping between common hallucinated commands and their valid AVAP equivalents (`getSHA256` → `encodeSHA256`, `returnResult``addResult`, etc.) — the trace makes these corrections automatic without hardcoding the mapping.
This is not a blind retry. The LLM receives the exact runtime failure and can target its corrections.
#### qwen3 thinking stripping
The engine strips `<think>...</think>` blocks from all LLM outputs before passing code to the parser. `qwen3` models emit thinking content by default; without stripping, the parser receives the full response including reasoning text and fails to find valid AVAP constructs.
```python
def _strip_thinking(text: str) -> str:
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
text = re.sub(r"</think>", "", text) # lone closing tag
return text.strip()
```
This runs before code block extraction in `_extract_avap_code` and before JSON parsing in `generate_tests`.
#### Parser protocol
The AVAP Parser exposes a **REST HTTP API** (Tornado, port 8888). Two endpoints are used:
**Primary — `/api/v1/run`** (execution + assertions):
```
POST {AVAP_PARSER_URL}/api/v1/run
Content-Type: application/json
{
"code": "<AVAP code>",
"test_inputs": {"param": "value"},
"test_list": ["re.match(r'^pattern$', str(variable))"]
}
```
Response:
```json
{
"success": true,
"result": [...],
"variables": {"param": "value", ...},
"assertion_result": true,
"logs": [{"command": "...", "duration_ms": 1.2, "success": true}]
}
```
**Fallback — `/parse`** (AST-only, when `/api/v1/run` returns 404):
```
POST {AVAP_PARSER_URL}/parse
Content-Type: application/json
{"code": "<AVAP code>"}
```
Response:
```json
{"valid": true, "ast": {...}}
{"valid": false, "error": "..."}
```
Code is extracted from the LLM response by scanning for the first markdown code block (` ```avap ` or generic ` ``` `). `<think>` content is stripped before extraction. The fence markers are stripped before sending to the parser.
#### Parser SLA
Inherited from ADR-0007: ≤2 seconds per call. **Silent fallback is permitted in production** (unlike evaluation, where ADR-0007 mandates abort). The distinction is that evaluation scores must be trustworthy; production responses degrade gracefully.
≤2 seconds per call (`AVAP_PARSER_TIMEOUT`). **Silent fallback is permitted in production** (unlike evaluation, where ADR-0007 mandates abort). The distinction is that evaluation scores must be trustworthy; production responses degrade gracefully.
#### Parser availability — circuit breaker
@ -135,8 +237,8 @@ Setting `AVAP_PARSER_TIMEOUT=0` permanently opens the circuit — disables parse
#### New environment variables
```
AVAP_PARSER_URL=grpc://... # URL of AVAP Parser gRPC service
AVAP_PARSER_TIMEOUT=2 # seconds per call; 0 = disable validation
AVAP_PARSER_URL=http://... # URL of AVAP Parser REST service (e.g. http://45.77.193.144:8888)
AVAP_PARSER_TIMEOUT=2 # seconds per call; 0 = disable validation entirely
PARSER_CB_THRESHOLD=3 # consecutive failures before circuit opens
PARSER_CB_COOLDOWN=30 # seconds before circuit attempts half-open probe
```
@ -189,16 +291,85 @@ Reformulate this query using broader terms or alternative phrasing.
---
---
### Decision 3 — AskAgentStream: streaming state machine for CODE_GENERATION
The `AskAgentStream` path streams tokens directly to the client. Post-generation validation (Decision 1) cannot run here because tokens are already yielded before the response is complete.
**Decision:** implement a **streaming state machine** that operates inline on the token stream. The machine has two states: TEXT and CODE.
#### Flow
```
LLM token stream
STATE: TEXT
→ yield token to client immediately
→ detect ``` fence in lookahead buffer (2-char safety window for split tokens)
│ ``` detected
STATE: CODE (buffering — nothing yielded to client)
→ accumulate tokens in code_buffer
→ detect closing ``` after first newline
│ closing ``` detected
_call_parser(complete_block)
├── VALID ──────────────────────────────► yield code block → back to TEXT
├── INVALID
│ │
│ ▼
│ LLM fix call (fix only the code block, not the full response)
│ "Fix this AVAP code: {trace}"
│ │
│ ▼
│ _call_parser(fixed_block)
│ ├── VALID ──────────────────────► yield fixed block → back to TEXT
│ ├── INVALID ────────────────────► yield fixed block + INVALID_UNRESOLVED → back to TEXT
│ └── UNAVAILABLE ────────────────► yield fixed block + PARSER_UNAVAILABLE → back to TEXT
└── UNAVAILABLE ────────────────────────► yield block as-is + PARSER_UNAVAILABLE → back to TEXT
```
#### Key properties
- Text before and after the code block streams to the client without delay.
- Only the code block itself introduces latency (one parser call, optionally one LLM fix call + one parser call).
- The fix call asks the LLM to correct **only the code block**, not the full response — the text already streamed to the client remains valid.
- If the response contains multiple code blocks, each block is processed independently in sequence. `validation_status` in the final `is_final=True` message reflects the last block validated.
- If the stream ends while still in CODE mode (malformed response without closing fence), the buffer is flushed as-is.
#### Difference from AskAgent path
| Property | AskAgent | AskAgentStream |
|---|---|---|
| Validation point | Post-generation, pre-delivery | Inline, at code block boundary |
| Retry mechanism | Full `generate_code_retry` LangGraph node | Targeted LLM fix call (code block only) |
| Text streaming | N/A (non-streaming) | Uninterrupted |
| `validation_status` delivery | In `AgentResponse` (only response) | In final `AgentResponse` (`is_final=True`) |
---
## Graph changes
### New nodes
| Node | Graph | Trigger |
|---|---|---|
| `validate_code` | `build_graph` | After `generate_code` |
| `generate_tests` | `build_graph` | After `generate_code` — generates `test_inputs` + `test_list` |
| `validate_code` | `build_graph` | After `generate_tests` |
| `generate_code_retry` | `build_graph` | After `validate_code` when INVALID |
| `check_context_relevance` | `build_graph` + `build_prepare_graph` | After `retrieve`, before `generate` (RETRIEVAL only) |
| `validate_code_after_retry` | `build_graph` | After `generate_code_retry` |
| `check_context_relevance` | `build_graph` + `build_prepare_graph` | After `retrieve`, RETRIEVAL only |
| `reformulate_with_hint` | `build_graph` + `build_prepare_graph` | After `check_context_relevance` when NO |
| `retrieve_retry` | `build_graph` + `build_prepare_graph` | After `reformulate_with_hint` |
### AskAgentStream
No new LangGraph nodes. The validation logic runs as a **streaming state machine** inline in `server.py:AskAgentStream`. See Decision 3.
### Updated flow — `build_graph`
@ -221,10 +392,11 @@ flowchart TD
RH --> RT2[retrieve retry]
RT2 --> GE
GC --> VC{validate_code\nParser gRPC}
VC -->|VALID| END([end])
VC -->|INVALID + trace| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code\nParser gRPC}
GC --> GT[generate_tests\nLLM → test_inputs + test_list]
GT --> VC{validate_code\n/api/v1/execute}
VC -->|VALID + assertions pass| END([end])
VC -->|runtime error or assertion fail| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code_after_retry\n/api/v1/execute}
VC2 -->|VALID| END
VC2 -->|INVALID| END
@ -241,9 +413,12 @@ flowchart TD
class AgentState(TypedDict):
...
# PTVL fields
parser_trace: str # raw parser trace from first validation attempt (empty if valid)
parser_trace: str # raw parser error from first validation attempt (empty if valid)
validation_status: str # see validation status values below
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
# Test generation fields (set by generate_tests node)
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables after execution
```
### Validation status values
@ -267,6 +442,8 @@ message AgentResponse {
}
```
**Implementation status: complete (2026-04-12).** Field 4 added to `brunix.proto`. Populated in both `AskAgent` (from `final_state`) and `AskAgentStream` (in the `is_final=True` message at end of stream).
Clients that do not read `validation_status` are unaffected — the field defaults to empty string.
---
@ -287,7 +464,7 @@ A `CODE_GENERATION` response returned without parser validation due to parser un
### RC-08 — Retry budget (priority: medium)
Each request has a maximum of **1 retry** regardless of type. A `CODE_GENERATION` request that fails parser validation twice returns the second attempt with `validation_status=true`. A `RETRIEVAL` request whose context is insufficient reformulates once and generates unconditionally on the second retrieval.
Each request has a maximum of **1 retry** regardless of type. A `CODE_GENERATION` request that fails parser validation twice returns the second attempt with `validation_status=INVALID_UNRESOLVED`. A `RETRIEVAL` request whose context is insufficient reformulates once and generates unconditionally on the second retrieval.
No request may enter more than one retry cycle.
@ -298,25 +475,31 @@ No request may enter more than one retry cycle.
### Positive
- Syntactically invalid AVAP code no longer reaches users silently. `validation_status` gives the client a typed signal: `INVALID_UNRESOLVED` (evidence of bad code) vs `PARSER_UNAVAILABLE` (no evidence either way) — clients can respond differently to each.
- The parser trace makes retries targeted rather than blind — the LLM corrects specific lines, not the whole response.
- Execution validation (`/api/v1/execute`) catches runtime errors invisible to the AST parser — undefined variables, unsupported commands, type mismatches.
- Assertion validation verifies the code produces the expected output, not just that it runs.
- The parser error trace makes retries targeted rather than blind — the LLM corrects specific runtime failures, not the whole response.
- Circuit breaker prevents parser outages from adding latency to every `CODE_GENERATION` request. After 3 consecutive failures the engine stops trying for 30 seconds.
- Context relevance check catches retrievals that return topically adjacent but non-answering chunks, reducing fluent-but-ungrounded responses.
- `AVAP_PARSER_TIMEOUT=0` allows development without the parser service — no hard dependency at startup.
- Automatic fallback from `/api/v1/execute` to `/parse` — new parser endpoint can be deployed without a coordinated engine restart.
### Negative / Trade-offs
- **`CODE_GENERATION` latency**: +1 parser gRPC call per request (~50–200ms for valid code). +1 LLM generation call + 1 parser call on invalid code (~1–2s additional).
- **`CODE_GENERATION` latency**: +1 `generate_tests` LLM call + 1 parser execution call per request. If `generate_tests` fails (timeout/error), the node returns empty tests and validation continues — no blocking.
- **`RETRIEVAL` latency**: +1 LLM call (relevance check) on every request. At `qwen3:1.7b` local inference, this adds ~300–500ms to every RETRIEVAL request — not negligible.
- The parser becomes a **soft production dependency** for CODE_GENERATION. Parser outages degrade validation silently; monitoring must alert on sustained `parser unavailable` log volume.
- The context relevance check is a **generative model doing a binary classification task** — the same architectural mismatch noted in ADR-0008 for the classifier. It is the correct interim solution while no discriminative relevance model exists.
- **`registerEndpoint` suppressed by default** — `CODE_GENERATION_PROMPT` rule 7 omits `registerEndpoint` unless the user explicitly requests endpoint registration. This prevents over-engineering simple code responses but requires the user to be explicit when endpoint scaffolding is needed.
### Open questions
1. **`RETRIEVAL` latency budget**: The +300–500ms from the relevance LLM call may be unacceptable for the VS Code extension use case where streaming latency is user-visible. A discriminative relevance model (embedding similarity between query vector and context vector, cosine threshold) would be ~1ms and eliminate this cost entirely. Deferred to a future amendment.
1. **`RETRIEVAL` latency budget**: The +300–500ms from the relevance LLM call may be unacceptable for the VS Code extension use case where streaming latency is user-visible. A discriminative relevance model (embedding similarity between query vector and context vector, cosine threshold) would be ~1ms and eliminate this cost entirely. Deferred to a future amendment. **Trigger:** when the RETRIEVAL validation LLM call appears as a measurable contribution in Langfuse traces.
2. **`validation_status` UX**: The proto field is defined but the client behavior is not specified. What should the VS Code extension or AVS Platform display when `validation_status=true`? Requires a product decision outside this ADR's scope.
2. **`validation_status` UX**: Proto field 4 is defined and populated. What the VS Code extension or AVS Platform displays when `validation_status=INVALID_UNRESOLVED` or `PARSER_UNAVAILABLE` is not yet specified. Requires a product decision outside this ADR's scope. **Open.**
3. **Parser version pinning**: Inherited from ADR-0007 open question 2. Parser upgrades may alter what is considered valid AVAP. A policy for handling parser version changes in the production pipeline has not been defined.
3. **Parser version pinning**: Inherited from ADR-0007 open question 2. Parser upgrades may alter what is considered valid AVAP. A policy for handling parser version changes in the production pipeline has not been defined. **Open.**
4. **`validation_status` across multiple code blocks (AskAgentStream)**: When a response contains more than one code block, the `validation_status` in the final `is_final=True` message reflects only the last block validated. If an earlier block was `INVALID_UNRESOLVED` and the last was valid, the client receives `""`. In practice AVAP responses contain one code block. A future amendment may accumulate the worst-case status across all blocks. **Low priority.**
---

View File

@ -9,7 +9,7 @@
## Context
ADR-0008 Phase 2 deployed a Layer 2 embedding classifier trained on a **seed dataset of 94 hand-crafted examples**. This model works well for the initial distribution of queries but has two structural limitations:
ADR-0008 Phase 2 deployed a Layer 2 embedding classifier trained on a **seed dataset of 204 hand-crafted examples** (initially 95; expanded on 2026-04-13 with 109 Spanish-language examples — making the seed bilingual — covering all four intent categories and interrogative `CODE_GENERATION` patterns). This model works well for the initial distribution of queries but has two structural limitations:
1. **The seed dataset does not reflect production traffic.** Hand-crafted examples are idealized. Real users ask questions with typos, mixed languages, ambiguous phrasing, and domain-specific vocabulary that is not in the seed.
@ -121,9 +121,9 @@ Each retraining cycle merges the seed dataset with all accumulated production ex
```mermaid
flowchart LR
T0["Cycle 0\n94 seed examples\nCV 1.0 on seed"] -->
T1["Cycle 1\n94 + ~100 production\nreal query distribution"] -->
T2["Cycle 2\n94 + ~200 production\nincreasing coverage"] -->
T0["Cycle 0\n204 seed examples\nCV 1.0 on seed"] -->
T1["Cycle 1\n204 + ~100 production\nreal query distribution"] -->
T2["Cycle 2\n204 + ~200 production\nincreasing coverage"] -->
TN["Cycle N\nseed becomes minority\nmodel reflects production traffic"]
```

View File

@ -1,10 +1,10 @@
# Brunix Assistance Engine — Architecture Reference
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
> **Last updated:** 2026-04-10
> **Version:** 1.9.x
> **Last updated:** 2026-04-13
> **Version:** 1.7.x
> **Architect:** Rafael Ruiz (CTO, 101OBEX Corp)
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008 · ADR-0009 · ADR-0010
> **Related PRDs:** PRD-0001 · PRD-0002 · PRD-0003
---
@ -116,15 +116,16 @@ Langfuse is the exception — it has a public IP (`45.77.119.180`) and is access
|---|---|---|
| gRPC server | `server.py` | Entry point for all AI requests. Manages session store, model selection, and state initialization |
| HTTP proxy | `openai_proxy.py` | OpenAI + Ollama compatible HTTP layer. Translates REST → gRPC |
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph. Hosts L1, L2, and L3 classifier layers |
| Prompt definitions | `prompts.py` | All prompt templates in one place: classifier, reformulator, generators, platform |
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph. Hosts L1, L2, and L3 classifier layers. Implements PTVL (ADR-0009) |
| Prompt definitions | `prompts.py` | All prompt templates: classifier, reformulator, generators, platform, test generation |
| Agent state | `state.py` | `AgentState` TypedDict shared across all graph nodes |
| LLM factory | `utils/llm_factory.py` | Provider-agnostic model instantiation (Ollama, OpenAI, Anthropic, Bedrock) |
| Embedding factory | `utils/emb_factory.py` | Provider-agnostic embedding model instantiation |
| Classifier export | `utils/classifier_export.py` | Exports `classify_history_store` to labeled JSONL when threshold is reached. Data flywheel for Layer 2 retraining |
| Evaluation pipeline | `evaluate.py` | RAGAS evaluation with Claude as judge |
| Proto contract | `protos/brunix.proto` | Source of truth for the gRPC API |
| Classifier training | `scripts/pipelines/classifier/train_classifier.py` | Offline script. Embeds labeled queries with bge-m3, trains LogisticRegression, serializes model |
| Classifier training | `scripts/pipelines/classifier/retrain_pipeline.py` | Champion/Challenger retraining. Embeds queries with bge-m3, cross-validates, promotes if challenger ≥ champion |
| Classifier seed dataset | `scripts/pipelines/classifier/seed_classifier_dataset.jsonl` | 204 labeled examples across 4 categories. Bilingual (EN + ES). Anchors all retraining runs |
**Model slots:**
@ -255,6 +256,8 @@ The classifier does not receive raw conversation messages. It receives a compact
| RC-04 | `PLATFORM` and `CONVERSATIONAL` never touch Elasticsearch | Medium |
| RC-05 | `RETRIEVAL`/`CODE_GENERATION` → main model; `CONVERSATIONAL`/`PLATFORM` → conversational model | Medium |
| RC-06 | Intent history capped at 6 entries | Low |
| RC-07 | Every `CODE_GENERATION` response must be validated by parser before delivery (ADR-0009) | High |
| RC-08 | Maximum 1 retry per request regardless of type (ADR-0009) | Medium |
---
@ -272,10 +275,22 @@ flowchart TD
CL -->|CONVERSATIONAL| RC[respond_conversational]
CL -->|PLATFORM| RP[respond_platform]
RF --> RT[retrieve]
RT -->|RETRIEVAL| GE[generate]
RT -->|RETRIEVAL| CR{check_context\nrelevance}
CR -->|YES| GE[generate]
CR -->|NO| RH[reformulate_with_hint]
RH --> RT2[retrieve_retry]
RT2 --> GE
RT -->|CODE_GENERATION| GC[generate_code]
GE --> END([end])
GC --> END
GC --> GT[generate_tests\nLLM → test_inputs + test_list]
GT --> VC{validate_code\n/api/v1/execute}
VC -->|VALID| END([end])
VC -->|INVALID| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code_after_retry}
VC2 --> END
GE --> END
RC --> END
RP --> END
```
@ -468,6 +483,13 @@ class AgentState(TypedDict):
extra_context: str # base64 decoded
user_info: str # JSON: {dev_id, project_id, org_id}
use_editor_context: bool # set by classifier
# PTVL — Per-Type Validation Layer (ADR-0009)
parser_trace: str # runtime error from first validation (empty if valid)
validation_status: str # "" | "INVALID_UNRESOLVED" | "PARSER_UNAVAILABLE"
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables
```
---

BIN
scripts/.DS_Store vendored

Binary file not shown.

Binary file not shown.

View File

@ -93,3 +93,112 @@
{"query": "What is my current monthly spend on AVAP Cloud?", "type": "PLATFORM"}
{"query": "How do I add more API capacity to my plan?", "type": "PLATFORM"}
{"query": "Your project usage percentage is critically high at 98%", "type": "PLATFORM"}
{"query": "Write an AVAP script that reads a URL parameter and returns it in the response", "type": "CODE_GENERATION"}
{"query": "Generate AVAP code that receives a number and returns its square", "type": "CODE_GENERATION"}
{"query": "Create an AVAP endpoint that accepts a name parameter and returns a greeting", "type": "CODE_GENERATION"}
{"query": "Write AVAP code that checks if a variable is null and returns an error", "type": "CODE_GENERATION"}
{"query": "Generate an AVAP function that concatenates two string parameters", "type": "CODE_GENERATION"}
{"query": "como seria una api que reciba un parametro y lo devuelva como respuesta?", "type": "CODE_GENERATION"}
{"query": "como haria un script en AVAP que llame a una base de datos?", "type": "CODE_GENERATION"}
{"query": "puedes escribir un endpoint que reciba un JSON y devuelva un campo especifico?", "type": "CODE_GENERATION"}
{"query": "muéstrame como hacer una api que sume dos numeros", "type": "CODE_GENERATION"}
{"query": "necesito un script AVAP que lea un parametro de la URL", "type": "CODE_GENERATION"}
{"query": "como se hace un endpoint en AVAP que devuelva un array?", "type": "CODE_GENERATION"}
{"query": "dame el codigo para hacer una llamada HTTP externa desde AVAP", "type": "CODE_GENERATION"}
{"query": "como implementaria un loop que recorra una lista en AVAP?", "type": "CODE_GENERATION"}
{"query": "podrias generar un ejemplo de manejo de errores en AVAP?", "type": "CODE_GENERATION"}
{"query": "como seria el codigo para validar que un parametro no este vacio?", "type": "CODE_GENERATION"}
{"query": "necesito que me hagas un script que devuelva hola mundo", "type": "CODE_GENERATION"}
{"query": "puedes hacer un endpoint POST que guarde datos en la base de datos?", "type": "CODE_GENERATION"}
{"query": "como haria para recibir varios parametros y devolver uno procesado?", "type": "CODE_GENERATION"}
{"query": "dame un ejemplo de codigo AVAP que use addParam y addResult", "type": "CODE_GENERATION"}
{"query": "como se implementa un try catch en AVAP? muéstrame el codigo", "type": "CODE_GENERATION"}
{"query": "escríbeme una funcion AVAP que calcule el total de una lista", "type": "CODE_GENERATION"}
{"query": "como quedaria una api que reciba nombre y apellido y devuelva el nombre completo?", "type": "CODE_GENERATION"}
{"query": "genera un endpoint AVAP que devuelva el status 404 si no encuentra el recurso", "type": "CODE_GENERATION"}
{"query": "como haria una api que haga una peticion GET a otro servicio?", "type": "CODE_GENERATION"}
{"query": "puedes generarme codigo AVAP que itere sobre un JSON?", "type": "CODE_GENERATION"}
{"query": "como seria un script que asigne variables y las devuelva en la respuesta?", "type": "CODE_GENERATION"}
{"query": "necesito codigo que reciba un id y devuelva un objeto con ese id", "type": "CODE_GENERATION"}
{"query": "como implemento un condicional en AVAP? ponme un ejemplo", "type": "CODE_GENERATION"}
{"query": "podrias escribir un endpoint que concatene dos parametros?", "type": "CODE_GENERATION"}
{"query": "como haria para devolver un error personalizado desde AVAP?", "type": "CODE_GENERATION"}
{"query": "que seria el codigo minimo para crear un endpoint GET en AVAP?", "type": "CODE_GENERATION"}
{"query": "muéstrame un script AVAP que use un bucle para procesar una lista", "type": "CODE_GENERATION"}
{"query": "como quedaria un API que reciba un token y lo valide?", "type": "CODE_GENERATION"}
{"query": "genera el codigo para un endpoint que devuelva la fecha actual", "type": "CODE_GENERATION"}
{"query": "como se escribe una funcion reutilizable en AVAP?", "type": "CODE_GENERATION"}
{"query": "que es addVar en AVAP?", "type": "RETRIEVAL"}
{"query": "para que sirve addResult en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el comando addParam?", "type": "RETRIEVAL"}
{"query": "cual es la diferencia entre addVar y addObject en AVAP?", "type": "RETRIEVAL"}
{"query": "que hace startLoop en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el manejo de errores en AVAP?", "type": "RETRIEVAL"}
{"query": "que es un conector en AVAP Cloud?", "type": "RETRIEVAL"}
{"query": "cual es el modelo de ejecucion de AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el enrutamiento en AVAP?", "type": "RETRIEVAL"}
{"query": "para que se usa el nodo de pipeline en AVAP?", "type": "RETRIEVAL"}
{"query": "que tipos de datos soporta AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el if en AVAP? cuantos modos tiene?", "type": "RETRIEVAL"}
{"query": "que diferencia hay entre addRow y addObject?", "type": "RETRIEVAL"}
{"query": "como se definen funciones reutilizables en AVAP?", "type": "RETRIEVAL"}
{"query": "que es el catalogo en AVAP Cloud?", "type": "RETRIEVAL"}
{"query": "como se gestionan las variables de entorno en AVAP?", "type": "RETRIEVAL"}
{"query": "para que sirve el nodo return en un pipeline?", "type": "RETRIEVAL"}
{"query": "como maneja AVAP las transformaciones JSON?", "type": "RETRIEVAL"}
{"query": "que opciones de logging tiene AVAP?", "type": "RETRIEVAL"}
{"query": "como se conecta AVAP a una API REST externa?", "type": "RETRIEVAL"}
{"query": "que es una API virtual en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona la autenticacion en las llamadas de AVAP?", "type": "RETRIEVAL"}
{"query": "como se pagina resultados en una API AVAP?", "type": "RETRIEVAL"}
{"query": "que es endLoop y cuando se usa?", "type": "RETRIEVAL"}
{"query": "como funciona el exception en AVAP?", "type": "RETRIEVAL"}
{"query": "no entendi lo que dijiste", "type": "CONVERSATIONAL"}
{"query": "puedes explicarlo de otra manera?", "type": "CONVERSATIONAL"}
{"query": "eso no es lo que pregunte", "type": "CONVERSATIONAL"}
{"query": "en pocas palabras, que quisiste decir?", "type": "CONVERSATIONAL"}
{"query": "puedes resumirlo en un parrafo?", "type": "CONVERSATIONAL"}
{"query": "repite lo que dijiste pero mas claro", "type": "CONVERSATIONAL"}
{"query": "no entendi la segunda parte", "type": "CONVERSATIONAL"}
{"query": "puedes darme una analogia?", "type": "CONVERSATIONAL"}
{"query": "dimelo mas simple", "type": "CONVERSATIONAL"}
{"query": "vuelve a lo que dijiste sobre los conectores", "type": "CONVERSATIONAL"}
{"query": "dame un resumen en puntos de tu respuesta anterior", "type": "CONVERSATIONAL"}
{"query": "estas seguro de eso?", "type": "CONVERSATIONAL"}
{"query": "que quisiste decir con eso?", "type": "CONVERSATIONAL"}
{"query": "puedes ser mas especifico?", "type": "CONVERSATIONAL"}
{"query": "amplia la parte del manejo de errores", "type": "CONVERSATIONAL"}
{"query": "dilo en una sola frase", "type": "CONVERSATIONAL"}
{"query": "cual era el ejemplo que mencionaste antes?", "type": "CONVERSATIONAL"}
{"query": "no lo entiendo, intenta con otra explicacion", "type": "CONVERSATIONAL"}
{"query": "y en la practica, como se veria eso?", "type": "CONVERSATIONAL"}
{"query": "explicamelo paso a paso", "type": "CONVERSATIONAL"}
{"query": "compara las dos opciones que describiste", "type": "CONVERSATIONAL"}
{"query": "lo que dijiste al principio sobre pipelines, repitemelo", "type": "CONVERSATIONAL"}
{"query": "reformula tu respuesta enfocandote en el rendimiento", "type": "CONVERSATIONAL"}
{"query": "tengo dudas sobre lo que explicaste, puedes profundizar?", "type": "CONVERSATIONAL"}
{"query": "cuantas llamadas llevo este mes?", "type": "PLATFORM"}
{"query": "cual es mi plan actual?", "type": "PLATFORM"}
{"query": "cuando se renueva mi suscripcion?", "type": "PLATFORM"}
{"query": "cuanto he consumido de mi cuota?", "type": "PLATFORM"}
{"query": "cuantas llamadas me quedan disponibles?", "type": "PLATFORM"}
{"query": "estoy cerca del limite de mi plan?", "type": "PLATFORM"}
{"query": "que pasa si supero mi cuota mensual?", "type": "PLATFORM"}
{"query": "puedo ampliar mi plan actual?", "type": "PLATFORM"}
{"query": "muéstrame las estadisticas de uso de mi cuenta", "type": "PLATFORM"}
{"query": "cuantos proyectos puedo crear con mi plan?", "type": "PLATFORM"}
{"query": "tu proyecto tiene un 85% de uso, que recomiendas?", "type": "PLATFORM"}
{"query": "llevas 4500 llamadas API este mes de un limite de 5000", "type": "PLATFORM"}
{"query": "tu cuenta ha superado el 90% del limite mensual", "type": "PLATFORM"}
{"query": "cual es mi gasto mensual en AVAP Cloud?", "type": "PLATFORM"}
{"query": "como agrego mas capacidad a mi plan?", "type": "PLATFORM"}
{"query": "tu prueba gratuita expira en 2 dias", "type": "PLATFORM"}
{"query": "tu consumo esta semana es el doble que la semana pasada", "type": "PLATFORM"}
{"query": "estado de mi cuenta en AVAP Cloud", "type": "PLATFORM"}
{"query": "cuales son mis detalles de facturacion?", "type": "PLATFORM"}
{"query": "me queda poco de mi cuota, que hago?", "type": "PLATFORM"}
{"query": "tu proyecto ha usado el 60% de los recursos asignados", "type": "PLATFORM"}
{"query": "cuando caduca mi suscripcion actual?", "type": "PLATFORM"}
{"query": "cuanto cuesta pasar al plan profesional?", "type": "PLATFORM"}
{"query": "hay alguna alerta de uso en mi cuenta?", "type": "PLATFORM"}
{"query": "tu limite diario de API calls es de 1000 y ya llevas 950", "type": "PLATFORM"}

View File

@ -552,8 +552,9 @@ def run_map_elites(args, client, lrm, output_path):
code = problem["code"]
test_list = problem.get("test_list", [])
print("#######################################")
is_valid, ast, error_msg = validator.parse(code)
print("#######################################")
if is_valid is None:
is_valid, ast = True, {}
if call_count == 1: