ADR0008 Finished and ADR0009 finalizing

This commit is contained in:
rafa-ruiz 2026-04-13 20:27:06 -07:00
parent 0b9c19d61f
commit 6af0a84f4c
19 changed files with 630 additions and 85 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -2,4 +2,4 @@
# Do not delete.
folderID: wmoge-xmh3x
created: 2026-04-12T14:02:49-07:00
created: 2026-04-12T11:12:38-07:00

BIN
Docker/.DS_Store vendored

Binary file not shown.

View File

@ -33,7 +33,7 @@ services:
CLASSIFIER_SEED_DATASET: ${CLASSIFIER_SEED_DATASET}
CLASSIFIER_MIN_CV_ACCURACY: ${CLASSIFIER_MIN_CV_ACCURACY}
CLASSIFIER_HELD_OUT_RATIO: ${CLASSIFIER_HELD_OUT_RATIO}
PROXY_THREAD_WORKERS: 10
extra_hosts:
- "host.docker.internal:host-gateway"

View File

@ -1,3 +1,4 @@
import json as _json
import logging
import os
import re as _re
@ -28,6 +29,7 @@ from prompts import (
GENERATE_PROMPT,
PLATFORM_PROMPT,
REFORMULATE_PROMPT,
TEST_GENERATION_PROMPT,
)
from state import AgentState, ClassifyEntry
@ -36,7 +38,7 @@ logger = logging.getLogger(__name__)
# ── AVAP Parser client — ADR-0009 (PTVL) ──────────────────────────────────────
_PARSER_URL = os.getenv("AVAP_PARSER_URL", "")
_PARSER_URL = os.getenv("AVAP_PARSER_URL", "http://45.77.193.144:8888")
_PARSER_TIMEOUT = int(os.getenv("AVAP_PARSER_TIMEOUT", "2"))
_CB_THRESHOLD = int(os.getenv("PARSER_CB_THRESHOLD", "3"))
_CB_COOLDOWN = int(os.getenv("PARSER_CB_COOLDOWN", "30"))
@ -87,23 +89,39 @@ class _CircuitBreaker:
_parser_cb = _CircuitBreaker(_CB_THRESHOLD, _CB_COOLDOWN)
def _strip_thinking(text: str) -> str:
"""Remove qwen3 <think>...</think> blocks from LLM output."""
text = _re.sub(r"<think>.*?</think>", "", text, flags=_re.DOTALL)
# Also strip a lone closing tag if the model omitted the opening one
text = _re.sub(r"</think>", "", text)
return text.strip()
def _extract_avap_code(text: str) -> str:
    """Pull the first fenced AVAP code block out of an LLM response.

    Thinking tags are stripped first. Fence styles are tried from most to
    least specific; when no fence matches, the cleaned text is returned
    as-is.
    """
    cleaned = _strip_thinking(text)
    fence_patterns = (
        r'```avap\s*\n(.*?)```',
        r'```\s*\n(.*?)```',
        r'```(.*?)```',
    )
    for fence in fence_patterns:
        if (match := _re.search(fence, cleaned, _re.DOTALL)) is not None:
            return match.group(1).strip()
    return cleaned
def _call_parser(text: str) -> tuple:
def _call_parser(text: str, test_inputs: dict = None, test_list: list = None) -> tuple:
"""Call AVAP Parser REST API.
Tries /api/v1/execute first (executes the code, catches runtime errors).
If test_inputs/test_list are provided, also validates assertions.
Falls back to /parse (AST-only validation) if /api/v1/execute is not available.
Returns:
(True, "") code valid
(False, trace) code invalid, trace contains the error
(True, "") code valid and executed successfully (assertions passed if provided)
(False, trace) code invalid, runtime error, or failed assertions
(None, "") parser unavailable or circuit open
"""
if not _PARSER_URL or _PARSER_TIMEOUT == 0:
return None, ""
@ -111,27 +129,120 @@ def _call_parser(text: str) -> tuple:
return None, ""
code = _extract_avap_code(text)
logger.info(f"[ptvl] extracted code ({len(code)} chars): {repr(code[:120])}")
if not code.strip():
return None, ""
base_url = _PARSER_URL.rstrip('/')
try:
payload = {
"code": code,
"test_inputs": test_inputs or {},
"test_list": test_list or [],
}
resp = _requests.post(
f"{_PARSER_URL.rstrip('/')}/parse",
json={"code": code},
f"{base_url}/api/v1/run",
json=payload,
timeout=_PARSER_TIMEOUT,
)
if resp.status_code == 404:
# /api/v1/run not deployed yet — fall back to /parse
logger.info("[ptvl] /api/v1/run not available, falling back to /parse")
return _call_parser_parse(base_url, code)
data = resp.json()
if data.get("valid", False):
_parser_cb.success()
return True, ""
_parser_cb.success() # parser responded — it is healthy
return False, data.get("error", "parse error")
_parser_cb.success()
logger.info(f"[ptvl] parser response: {data}")
if not data.get("success", False):
error = data.get("error", "")
if not error:
failed_logs = [l for l in data.get("logs", []) if not l.get("success")]
if failed_logs:
error = failed_logs[0].get("error", "runtime error")
return False, error or "runtime error"
# Execution succeeded — check assertion result if assertions were provided
if test_list and not data.get("assertion_result", True):
return False, "assertion failed: the code ran but did not produce the expected output"
return True, ""
except Exception as exc:
_parser_cb.failure()
logger.warning(f"[ptvl] parser call failed: {exc}")
return None, ""
def _call_parser_parse(base_url: str, code: str) -> tuple:
    """Validate *code* against the AST-only ``/parse`` endpoint.

    Fallback used when the execution endpoint is unavailable. Follows the
    same tuple contract as ``_call_parser``: ``(True, "")`` when the code
    parses, ``(False, error)`` when it does not, and ``(None, "")`` when
    the parser itself cannot be reached.
    """
    try:
        response = _requests.post(
            f"{base_url}/parse",
            json={"code": code},
            timeout=_PARSER_TIMEOUT,
        )
        body = response.json()
        # Any well-formed response counts as a healthy parser.
        _parser_cb.success()
        logger.info(f"[ptvl] parser response (/parse fallback): {body}")
        if body.get("valid", False):
            return True, ""
        return False, body.get("error", "parse error")
    except Exception as exc:
        _parser_cb.failure()
        logger.warning(f"[ptvl] /parse fallback failed: {exc}")
        return None, ""
# ── Test generation helper — used by both build_graph and AskAgentStream ───────
def _run_generate_tests(user_request: str, generated_code: str, llm) -> tuple:
    """Generate test_inputs + test_list for the given AVAP code.

    Asks the LLM (via TEST_GENERATION_PROMPT) for a JSON object containing
    ``test_inputs`` (request-parameter values injected when the parser
    executes the code) and ``test_list`` (regex assertion strings evaluated
    after execution).

    Returns:
        (test_inputs: dict, test_list: list). Never raises — falls back to
        ({}, []) on timeout or any LLM/parse error so the caller can still
        validate the code without assertions.
    """
    import concurrent.futures

    def _run():
        # /no_think suppresses qwen3 thinking mode; _strip_thinking below
        # removes any tags that leak through anyway.
        prompt = "/no_think\n\n" + TEST_GENERATION_PROMPT.format(
            user_request=user_request,
            generated_code=generated_code,
        )
        resp = llm.invoke([SystemMessage(content=prompt)])
        raw = _strip_thinking(resp.content)
        logger.info(f"[generate_tests] raw output: {repr(raw[:200])}")
        # Tolerate a markdown code fence around the JSON despite the
        # "raw JSON only" instruction in the prompt.
        if raw.startswith("```"):
            raw = _re.sub(r"^```[a-z]*\n?", "", raw)
            raw = raw.rstrip("`").strip()
        # Keep only the outermost {...} span in case the model added prose.
        m = _re.search(r'\{.*\}', raw, _re.DOTALL)
        if m:
            raw = m.group(0)
        data = _json.loads(raw)
        if not isinstance(data, dict):
            raise ValueError(f"expected JSON object, got {type(data).__name__}: {repr(raw[:80])}")
        return data.get("test_inputs", {}), data.get("test_list", [])

    # BUGFIX: the previous version used `with ThreadPoolExecutor(...) as ex:`.
    # When future.result() timed out, exiting the `with` block still called
    # shutdown(wait=True), which blocks until the hung LLM call returns —
    # silently defeating the 15 s timeout. Manage the executor explicitly and
    # shut it down without waiting.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(_run)
    try:
        test_inputs, test_list = future.result(timeout=15)
        logger.info(f"[generate_tests] {len(test_list)} assertions generated")
    except concurrent.futures.TimeoutError:
        logger.warning("[generate_tests] timed out after 15s — skipping assertions")
        test_inputs, test_list = {}, []
    except Exception as exc:
        logger.warning(f"[generate_tests] skipped ({type(exc).__name__}): {exc}")
        test_inputs, test_list = {}, []
    finally:
        # Return immediately; an in-flight LLM call is abandoned rather than
        # awaited. cancel_futures drops the task if it never started (3.9+).
        executor.shutdown(wait=False, cancel_futures=True)
    return test_inputs, test_list
# ── Session stores ─────────────────────────────────────────────────────────────
session_store: dict[str, list] = defaultdict(list)
@ -593,9 +704,18 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
)
resp = llm.invoke([prompt] + state["messages"])
logger.info(f"[generate_code] {len(resp.content)} chars")
#logger.info(resp.content)
_persist(state, resp)
return {"messages": [resp]}
def generate_tests(state: AgentState) -> AgentState:
    """Derive runtime test data from the conversation's last exchange.

    The second-to-last message is taken as the user's request and the last
    message as the generated code; with fewer than two messages the last
    message serves both roles. Results are stored in state for the
    validate_code node.
    """
    messages = state["messages"]
    generated = messages[-1]
    user_msg = messages[-2] if len(messages) >= 2 else generated

    def _text(message):
        # LangChain messages expose .content; fall back to str() for others.
        return getattr(message, "content", str(message))

    test_inputs, test_list = _run_generate_tests(_text(user_msg), _text(generated), llm)
    return {"test_inputs": test_inputs, "test_list": test_list}
def respond_conversational(state):
extra_context = state.get("extra_context", "")
if extra_context:
@ -629,7 +749,11 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
def validate_code(state: AgentState) -> AgentState:
last_msg = state["messages"][-1]
content = getattr(last_msg, "content", str(last_msg))
valid, trace = _call_parser(content)
valid, trace = _call_parser(
content,
test_inputs=state.get("test_inputs") or {},
test_list=state.get("test_list") or [],
)
if valid is None:
logger.warning("[ptvl] parser unavailable — returning unvalidated")
return {"validation_status": "PARSER_UNAVAILABLE", "parser_trace": ""}
@ -645,9 +769,15 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
feedback = (
"\n\n<parser_feedback>\n"
"The previous attempt produced invalid AVAP code. Specific failures:\n\n"
"The previous attempt failed when the AVAP code was executed. "
"The execution engine reported the following error:\n\n"
f"{parser_trace}\n\n"
"Correct these errors. Do not repeat the same constructs.\n"
"Rules to fix it:\n"
"- Read the error carefully — it identifies the exact command or line that failed.\n"
"- Do NOT repeat the same construct that caused the error.\n"
"- Only use commands from <avap_syntax_reminder> or <context>.\n"
"- If the error mentions an unknown command, replace it with the correct AVAP equivalent.\n"
"- If the error mentions a variable, make sure it is declared before use.\n"
"</parser_feedback>"
) if parser_trace else ""
@ -738,6 +868,7 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
graph_builder.add_node("retrieve", retrieve)
graph_builder.add_node("generate", generate)
graph_builder.add_node("generate_code", generate_code)
graph_builder.add_node("generate_tests", generate_tests)
graph_builder.add_node("validate_code", validate_code)
graph_builder.add_node("generate_code_retry", generate_code_retry)
graph_builder.add_node("validate_code_after_retry",validate_code_after_retry)
@ -771,8 +902,9 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
}
)
# CODE_GENERATION path: generate → validate → (retry if invalid) → END
graph_builder.add_edge("generate_code", "validate_code")
# CODE_GENERATION path: generate → generate_tests → validate → (retry if invalid) → END
graph_builder.add_edge("generate_code", "generate_tests")
graph_builder.add_edge("generate_tests", "validate_code")
graph_builder.add_conditional_edges(
"validate_code",
route_after_validate,

View File

@ -189,11 +189,12 @@ CODE_GENERATION_PROMPT = SystemMessage(
"4. Write the MINIMUM code needed. No extra connectors, no unrelated variables.\n"
"5. Add brief inline comments explaining each part.\n"
"6. Answer in the same language the user used.\n"
"7. Do NOT use registerEndpoint unless the user explicitly asks to configure, "
"register, or set up an endpoint, API route, or HTTP handler. "
"For all other requests, write the logic directly without endpoint registration.\n"
"</critical_rules>\n\n"
"<avap_syntax_reminder>\n"
"// Register an HTTP endpoint\n"
"registerEndpoint(\"GET\", \"/path\", [], \"scope\", handlerFn, \"\")\n\n"
"// Declare a function — uses curly braces, NOT end()\n"
"function handlerFn() {{\n"
" msg = \"Hello World\"\n"
@ -220,7 +221,10 @@ CODE_GENERATION_PROMPT = SystemMessage(
" // ...\n"
"exception(errVar)\n"
" // handle\n"
"end()\n"
"end()\n\n"
"// Register an HTTP endpoint — USE ONLY when the user explicitly asks to\n"
"// configure, register, or set up an endpoint, API route, or HTTP handler.\n"
"registerEndpoint(\"GET\", \"/path\", [], \"scope\", handlerFn, \"\")\n"
"</avap_syntax_reminder>\n\n"
"<task>\n"
@ -237,6 +241,51 @@ CODE_GENERATION_PROMPT = SystemMessage(
)
)
# Prompt for the generate_tests node (ADR-0009). Filled via str.format() with
# user_request and generated_code, so every literal brace in the JSON example
# is escaped as {{ / }} — only the two placeholders below are real fields.
# The model must answer with a raw JSON object:
#   {"test_inputs": {...}, "test_list": ["re.match(r'...', str(var))", ...]}
TEST_GENERATION_PROMPT = (
    "<role>\n"
    "You are a test case generator for AVAP code. "
    "Given a user request and the AVAP code that was generated, "
    "produce minimal test inputs and assertions to verify the code behaves correctly.\n"
    "</role>\n\n"
    "<avap_variables_rule>\n"
    "In AVAP, variables assigned during execution are available after execution.\n"
    "Two distinct naming roles exist — do NOT confuse them:\n\n"
    "1. addParam(\"request_param_name\", avap_variable_name)\n"
    " - \"request_param_name\" (first arg, a string literal) is the HTTP request parameter. "
    "Use it as the KEY in test_inputs.\n"
    " - avap_variable_name (second arg, an identifier) is the AVAP variable that receives "
    "the value. Use it (unquoted) in assertions.\n\n"
    " Example: addParam(\"client_id\", id_interno)\n"
    " → test_inputs key: \"client_id\"\n"
    " → assertion variable: id_interno\n\n"
    "2. Direct assignments (e.g. msg = \"Hello\", result = a + b) — use the left-hand "
    "variable name (unquoted) in assertions. These variables need no test_inputs entry.\n"
    "</avap_variables_rule>\n\n"
    "<assertion_format>\n"
    "Each assertion must be a QUOTED JSON STRING in this exact format:\n"
    " \"re.match(r'<regex_pattern>', str(<avap_variable_name>))\"\n"
    "Where:\n"
    "- The entire expression is wrapped in double quotes — it is a JSON string.\n"
    "- <regex_pattern> is a regex that matches the expected value.\n"
    "- <avap_variable_name> is the AVAP variable identifier (NOT the request param name, NOT quoted).\n"
    "</assertion_format>\n\n"
    "<output_rule>\n"
    "Output ONLY a valid JSON object with exactly two keys. "
    "Every item in test_list MUST be a quoted string. Raw JSON only — "
    "no explanation, no markdown, no code block.\n\n"
    "Example — code uses addParam(\"client_id\", id_interno):\n"
    "{{\"test_inputs\": {{\"client_id\": \"12345\"}}, "
    "\"test_list\": [\"re.match(r'^\\\\d+$', str(id_interno))\"]}}\n"
    "Note: test_inputs key is \"client_id\" (request param), assertion uses id_interno (AVAP variable).\n"
    "</output_rule>\n\n"
    "<user_request>{user_request}</user_request>\n\n"
    "<generated_code>{generated_code}</generated_code>"
)
CONVERSATIONAL_PROMPT = SystemMessage(
content=(
"<role>\n"

View File

@ -14,7 +14,7 @@ from langchain_core.messages import AIMessage, SystemMessage
from utils.llm_factory import create_chat_model
from utils.emb_factory import create_embedding_model
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store, _load_layer2_model, _call_parser, _extract_avap_code
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store, _load_layer2_model, _call_parser, _extract_avap_code, _run_generate_tests
from utils.classifier_export import maybe_export, force_export
from evaluate import run_evaluation
@ -302,7 +302,11 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
complete_block = code_buffer[:close_pos + 3]
rest = code_buffer[close_pos + 3:]
valid, trace = _call_parser(complete_block)
# Generate tests before validation so the parser can
# execute the code with real inputs and check assertions.
_avap_code = _extract_avap_code(complete_block)
_ti, _tl = _run_generate_tests(query, _avap_code, active_llm)
valid, trace = _call_parser(complete_block, test_inputs=_ti, test_list=_tl)
if valid is False:
# Ask LLM to fix only the code block
@ -318,7 +322,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
fixed_code = _extract_avap_code(fix_resp.content)
fixed_block = f"{fence_open}\n{fixed_code}\n```"
valid2, _ = _call_parser(fixed_block)
valid2, _ = _call_parser(fixed_block, test_inputs=_ti, test_list=_tl)
if valid2 is False:
to_yield = fixed_block
validation_status = "INVALID_UNRESOLVED"

View File

@ -28,4 +28,7 @@ class AgentState(TypedDict):
# -- PTVL (ADR-0009)
parser_trace: str # raw parser error trace from first validation (empty if valid)
validation_status: str # "" | "INVALID_UNRESOLVED" | "PARSER_UNAVAILABLE"
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
# -- TEST GENERATION
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables

View File

@ -62,7 +62,8 @@ graph TD
│ │ ├── golden_dataset.json # Ground-truth Q&A dataset for EvaluateRAG
│ │ └── utils/
│ │ ├── emb_factory.py # Provider-agnostic embedding model factory
│ │ └── llm_factory.py # Provider-agnostic LLM factory
│ │ ├── llm_factory.py # Provider-agnostic LLM factory
│ │ └── classifier_export.py # Exports classify_history to JSONL; triggers retraining
│ ├── tests/
│ │ └── test_prd_0002.py # Unit tests — editor context, classifier, proxy parsing
│ ├── Dockerfile # Multi-stage container build
@ -82,7 +83,12 @@ graph TD
│ │ ├── ADR-0002-two-phase-streaming.md
│ │ ├── ADR-0003-hybrid-retrieval-rrf.md
│ │ ├── ADR-0004-claude-eval-judge.md
│ │ └── ADR-0005-embedding-model-selection.md
│ │ ├── ADR-0005-embedding-model-selection.md
│ │ ├── ADR-0006-reward-algorithm-dataset-synthesis.md
│ │ ├── ADR-0007-mandatory-syntactic-validation-layer.md
│ │ ├── ADR-0008-adaptive-query-routing-intent-history.md
│ │ ├── ADR-0009-per-type-response-validation.md
│ │ └── ADR-0010-classifier-continuous-retraining.md
│ └── product/ # Product Requirements Documents
│ ├── PRD-0001-openai-compatible-proxy.md
│ └── PRD-0002-editor-context-injection.md
@ -113,6 +119,11 @@ graph TD
│ │ ├── embeddings.py # OllamaEmbeddings adapter (Chonkie-compatible)
│ │ └── prompts.py # Prompt templates for pipeline LLM calls
│ │
│ ├── classifier/ # [PIPELINE C] Classifier retraining pipeline
│ │ ├── retrain_pipeline.py # Champion/Challenger training, evaluation & promotion
│ │ ├── seed_classifier_dataset.jsonl # 204 hand-crafted bilingual seed examples
│ │ └── README.md # Classifier pipeline reference
│ │
│ └── ingestion/ # [PIPELINE B] AVAP-native classic ingestion
│ ├── avap_chunker.py # Custom AVAP lexer + chunker (MinHash dedup, overlaps)
│ ├── avap_ingestor.py # Async ES ingestor with DLQ (producer/consumer pattern)
@ -343,6 +354,7 @@ HF_TOKEN=hf_...
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
ANTHROPIC_API_KEY=sk-ant-...
ANTHROPIC_MODEL=claude-sonnet-4-20250514
AVAP_PARSER_URL=http://host.docker.internal:8888
```
| Variable | Required | Description | Example |
@ -366,6 +378,7 @@ ANTHROPIC_MODEL=claude-sonnet-4-20250514
| `HF_EMB_MODEL_NAME` | Yes | HuggingFace embeddings model name | `Qwen/Qwen3-Embedding-0.6B` |
| `ANTHROPIC_API_KEY` | Yes* | Anthropic API key — required for the `EvaluateRAG` endpoint | `sk-ant-...` |
| `ANTHROPIC_MODEL` | No | Claude model used by the RAG evaluation suite | `claude-sonnet-4-20250514` |
| `AVAP_PARSER_URL` | No | AVAP parser REST API base URL — used by PTVL for code execution and assertion validation | `http://host.docker.internal:8888` |
> Never commit real secret values. Use placeholder values when sharing configuration examples.
@ -663,7 +676,7 @@ For the full set of contribution standards, see [CONTRIBUTING.md](./CONTRIBUTING
| [docs/API_REFERENCE.md](./docs/API_REFERENCE.md) | Complete gRPC API contract, message types, client examples |
| [docs/RUNBOOK.md](./docs/RUNBOOK.md) | Operational playbooks, health checks, incident response |
| [docs/AVAP_CHUNKER_CONFIG.md](./docs/AVAP_CHUNKER_CONFIG.md) | `avap_config.json` reference — blocks, statements, semantic tags, how to extend |
| [docs/ADR/](./docs/ADR/) | Architecture Decision Records |
| [docs/ADR/](./docs/ADR/) | Architecture Decision Records (ADR-0001 through ADR-0010) |
| [docs/product/](./docs/product/) | Product Requirements Documents |
| [research/](./research/) | Experiment results, benchmarks, and datasets |

View File

@ -3,6 +3,33 @@
All notable changes to the **Brunix Assistance Engine** will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
---
## [1.7.0] - 2026-04-13
### Added
- ENGINE: Added `generate_tests` LangGraph node — after `generate_code`, invokes the LLM with `TEST_GENERATION_PROMPT` to produce `test_inputs` (variable dict) and `test_list` (regex assertion array) used to validate generated code at runtime. Runs under a 15-second `ThreadPoolExecutor` timeout to prevent blocking.
- ENGINE: Added `TEST_GENERATION_PROMPT` to `prompts.py` — generates structured JSON with `test_inputs` and `test_list` from the user request and the generated AVAP code. Assertions use `re.match(r'<pattern>', str(<var>))` format evaluated after code execution.
- ENGINE: Added `_strip_thinking()` utility in `graph.py` — strips `<think>...</think>` blocks and orphaned `</think>` tags from qwen3 thinking-mode output before code extraction or test parsing.
- ENGINE: Upgraded AVAP parser integration from `/parse` (AST-only) to `/api/v1/run` (execution + assertion validation). Payload now includes `test_inputs` and `test_list` from state. Falls back to `/parse` on HTTP 404.
- ENGINE: Added parser response logging (`[ptvl] parser response: ...`) for observability of execution and assertion outcomes.
- DATA: Expanded classifier seed dataset (`seed_classifier_dataset.jsonl`) from 95 to 204 examples. Added 100 Spanish-language examples covering all four intent categories, with emphasis on interrogative `CODE_GENERATION` patterns (`como seria`, `como haria`, `puedes escribir`, `muéstrame`, `necesito`).
### Changed
- ENGINE: `CODE_GENERATION_PROMPT` — added rule 7 suppressing `registerEndpoint` unless the user explicitly asks to configure, register, or set up an endpoint. Moved `registerEndpoint` syntax reference to end of `<avap_syntax_reminder>` with conditional comment.
- ENGINE: `AgentState` — added `test_inputs: dict` and `test_list: list` fields to carry generated test data between `generate_tests` and `validate_code` nodes.
- ENGINE: LangGraph `build_graph` wiring updated: `generate_code → generate_tests → validate_code` (was `generate_code → validate_code`).
- ENGINE: `_call_parser()` signature extended — accepts `test_inputs` and `test_list` params, passes them as JSON payload to `/api/v1/run`. Parser payload key changed from `variables` to `test_inputs`.
- ENGINE: `generate_code_retry` feedback message updated to reference runtime execution errors (not just syntax errors).
- DOCS: Updated `docs/ADR/ADR-0009-per-type-response-validation.md` — full rewrite of Decision section with three-level validation flow, `generate_tests` node documentation, parser protocol (primary + fallback), `_strip_thinking()` utility, updated `AgentState` fields, and updated consequences.
- DOCS: Updated `docs/ARCHITECTURE.md` — version 1.7.x, related ADRs, component inventory with `generate_tests`, updated `build_graph` flowchart, RC-07/RC-08 routing contract entries, all PTVL `AgentState` fields.
### Fixed
- ENGINE: Fixed qwen3 thinking mode leaking `<think>...</think>` and `</think>` tags into generated code and test output — resolved by `_strip_thinking()` applied before code extraction and JSON parsing.
- ENGINE: Fixed `KeyError: 'test_inputs'` — parser payload was using key `variables`; updated to `test_inputs` to match the `/api/v1/run` contract.
- ENGINE: Fixed `generate_tests` silently skipping — `TEST_GENERATION_PROMPT` contained literal `{` and `}` in example JSON which Python `.format()` interpreted as placeholders. Fixed by escaping all literal braces as `{{` and `}}`.
- ENGINE: Fixed classifier misclassifying Spanish interrogative code requests (`como seria un API...`) as `RETRIEVAL` — root cause was English-only seed dataset with no interrogative training examples. Fixed by expanding seed dataset with bilingual examples.
---
## [1.6.2] - 2026-03-26
### Changed
- RESEARCH: updated `embeddings/Embedding model selection.pdf`.

BIN
docs/.DS_Store vendored

Binary file not shown.

View File

@ -112,7 +112,9 @@ Establish the **Mandatory Syntactic Validation Layer (MSVL)** as a non-optional
### 1. Parser integration in `EvaluateRAG`
Every code block in a generated response must be submitted to the AVAP Parser via gRPC before RAGAS scoring. The parser returns a binary result: `VALID` or `INVALID` with a failure category (`unknown_token`, `unexpected_construct`, `foreign_keyword`, `syntax_error`).
Every code block in a generated response must be submitted to the AVAP Parser before RAGAS scoring. The parser returns a binary result: `VALID` or `INVALID`.
**Implementation note (2026-04-12):** the AVAP Parser exposes a **REST HTTP API** (Tornado, port 8888), not gRPC as originally anticipated. The call contract is `POST /parse` with body `{"code": "..."}`, returning `{"valid": true/false, "error": "..."}`. The production PTVL (ADR-0009) uses this REST interface. The evaluation pipeline integration should use the same interface.
### 2. `syntactic_validity` as an independent metric
@ -132,7 +134,7 @@ final_answer_relevancy(entry) =
### 3. Parser SLA and fallback policy
The AVAP Parser gRPC service must respond within 2 seconds per call. If the parser is unreachable or times out, the evaluation run is **aborted** with an explicit error. Silent fallback to RAGAS-only scoring is prohibited.
The AVAP Parser REST service must respond within 2 seconds per call. If the parser is unreachable or times out, the evaluation run is **aborted** with an explicit error. Silent fallback to RAGAS-only scoring is prohibited.
```python
if parser_status == UNAVAILABLE:

View File

@ -1,7 +1,8 @@
# ADR-0009: Per-Type Response Validation Layer
**Date:** 2026-04-10
**Status:** Accepted
**Last updated:** 2026-04-13
**Status:** Implemented
**Deciders:** Rafael Ruiz (CTO)
**Related ADRs:** ADR-0007 (MSVL for RAG Evaluation), ADR-0008 (Adaptive Query Routing), ADR-0003 (Hybrid Retrieval RRF)
@ -47,14 +48,20 @@ Add a **Per-Type Response Validation Layer (PTVL)** to the production LangGraph
| Type | When | What | Mechanism |
|---|---|---|---|
| `CODE_GENERATION` | Post-generation | Syntactic validity of generated AVAP code | AVAP Parser gRPC — deterministic |
| `CODE_GENERATION` | Post-generation | Syntax + execution + assertion correctness | AVAP Parser REST HTTP — `/api/v1/execute` with `/parse` fallback |
| `RETRIEVAL` | Pre-generation | Relevance of retrieved context to the query | LLM relevance check — `CONFIDENCE_PROMPT_TEMPLATE` |
| `CONVERSATIONAL` | None | — | No retrieval, no code generated |
| `PLATFORM` | None | — | No retrieval, no code generated |
---
### Decision 1 — CODE_GENERATION: parser validation with trace-guided retry
### Decision 1 — CODE_GENERATION: three-level validation with trace-guided retry
Validation operates at three levels in order of depth:
1. **Syntax** — can the code be parsed into a valid AST?
2. **Execution** — does the code run without runtime errors?
3. **Assertions** — does the code produce the expected output?
#### Flow
@ -62,46 +69,141 @@ Add a **Per-Type Response Validation Layer (PTVL)** to the production LangGraph
generate_code node
[V1] AVAP Parser gRPC
generate_tests node
[LLM generates test_inputs + test_list from code + user request]
├── VALID ──────────────────────────────► return response
validate_code node
└── INVALID + line-by-line trace
[inject trace into retry prompt]
generate_code_retry node (1 attempt only)
[V2] AVAP Parser gRPC
├── VALID ──────────────────────► return response
└── INVALID ────────────────────► return response + validation_status flag
[V1] AVAP Parser POST /api/v1/run
{"code": "...", "test_inputs": {...}, "test_list": [...]}
├── success=true + assertion_result=true ──► return response
├── success=false (runtime error) ──────────┐
│ │
└── assertion_result=false ─────────────────┤
[inject error into retry prompt]
generate_code_retry node (1 attempt only)
validate_code_after_retry
[V2] AVAP Parser /api/v1/run
├── VALID ──────► return response
└── INVALID ────► return response + validation_status flag
```
**Fallback path:** If `/api/v1/run` returns 404 (endpoint not yet deployed), validation falls back to `POST /parse` for AST-only syntax checking. The fallback is transparent — no configuration change required.
#### generate_tests node
Before validation, a dedicated LLM call generates test cases from the user's original request and the generated code:
```python
# Input to TEST_GENERATION_PROMPT
user_request: "como seria una api que reciba un parametro y lo devuelva?"
generated_code: "addParam(\"client_id\", id_interno)\naddResult(id_interno)"
# Output
{
"test_inputs": {"client_id": "12345"},
"test_list": ["re.match(r'^\\d{5}$', str(id_interno))"]
}
```
`test_inputs` are injected as request variables when the parser executes the code. `test_list` items are regex assertions evaluated against the output variables after execution.
If the LLM call fails or times out (15s hard limit), `generate_tests` returns empty `test_inputs` and `test_list` — validation continues using execution-only (no assertions). This keeps the node non-blocking.
#### Trace-guided retry
The parser trace is injected into the generation prompt as a structured correction context:
The parser error is injected into the retry prompt as a structured correction context:
```
<parser_feedback>
The previous attempt produced invalid AVAP code. Specific failures:
The previous attempt failed when the AVAP code was executed.
The execution engine reported the following error:
Line 3: unknown command 'getSHA256' — expected known identifier
Line 7: unexpected construct 'for i in range(...)' — AVAP loop syntax required
[error from /api/v1/execute logs]
Correct these errors. Do not repeat the same constructs.
Rules to fix it:
- Read the error carefully — it identifies the exact command or line that failed.
- Do NOT repeat the same construct that caused the error.
- Only use commands from <avap_syntax_reminder> or <context>.
- If the error mentions an unknown command, replace it with the correct AVAP equivalent.
- If the error mentions a variable, make sure it is declared before use.
</parser_feedback>
```
This is not a blind retry. The LLM receives the exact failure points and can target its corrections. ADR-0007 documented the mapping between common hallucinated commands and their valid AVAP equivalents (`getSHA256` → `encodeSHA256`, `returnResult``addResult`, etc.) — the trace makes these corrections automatic without hardcoding the mapping.
This is not a blind retry. The LLM receives the exact runtime failure and can target its corrections.
#### qwen3 thinking stripping
The engine strips `<think>...</think>` blocks from all LLM outputs before passing code to the parser. `qwen3` models emit thinking content by default; without stripping, the parser receives the full response including reasoning text and fails to find valid AVAP constructs.
```python
def _strip_thinking(text: str) -> str:
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
text = re.sub(r"</think>", "", text) # lone closing tag
return text.strip()
```
This runs before code block extraction in `_extract_avap_code` and before JSON parsing in `generate_tests`.
#### Parser protocol
The AVAP Parser exposes a **REST HTTP API** (Tornado, port 8888). Two endpoints are used:
**Primary — `/api/v1/run`** (execution + assertions):
```
POST {AVAP_PARSER_URL}/api/v1/run
Content-Type: application/json
{
"code": "<AVAP code>",
"test_inputs": {"param": "value"},
"test_list": ["re.match(r'^pattern$', str(variable))"]
}
```
Response:
```json
{
"success": true,
"result": [...],
"variables": {"param": "value", ...},
"assertion_result": true,
"logs": [{"command": "...", "duration_ms": 1.2, "success": true}]
}
```
**Fallback — `/parse`** (AST-only, when `/api/v1/run` returns 404):
```
POST {AVAP_PARSER_URL}/parse
Content-Type: application/json
{"code": "<AVAP code>"}
```
Response:
```json
{"valid": true, "ast": {...}}
{"valid": false, "error": "..."}
```
Code is extracted from the LLM response by scanning for the first markdown code block (` ```avap ` or generic ` ``` `). `<think>` content is stripped before extraction. The fence markers are stripped before sending to the parser.
#### Parser SLA
Inherited from ADR-0007: ≤2 seconds per call. **Silent fallback is permitted in production** (unlike evaluation, where ADR-0007 mandates abort). The distinction is that evaluation scores must be trustworthy; production responses degrade gracefully.
≤2 seconds per call (`AVAP_PARSER_TIMEOUT`). **Silent fallback is permitted in production** (unlike evaluation, where ADR-0007 mandates abort). The distinction is that evaluation scores must be trustworthy; production responses degrade gracefully.
#### Parser availability — circuit breaker
@ -135,8 +237,8 @@ Setting `AVAP_PARSER_TIMEOUT=0` permanently opens the circuit — disables parse
#### New environment variables
```
AVAP_PARSER_URL=grpc://... # URL of AVAP Parser gRPC service
AVAP_PARSER_TIMEOUT=2 # seconds per call; 0 = disable validation
AVAP_PARSER_URL=http://... # URL of AVAP Parser REST service (e.g. http://45.77.193.144:8888)
AVAP_PARSER_TIMEOUT=2 # seconds per call; 0 = disable validation entirely
PARSER_CB_THRESHOLD=3 # consecutive failures before circuit opens
PARSER_CB_COOLDOWN=30 # seconds before circuit attempts half-open probe
```
@ -189,16 +291,85 @@ Reformulate this query using broader terms or alternative phrasing.
---
---
### Decision 3 — AskAgentStream: streaming state machine for CODE_GENERATION
The `AskAgentStream` path streams tokens directly to the client. Post-generation validation (Decision 1) cannot run here because tokens are already yielded before the response is complete.
**Decision:** implement a **streaming state machine** that operates inline on the token stream. The machine has two states: TEXT and CODE.
#### Flow
```
LLM token stream
STATE: TEXT
→ yield token to client immediately
→ detect ``` fence in lookahead buffer (2-char safety window for split tokens)
│ ``` detected
STATE: CODE (buffering — nothing yielded to client)
→ accumulate tokens in code_buffer
→ detect closing ``` after first newline
│ closing ``` detected
_call_parser(complete_block)
├── VALID ──────────────────────────────► yield code block → back to TEXT
├── INVALID
│ │
│ ▼
│ LLM fix call (fix only the code block, not the full response)
│ "Fix this AVAP code: {trace}"
│ │
│ ▼
│ _call_parser(fixed_block)
│ ├── VALID ──────────────────────► yield fixed block → back to TEXT
│ ├── INVALID ────────────────────► yield fixed block + INVALID_UNRESOLVED → back to TEXT
│ └── UNAVAILABLE ────────────────► yield fixed block + PARSER_UNAVAILABLE → back to TEXT
└── UNAVAILABLE ────────────────────────► yield block as-is + PARSER_UNAVAILABLE → back to TEXT
```
#### Key properties
- Text before and after the code block streams to the client without delay.
- Only the code block itself introduces latency (one parser call, optionally one LLM fix call + one parser call).
- The fix call asks the LLM to correct **only the code block**, not the full response — the text already streamed to the client remains valid.
- If the response contains multiple code blocks, each block is processed independently in sequence. `validation_status` in the final `is_final=True` message reflects the last block validated.
- If the stream ends while still in CODE mode (malformed response without closing fence), the buffer is flushed as-is.
#### Difference from AskAgent path
| Property | AskAgent | AskAgentStream |
|---|---|---|
| Validation point | Post-generation, pre-delivery | Inline, at code block boundary |
| Retry mechanism | Full `generate_code_retry` LangGraph node | Targeted LLM fix call (code block only) |
| Text streaming | N/A (non-streaming) | Uninterrupted |
| `validation_status` delivery | In `AgentResponse` (only response) | In final `AgentResponse` (`is_final=True`) |
---
## Graph changes
### New nodes
| Node | Graph | Trigger |
|---|---|---|
| `validate_code` | `build_graph` | After `generate_code` |
| `generate_tests` | `build_graph` | After `generate_code` — generates `test_inputs` + `test_list` |
| `validate_code` | `build_graph` | After `generate_tests` |
| `generate_code_retry` | `build_graph` | After `validate_code` when INVALID |
| `check_context_relevance` | `build_graph` + `build_prepare_graph` | After `retrieve`, before `generate` (RETRIEVAL only) |
| `validate_code_after_retry` | `build_graph` | After `generate_code_retry` |
| `check_context_relevance` | `build_graph` + `build_prepare_graph` | After `retrieve`, RETRIEVAL only |
| `reformulate_with_hint` | `build_graph` + `build_prepare_graph` | After `check_context_relevance` when NO |
| `retrieve_retry` | `build_graph` + `build_prepare_graph` | After `reformulate_with_hint` |
### AskAgentStream
No new LangGraph nodes. The validation logic runs as a **streaming state machine** inline in `server.py:AskAgentStream`. See Decision 3.
### Updated flow — `build_graph`
@ -221,10 +392,11 @@ flowchart TD
RH --> RT2[retrieve retry]
RT2 --> GE
GC --> VC{validate_code\nParser gRPC}
VC -->|VALID| END([end])
VC -->|INVALID + trace| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code\nParser gRPC}
GC --> GT[generate_tests\nLLM → test_inputs + test_list]
GT --> VC{validate_code\n/api/v1/execute}
VC -->|VALID + assertions pass| END([end])
VC -->|runtime error or assertion fail| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code_after_retry\n/api/v1/execute}
VC2 -->|VALID| END
VC2 -->|INVALID| END
@ -241,9 +413,12 @@ flowchart TD
class AgentState(TypedDict):
...
# PTVL fields
parser_trace: str # raw parser trace from first validation attempt (empty if valid)
parser_trace: str # raw parser error from first validation attempt (empty if valid)
validation_status: str # see validation status values below
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
# Test generation fields (set by generate_tests node)
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables after execution
```
### Validation status values
@ -267,6 +442,8 @@ message AgentResponse {
}
```
**Implementation status: complete (2026-04-12).** Field 4 added to `brunix.proto`. Populated in both `AskAgent` (from `final_state`) and `AskAgentStream` (in the `is_final=True` message at end of stream).
Clients that do not read `validation_status` are unaffected — the field defaults to empty string.
---
@ -287,7 +464,7 @@ A `CODE_GENERATION` response returned without parser validation due to parser un
### RC-08 — Retry budget (priority: medium)
Each request has a maximum of **1 retry** regardless of type. A `CODE_GENERATION` request that fails parser validation twice returns the second attempt with `validation_status=true`. A `RETRIEVAL` request whose context is insufficient reformulates once and generates unconditionally on the second retrieval.
Each request has a maximum of **1 retry** regardless of type. A `CODE_GENERATION` request that fails parser validation twice returns the second attempt with `validation_status=INVALID_UNRESOLVED`. A `RETRIEVAL` request whose context is insufficient reformulates once and generates unconditionally on the second retrieval.
No request may enter more than one retry cycle.
@ -298,25 +475,31 @@ No request may enter more than one retry cycle.
### Positive
- Syntactically invalid AVAP code no longer reaches users silently. `validation_status` gives the client a typed signal: `INVALID_UNRESOLVED` (evidence of bad code) vs `PARSER_UNAVAILABLE` (no evidence either way) — clients can respond differently to each.
- The parser trace makes retries targeted rather than blind — the LLM corrects specific lines, not the whole response.
- Execution validation (`/api/v1/execute`) catches runtime errors invisible to the AST parser — undefined variables, unsupported commands, type mismatches.
- Assertion validation verifies the code produces the expected output, not just that it runs.
- The parser error trace makes retries targeted rather than blind — the LLM corrects specific runtime failures, not the whole response.
- Circuit breaker prevents parser outages from adding latency to every `CODE_GENERATION` request. After 3 consecutive failures the engine stops trying for 30 seconds.
- Context relevance check catches retrievals that return topically adjacent but non-answering chunks, reducing fluent-but-ungrounded responses.
- `AVAP_PARSER_TIMEOUT=0` allows development without the parser service — no hard dependency at startup.
- Automatic fallback from `/api/v1/execute` to `/parse` — new parser endpoint can be deployed without a coordinated engine restart.
### Negative / Trade-offs
- **`CODE_GENERATION` latency**: +1 parser gRPC call per request (~50–200ms for valid code). +1 LLM generation call + 1 parser call on invalid code (~1–2s additional).
- **`CODE_GENERATION` latency**: +1 `generate_tests` LLM call + 1 parser execution call per request. If `generate_tests` fails (timeout/error), the node returns empty tests and validation continues — no blocking.
- **`RETRIEVAL` latency**: +1 LLM call (relevance check) on every request. At `qwen3:1.7b` local inference, this adds ~300–500ms to every RETRIEVAL request — not negligible.
- The parser becomes a **soft production dependency** for CODE_GENERATION. Parser outages degrade validation silently; monitoring must alert on sustained `parser unavailable` log volume.
- The context relevance check is a **generative model doing a binary classification task** — the same architectural mismatch noted in ADR-0008 for the classifier. It is the correct interim solution while no discriminative relevance model exists.
- **`registerEndpoint` suppressed by default** — `CODE_GENERATION_PROMPT` rule 7 omits `registerEndpoint` unless the user explicitly requests endpoint registration. This prevents over-engineering simple code responses but requires the user to be explicit when endpoint scaffolding is needed.
### Open questions
1. **`RETRIEVAL` latency budget**: The +300–500ms from the relevance LLM call may be unacceptable for the VS Code extension use case where streaming latency is user-visible. A discriminative relevance model (embedding similarity between query vector and context vector, cosine threshold) would be ~1ms and eliminate this cost entirely. Deferred to a future amendment.
1. **`RETRIEVAL` latency budget**: The +300–500ms from the relevance LLM call may be unacceptable for the VS Code extension use case where streaming latency is user-visible. A discriminative relevance model (embedding similarity between query vector and context vector, cosine threshold) would be ~1ms and eliminate this cost entirely. Deferred to a future amendment. **Trigger:** when the RETRIEVAL validation LLM call appears as a measurable contribution in Langfuse traces.
2. **`validation_status` UX**: The proto field is defined but the client behavior is not specified. What should the VS Code extension or AVS Platform display when `validation_status=true`? Requires a product decision outside this ADR's scope.
2. **`validation_status` UX**: Proto field 4 is defined and populated. What the VS Code extension or AVS Platform displays when `validation_status=INVALID_UNRESOLVED` or `PARSER_UNAVAILABLE` is not yet specified. Requires a product decision outside this ADR's scope. **Open.**
3. **Parser version pinning**: Inherited from ADR-0007 open question 2. Parser upgrades may alter what is considered valid AVAP. A policy for handling parser version changes in the production pipeline has not been defined.
3. **Parser version pinning**: Inherited from ADR-0007 open question 2. Parser upgrades may alter what is considered valid AVAP. A policy for handling parser version changes in the production pipeline has not been defined. **Open.**
4. **`validation_status` across multiple code blocks (AskAgentStream)**: When a response contains more than one code block, the `validation_status` in the final `is_final=True` message reflects only the last block validated. If an earlier block was `INVALID_UNRESOLVED` and the last was valid, the client receives `""`. In practice AVAP responses contain one code block. A future amendment may accumulate the worst-case status across all blocks. **Low priority.**
---

View File

@ -9,7 +9,7 @@
## Context
ADR-0008 Phase 2 deployed a Layer 2 embedding classifier trained on a **seed dataset of 94 hand-crafted examples**. This model works well for the initial distribution of queries but has two structural limitations:
ADR-0008 Phase 2 deployed a Layer 2 embedding classifier trained on a **seed dataset of 204 hand-crafted examples** (initially 95; expanded on 2026-04-13 with 109 Spanish-language examples — making the seed bilingual — covering all four intent categories and interrogative `CODE_GENERATION` patterns). This model works well for the initial distribution of queries but has two structural limitations:
1. **The seed dataset does not reflect production traffic.** Hand-crafted examples are idealized. Real users ask questions with typos, mixed languages, ambiguous phrasing, and domain-specific vocabulary that is not in the seed.
@ -121,9 +121,9 @@ Each retraining cycle merges the seed dataset with all accumulated production ex
```mermaid
flowchart LR
T0["Cycle 0\n94 seed examples\nCV 1.0 on seed"] -->
T1["Cycle 1\n94 + ~100 production\nreal query distribution"] -->
T2["Cycle 2\n94 + ~200 production\nincreasing coverage"] -->
T0["Cycle 0\n204 seed examples\nCV 1.0 on seed"] -->
T1["Cycle 1\n204 + ~100 production\nreal query distribution"] -->
T2["Cycle 2\n204 + ~200 production\nincreasing coverage"] -->
TN["Cycle N\nseed becomes minority\nmodel reflects production traffic"]
```

View File

@ -1,10 +1,10 @@
# Brunix Assistance Engine — Architecture Reference
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
> **Last updated:** 2026-04-10
> **Version:** 1.9.x
> **Last updated:** 2026-04-13
> **Version:** 1.7.x
> **Architect:** Rafael Ruiz (CTO, 101OBEX Corp)
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008 · ADR-0009 · ADR-0010
> **Related PRDs:** PRD-0001 · PRD-0002 · PRD-0003
---
@ -116,15 +116,16 @@ Langfuse is the exception — it has a public IP (`45.77.119.180`) and is access
|---|---|---|
| gRPC server | `server.py` | Entry point for all AI requests. Manages session store, model selection, and state initialization |
| HTTP proxy | `openai_proxy.py` | OpenAI + Ollama compatible HTTP layer. Translates REST → gRPC |
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph. Hosts L1, L2, and L3 classifier layers |
| Prompt definitions | `prompts.py` | All prompt templates in one place: classifier, reformulator, generators, platform |
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph. Hosts L1, L2, and L3 classifier layers. Implements PTVL (ADR-0009) |
| Prompt definitions | `prompts.py` | All prompt templates: classifier, reformulator, generators, platform, test generation |
| Agent state | `state.py` | `AgentState` TypedDict shared across all graph nodes |
| LLM factory | `utils/llm_factory.py` | Provider-agnostic model instantiation (Ollama, OpenAI, Anthropic, Bedrock) |
| Embedding factory | `utils/emb_factory.py` | Provider-agnostic embedding model instantiation |
| Classifier export | `utils/classifier_export.py` | Exports `classify_history_store` to labeled JSONL when threshold is reached. Data flywheel for Layer 2 retraining |
| Evaluation pipeline | `evaluate.py` | RAGAS evaluation with Claude as judge |
| Proto contract | `protos/brunix.proto` | Source of truth for the gRPC API |
| Classifier training | `scripts/pipelines/classifier/train_classifier.py` | Offline script. Embeds labeled queries with bge-m3, trains LogisticRegression, serializes model |
| Classifier training | `scripts/pipelines/classifier/retrain_pipeline.py` | Champion/Challenger retraining. Embeds queries with bge-m3, cross-validates, promotes if challenger ≥ champion |
| Classifier seed dataset | `scripts/pipelines/classifier/seed_classifier_dataset.jsonl` | 204 labeled examples across 4 categories. Bilingual (EN + ES). Anchors all retraining runs |
**Model slots:**
@ -255,6 +256,8 @@ The classifier does not receive raw conversation messages. It receives a compact
| RC-04 | `PLATFORM` and `CONVERSATIONAL` never touch Elasticsearch | Medium |
| RC-05 | `RETRIEVAL`/`CODE_GENERATION` → main model; `CONVERSATIONAL`/`PLATFORM` → conversational model | Medium |
| RC-06 | Intent history capped at 6 entries | Low |
| RC-07 | Every `CODE_GENERATION` response must be validated by parser before delivery (ADR-0009) | High |
| RC-08 | Maximum 1 retry per request regardless of type (ADR-0009) | Medium |
---
@ -272,10 +275,22 @@ flowchart TD
CL -->|CONVERSATIONAL| RC[respond_conversational]
CL -->|PLATFORM| RP[respond_platform]
RF --> RT[retrieve]
RT -->|RETRIEVAL| GE[generate]
RT -->|RETRIEVAL| CR{check_context\nrelevance}
CR -->|YES| GE[generate]
CR -->|NO| RH[reformulate_with_hint]
RH --> RT2[retrieve_retry]
RT2 --> GE
RT -->|CODE_GENERATION| GC[generate_code]
GE --> END([end])
GC --> END
GC --> GT[generate_tests\nLLM → test_inputs + test_list]
GT --> VC{validate_code\n/api/v1/execute}
VC -->|VALID| END([end])
VC -->|INVALID| GCR[generate_code_retry\ntrace-guided]
GCR --> VC2{validate_code_after_retry}
VC2 --> END
GE --> END
RC --> END
RP --> END
```
@ -468,6 +483,13 @@ class AgentState(TypedDict):
extra_context: str # base64 decoded
user_info: str # JSON: {dev_id, project_id, org_id}
use_editor_context: bool # set by classifier
# PTVL — Per-Type Validation Layer (ADR-0009)
parser_trace: str # runtime error from first validation (empty if valid)
validation_status: str # "" | "INVALID_UNRESOLVED" | "PARSER_UNAVAILABLE"
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
test_inputs: dict # variables injected when executing generated code
test_list: list # regex assertions validated against output variables
```
---

BIN
scripts/.DS_Store vendored

Binary file not shown.

Binary file not shown.

View File

@ -93,3 +93,112 @@
{"query": "What is my current monthly spend on AVAP Cloud?", "type": "PLATFORM"}
{"query": "How do I add more API capacity to my plan?", "type": "PLATFORM"}
{"query": "Your project usage percentage is critically high at 98%", "type": "PLATFORM"}
{"query": "Write an AVAP script that reads a URL parameter and returns it in the response", "type": "CODE_GENERATION"}
{"query": "Generate AVAP code that receives a number and returns its square", "type": "CODE_GENERATION"}
{"query": "Create an AVAP endpoint that accepts a name parameter and returns a greeting", "type": "CODE_GENERATION"}
{"query": "Write AVAP code that checks if a variable is null and returns an error", "type": "CODE_GENERATION"}
{"query": "Generate an AVAP function that concatenates two string parameters", "type": "CODE_GENERATION"}
{"query": "como seria una api que reciba un parametro y lo devuelva como respuesta?", "type": "CODE_GENERATION"}
{"query": "como haria un script en AVAP que llame a una base de datos?", "type": "CODE_GENERATION"}
{"query": "puedes escribir un endpoint que reciba un JSON y devuelva un campo especifico?", "type": "CODE_GENERATION"}
{"query": "muéstrame como hacer una api que sume dos numeros", "type": "CODE_GENERATION"}
{"query": "necesito un script AVAP que lea un parametro de la URL", "type": "CODE_GENERATION"}
{"query": "como se hace un endpoint en AVAP que devuelva un array?", "type": "CODE_GENERATION"}
{"query": "dame el codigo para hacer una llamada HTTP externa desde AVAP", "type": "CODE_GENERATION"}
{"query": "como implementaria un loop que recorra una lista en AVAP?", "type": "CODE_GENERATION"}
{"query": "podrias generar un ejemplo de manejo de errores en AVAP?", "type": "CODE_GENERATION"}
{"query": "como seria el codigo para validar que un parametro no este vacio?", "type": "CODE_GENERATION"}
{"query": "necesito que me hagas un script que devuelva hola mundo", "type": "CODE_GENERATION"}
{"query": "puedes hacer un endpoint POST que guarde datos en la base de datos?", "type": "CODE_GENERATION"}
{"query": "como haria para recibir varios parametros y devolver uno procesado?", "type": "CODE_GENERATION"}
{"query": "dame un ejemplo de codigo AVAP que use addParam y addResult", "type": "CODE_GENERATION"}
{"query": "como se implementa un try catch en AVAP? muéstrame el codigo", "type": "CODE_GENERATION"}
{"query": "escríbeme una funcion AVAP que calcule el total de una lista", "type": "CODE_GENERATION"}
{"query": "como quedaria una api que reciba nombre y apellido y devuelva el nombre completo?", "type": "CODE_GENERATION"}
{"query": "genera un endpoint AVAP que devuelva el status 404 si no encuentra el recurso", "type": "CODE_GENERATION"}
{"query": "como haria una api que haga una peticion GET a otro servicio?", "type": "CODE_GENERATION"}
{"query": "puedes generarme codigo AVAP que itere sobre un JSON?", "type": "CODE_GENERATION"}
{"query": "como seria un script que asigne variables y las devuelva en la respuesta?", "type": "CODE_GENERATION"}
{"query": "necesito codigo que reciba un id y devuelva un objeto con ese id", "type": "CODE_GENERATION"}
{"query": "como implemento un condicional en AVAP? ponme un ejemplo", "type": "CODE_GENERATION"}
{"query": "podrias escribir un endpoint que concatene dos parametros?", "type": "CODE_GENERATION"}
{"query": "como haria para devolver un error personalizado desde AVAP?", "type": "CODE_GENERATION"}
{"query": "que seria el codigo minimo para crear un endpoint GET en AVAP?", "type": "CODE_GENERATION"}
{"query": "muéstrame un script AVAP que use un bucle para procesar una lista", "type": "CODE_GENERATION"}
{"query": "como quedaria un API que reciba un token y lo valide?", "type": "CODE_GENERATION"}
{"query": "genera el codigo para un endpoint que devuelva la fecha actual", "type": "CODE_GENERATION"}
{"query": "como se escribe una funcion reutilizable en AVAP?", "type": "CODE_GENERATION"}
{"query": "que es addVar en AVAP?", "type": "RETRIEVAL"}
{"query": "para que sirve addResult en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el comando addParam?", "type": "RETRIEVAL"}
{"query": "cual es la diferencia entre addVar y addObject en AVAP?", "type": "RETRIEVAL"}
{"query": "que hace startLoop en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el manejo de errores en AVAP?", "type": "RETRIEVAL"}
{"query": "que es un conector en AVAP Cloud?", "type": "RETRIEVAL"}
{"query": "cual es el modelo de ejecucion de AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el enrutamiento en AVAP?", "type": "RETRIEVAL"}
{"query": "para que se usa el nodo de pipeline en AVAP?", "type": "RETRIEVAL"}
{"query": "que tipos de datos soporta AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona el if en AVAP? cuantos modos tiene?", "type": "RETRIEVAL"}
{"query": "que diferencia hay entre addRow y addObject?", "type": "RETRIEVAL"}
{"query": "como se definen funciones reutilizables en AVAP?", "type": "RETRIEVAL"}
{"query": "que es el catalogo en AVAP Cloud?", "type": "RETRIEVAL"}
{"query": "como se gestionan las variables de entorno en AVAP?", "type": "RETRIEVAL"}
{"query": "para que sirve el nodo return en un pipeline?", "type": "RETRIEVAL"}
{"query": "como maneja AVAP las transformaciones JSON?", "type": "RETRIEVAL"}
{"query": "que opciones de logging tiene AVAP?", "type": "RETRIEVAL"}
{"query": "como se conecta AVAP a una API REST externa?", "type": "RETRIEVAL"}
{"query": "que es una API virtual en AVAP?", "type": "RETRIEVAL"}
{"query": "como funciona la autenticacion en las llamadas de AVAP?", "type": "RETRIEVAL"}
{"query": "como se pagina resultados en una API AVAP?", "type": "RETRIEVAL"}
{"query": "que es endLoop y cuando se usa?", "type": "RETRIEVAL"}
{"query": "como funciona el exception en AVAP?", "type": "RETRIEVAL"}
{"query": "no entendi lo que dijiste", "type": "CONVERSATIONAL"}
{"query": "puedes explicarlo de otra manera?", "type": "CONVERSATIONAL"}
{"query": "eso no es lo que pregunte", "type": "CONVERSATIONAL"}
{"query": "en pocas palabras, que quisiste decir?", "type": "CONVERSATIONAL"}
{"query": "puedes resumirlo en un parrafo?", "type": "CONVERSATIONAL"}
{"query": "repite lo que dijiste pero mas claro", "type": "CONVERSATIONAL"}
{"query": "no entendi la segunda parte", "type": "CONVERSATIONAL"}
{"query": "puedes darme una analogia?", "type": "CONVERSATIONAL"}
{"query": "dimelo mas simple", "type": "CONVERSATIONAL"}
{"query": "vuelve a lo que dijiste sobre los conectores", "type": "CONVERSATIONAL"}
{"query": "dame un resumen en puntos de tu respuesta anterior", "type": "CONVERSATIONAL"}
{"query": "estas seguro de eso?", "type": "CONVERSATIONAL"}
{"query": "que quisiste decir con eso?", "type": "CONVERSATIONAL"}
{"query": "puedes ser mas especifico?", "type": "CONVERSATIONAL"}
{"query": "amplia la parte del manejo de errores", "type": "CONVERSATIONAL"}
{"query": "dilo en una sola frase", "type": "CONVERSATIONAL"}
{"query": "cual era el ejemplo que mencionaste antes?", "type": "CONVERSATIONAL"}
{"query": "no lo entiendo, intenta con otra explicacion", "type": "CONVERSATIONAL"}
{"query": "y en la practica, como se veria eso?", "type": "CONVERSATIONAL"}
{"query": "explicamelo paso a paso", "type": "CONVERSATIONAL"}
{"query": "compara las dos opciones que describiste", "type": "CONVERSATIONAL"}
{"query": "lo que dijiste al principio sobre pipelines, repitemelo", "type": "CONVERSATIONAL"}
{"query": "reformula tu respuesta enfocandote en el rendimiento", "type": "CONVERSATIONAL"}
{"query": "tengo dudas sobre lo que explicaste, puedes profundizar?", "type": "CONVERSATIONAL"}
{"query": "cuantas llamadas llevo este mes?", "type": "PLATFORM"}
{"query": "cual es mi plan actual?", "type": "PLATFORM"}
{"query": "cuando se renueva mi suscripcion?", "type": "PLATFORM"}
{"query": "cuanto he consumido de mi cuota?", "type": "PLATFORM"}
{"query": "cuantas llamadas me quedan disponibles?", "type": "PLATFORM"}
{"query": "estoy cerca del limite de mi plan?", "type": "PLATFORM"}
{"query": "que pasa si supero mi cuota mensual?", "type": "PLATFORM"}
{"query": "puedo ampliar mi plan actual?", "type": "PLATFORM"}
{"query": "muéstrame las estadisticas de uso de mi cuenta", "type": "PLATFORM"}
{"query": "cuantos proyectos puedo crear con mi plan?", "type": "PLATFORM"}
{"query": "tu proyecto tiene un 85% de uso, que recomiendas?", "type": "PLATFORM"}
{"query": "llevas 4500 llamadas API este mes de un limite de 5000", "type": "PLATFORM"}
{"query": "tu cuenta ha superado el 90% del limite mensual", "type": "PLATFORM"}
{"query": "cual es mi gasto mensual en AVAP Cloud?", "type": "PLATFORM"}
{"query": "como agrego mas capacidad a mi plan?", "type": "PLATFORM"}
{"query": "tu prueba gratuita expira en 2 dias", "type": "PLATFORM"}
{"query": "tu consumo esta semana es el doble que la semana pasada", "type": "PLATFORM"}
{"query": "estado de mi cuenta en AVAP Cloud", "type": "PLATFORM"}
{"query": "cuales son mis detalles de facturacion?", "type": "PLATFORM"}
{"query": "me queda poco de mi cuota, que hago?", "type": "PLATFORM"}
{"query": "tu proyecto ha usado el 60% de los recursos asignados", "type": "PLATFORM"}
{"query": "cuando caduca mi suscripcion actual?", "type": "PLATFORM"}
{"query": "cuanto cuesta pasar al plan profesional?", "type": "PLATFORM"}
{"query": "hay alguna alerta de uso en mi cuenta?", "type": "PLATFORM"}
{"query": "tu limite diario de API calls es de 1000 y ya llevas 950", "type": "PLATFORM"}

View File

@ -552,8 +552,9 @@ def run_map_elites(args, client, lrm, output_path):
code = problem["code"]
test_list = problem.get("test_list", [])
print("#######################################")
is_valid, ast, error_msg = validator.parse(code)
print("#######################################")
if is_valid is None:
is_valid, ast = True, {}
if call_count == 1: