From b29c1214f4f71ac496e37c2ba0f458aa3dfd9427 Mon Sep 17 00:00:00 2001 From: Rafael Ruiz Date: Tue, 10 Feb 2026 07:46:58 -0800 Subject: [PATCH 1/3] Update role for Engine in component overview --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6107db0..fb50632 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ The engine will be listening for gRPC requests on port `50052`. | Service | Container Name | Description | Role | | :--- | :--- | :--- | :--- | -| **Engine** | `brunix-assistance-engine` | The AVAP-powered brain. | 101OBEX Corp | +| **Engine** | `brunix-assistance-engine` | The AVAP-powered brain. | Engine | | **Vector DB** | `brunix-vector-db` | Elasticsearch instance (Knowledge Base). | Training Support | | **Observability** | `brunix-observability` | Langfuse UI (Tracing & Costs). | System Quality | | **System DB** | `brunix-postgres` | Internal storage for Langfuse. | Infrastructure | @@ -145,4 +145,4 @@ graph LR gRPC --> Engine Sec -.->|Injected as Env| Engine Engine <--> DB -``` \ No newline at end of file +``` From 1daac66f8906c4212da37b51705d0a324f8f3f99 Mon Sep 17 00:00:00 2001 From: rafa-ruiz Date: Wed, 18 Mar 2026 18:55:48 -0700 Subject: [PATCH 2/3] UPGRADE: New RAG functional --- Docker/Dockerfile | 10 +- Docker/docker-compose.yaml | 2 + Docker/entrypoint.sh | 30 +++ Docker/protos/brunix.proto | 54 ++++- Docker/requirements.txt | 7 + Docker/src/evaluate.py | 230 ++++++++++++++++++++ Docker/src/graph.py | 371 ++++++++++++++++++++++++++++++-- Docker/src/openai_proxy.py | 420 +++++++++++++++++++++++++++++++++++++ Docker/src/prompts.py | 289 +++++++++++++++++++------ Docker/src/server.py | 202 +++++++++++++++--- Docker/src/state.py | 8 +- 11 files changed, 1501 insertions(+), 122 deletions(-) create mode 100644 Docker/entrypoint.sh create mode 100644 Docker/src/evaluate.py create mode 100644 Docker/src/openai_proxy.py diff --git a/Docker/Dockerfile b/Docker/Dockerfile index 
240cf50..4166505 100644 --- a/Docker/Dockerfile +++ b/Docker/Dockerfile @@ -10,7 +10,7 @@ COPY ./requirements.txt . RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ curl \ - protobuf-compiler \ + protobuf-compiler \ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir --upgrade pip @@ -25,6 +25,10 @@ RUN python -m grpc_tools.protoc \ --grpc_python_out=./src \ ./protos/brunix.proto -EXPOSE 50051 +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh -CMD ["python", "src/server.py"] \ No newline at end of file +EXPOSE 50051 +EXPOSE 8000 + +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/Docker/docker-compose.yaml b/Docker/docker-compose.yaml index a1df028..7716065 100644 --- a/Docker/docker-compose.yaml +++ b/Docker/docker-compose.yaml @@ -6,6 +6,7 @@ services: container_name: brunix-assistance-engine ports: - "50052:50051" + - "8000:8000" environment: ELASTICSEARCH_URL: ${ELASTICSEARCH_URL} ELASTICSEARCH_INDEX: ${ELASTICSEARCH_INDEX} @@ -16,6 +17,7 @@ services: OLLAMA_URL: ${OLLAMA_URL} OLLAMA_MODEL_NAME: ${OLLAMA_MODEL_NAME} OLLAMA_EMB_MODEL_NAME: ${OLLAMA_EMB_MODEL_NAME} + PROXY_THREAD_WORKERS: 10 extra_hosts: - "host.docker.internal:host-gateway" diff --git a/Docker/entrypoint.sh b/Docker/entrypoint.sh new file mode 100644 index 0000000..4e27203 --- /dev/null +++ b/Docker/entrypoint.sh @@ -0,0 +1,30 @@ +#!/bin/sh +set -e + +echo "[entrypoint] Starting Brunix Engine (gRPC :50051)..." +python src/server.py & +ENGINE_PID=$! + +echo "[entrypoint] Starting OpenAI Proxy (HTTP :8000)..." +uvicorn openai_proxy:app --host 0.0.0.0 --port 8000 --workers 4 --app-dir src & +PROXY_PID=$! + +wait_any() { + while kill -0 $ENGINE_PID 2>/dev/null && kill -0 $PROXY_PID 2>/dev/null; do + sleep 2 + done + + if ! kill -0 $ENGINE_PID 2>/dev/null; then + echo "[entrypoint] Engine died — stopping proxy" + kill $PROXY_PID 2>/dev/null + exit 1 + fi + + if ! 
kill -0 $PROXY_PID 2>/dev/null; then + echo "[entrypoint] Proxy died — stopping engine" + kill $ENGINE_PID 2>/dev/null + exit 1 + fi +} + +wait_any \ No newline at end of file diff --git a/Docker/protos/brunix.proto b/Docker/protos/brunix.proto index 420662b..adde716 100644 --- a/Docker/protos/brunix.proto +++ b/Docker/protos/brunix.proto @@ -3,16 +3,60 @@ syntax = "proto3"; package brunix; service AssistanceEngine { - rpc AskAgent (AgentRequest) returns (stream AgentResponse); + // Respuesta completa — compatible con clientes existentes + rpc AskAgent (AgentRequest) returns (stream AgentResponse); + + // Streaming real token a token desde Ollama + rpc AskAgentStream (AgentRequest) returns (stream AgentResponse); + + // Evaluación RAGAS con Claude como juez + rpc EvaluateRAG (EvalRequest) returns (EvalResponse); } +// --------------------------------------------------------------------------- +// AskAgent / AskAgentStream — mismos mensajes, dos comportamientos +// --------------------------------------------------------------------------- + message AgentRequest { - string query = 1; - string session_id = 2; + string query = 1; + string session_id = 2; } message AgentResponse { - string text = 1; + string text = 1; string avap_code = 2; - bool is_final = 3; + bool is_final = 3; +} + +// --------------------------------------------------------------------------- +// EvaluateRAG +// --------------------------------------------------------------------------- + +message EvalRequest { + string category = 1; + int32 limit = 2; + string index = 3; +} + +message EvalResponse { + string status = 1; + int32 questions_evaluated = 2; + float elapsed_seconds = 3; + string judge_model = 4; + string index = 5; + float faithfulness = 6; + float answer_relevancy = 7; + float context_recall = 8; + float context_precision = 9; + float global_score = 10; + string verdict = 11; + repeated QuestionDetail details = 12; +} + +message QuestionDetail { + string id = 1; + string category = 2; 
+ string question = 3; + string answer_preview = 4; + int32 n_chunks = 5; } diff --git a/Docker/requirements.txt b/Docker/requirements.txt index 78c9ce1..5ff3ce9 100644 --- a/Docker/requirements.txt +++ b/Docker/requirements.txt @@ -316,3 +316,10 @@ yarl==1.22.0 # via aiohttp zstandard==0.25.0 # via langsmith + +ragas +datasets +langchain-anthropic + +fastapi>=0.111.0 +uvicorn[standard]>=0.29.0 \ No newline at end of file diff --git a/Docker/src/evaluate.py b/Docker/src/evaluate.py new file mode 100644 index 0000000..791f9fb --- /dev/null +++ b/Docker/src/evaluate.py @@ -0,0 +1,230 @@ +import os +import time +import json +import logging +from collections import defaultdict +from pathlib import Path +from typing import Optional +from ragas import evaluate as ragas_evaluate +from ragas.metrics import ( faithfulness, answer_relevancy, context_recall, context_precision,) +from ragas.llms import LangchainLLMWrapper +from ragas.embeddings import LangchainEmbeddingsWrapper +from datasets import Dataset +from langchain_anthropic import ChatAnthropic + +logger = logging.getLogger(__name__) + +GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json" +CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514") +ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") +K_RETRIEVE = 5 + + + +ANTHROPIC_AVAILABLE = True + + +from elasticsearch import Elasticsearch +from langchain_core.messages import SystemMessage, HumanMessage + +def retrieve_context( es_client, embeddings, question, index, k = K_RETRIEVE,): + + query_vector = None + try: + query_vector = embeddings.embed_query(question) + except Exception as e: + logger.warning(f"[eval] embed_query fails: {e}") + + bm25_hits = [] + try: + resp = es_client.search( + index=index, + body={ + "size": k, + "query": { + "multi_match": { + "query": question, + "fields": ["content^2", "text^2"], + "type": "best_fields", + "fuzziness": "AUTO", + } + }, + "_source": {"excludes": ["embedding"]}, + } + ) + bm25_hits = 
resp["hits"]["hits"] + except Exception as e: + logger.warning(f"[eval] BM25 fails: {e}") + + knn_hits = [] + if query_vector: + try: + resp = es_client.search( + index=index, + body={ + "size": k, + "knn": { + "field": "embedding", + "query_vector": query_vector, + "k": k, + "num_candidates": k * 5, + }, + "_source": {"excludes": ["embedding"]}, + } + ) + knn_hits = resp["hits"]["hits"] + except Exception as e: + logger.warning(f"[eval] kNN falló: {e}") + + rrf_scores: dict[str, float] = defaultdict(float) + hit_by_id: dict[str, dict] = {} + + for rank, hit in enumerate(bm25_hits): + doc_id = hit["_id"] + rrf_scores[doc_id] += 1.0 / (rank + 60) + hit_by_id[doc_id] = hit + + for rank, hit in enumerate(knn_hits): + doc_id = hit["_id"] + rrf_scores[doc_id] += 1.0 / (rank + 60) + if doc_id not in hit_by_id: + hit_by_id[doc_id] = hit + + ranked = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:k] + + return [ + hit_by_id[doc_id]["_source"].get("content") + or hit_by_id[doc_id]["_source"].get("text", "") + for doc_id, _ in ranked + if ( + hit_by_id[doc_id]["_source"].get("content") + or hit_by_id[doc_id]["_source"].get("text", "") + ).strip() + ] + + +def generate_answer(llm, question: str, contexts: list[str]) -> str: + try: + from prompts import GENERATE_PROMPT + context_text = "\n\n".join( + f"[{i+1}] {ctx}" for i, ctx in enumerate(contexts) + ) + prompt = SystemMessage( + content=GENERATE_PROMPT.content.format(context=context_text) + ) + resp = llm.invoke([prompt, HumanMessage(content=question)]) + return resp.content.strip() + except Exception as e: + logger.warning(f"[eval] generate_answer fails: {e}") + return "" + +def run_evaluation( es_client, llm, embeddings, index_name, category = None, limit = None,): + + if not ANTHROPIC_AVAILABLE: + return {"error": "langchain-anthropic no instalado. 
pip install langchain-anthropic"} + if not ANTHROPIC_API_KEY: + return {"error": "ANTHROPIC_API_KEY no configurada en .env"} + if not GOLDEN_DATASET_PATH.exists(): + return {"error": f"Golden dataset no encontrado en {GOLDEN_DATASET_PATH}"} + + + questions = json.loads(GOLDEN_DATASET_PATH.read_text(encoding="utf-8")) + if category: + questions = [q for q in questions if q.get("category") == category] + if limit: + questions = questions[:limit] + if not questions: + return {"error": "NO QUESTIONS WITH THIS FILTERS"} + + logger.info(f"[eval] makind: {len(questions)} questions, index={index_name}") + + claude_judge = ChatAnthropic( + model=CLAUDE_MODEL, + api_key=ANTHROPIC_API_KEY, + temperature=0, + max_tokens=2048, + ) + + rows = {"question": [], "answer": [], "contexts": [], "ground_truth": []} + details = [] + t_start = time.time() + + for item in questions: + q_id = item["id"] + question = item["question"] + gt = item["ground_truth"] + + logger.info(f"[eval] {q_id}: {question[:60]}") + + contexts = retrieve_context(es_client, embeddings, question, index_name) + if not contexts: + logger.warning(f"[eval] No context for {q_id} — skipping") + continue + + answer = generate_answer(llm, question, contexts) + if not answer: + logger.warning(f"[eval] No answers for {q_id} — skipping") + continue + + rows["question"].append(question) + rows["answer"].append(answer) + rows["contexts"].append(contexts) + rows["ground_truth"].append(gt) + + details.append({ + "id": q_id, + "category": item.get("category", ""), + "question": question, + "answer_preview": answer[:300], + "n_chunks": len(contexts), + }) + + if not rows["question"]: + return {"error": "NO SAMPLES GENETARED"} + + dataset = Dataset.from_dict(rows) + ragas_llm = LangchainLLMWrapper(claude_judge) + ragas_emb = LangchainEmbeddingsWrapper(embeddings) + + metrics = [faithfulness, answer_relevancy, context_recall, context_precision] + for metric in metrics: + metric.llm = ragas_llm + if hasattr(metric, "embeddings"): + 
metric.embeddings = ragas_emb + + logger.info("[eval] JUDGING BY CLAUDE...") + result = ragas_evaluate(dataset, metrics=metrics) + + elapsed = time.time() - t_start + + scores = { + "faithfulness": round(float(result.get("faithfulness", 0)), 4), + "answer_relevancy": round(float(result.get("answer_relevancy", 0)), 4), + "context_recall": round(float(result.get("context_recall", 0)), 4), + "context_precision": round(float(result.get("context_precision", 0)), 4), + } + + valid_scores = [v for v in scores.values() if v > 0] + global_score = round(sum(valid_scores) / len(valid_scores), 4) if valid_scores else 0.0 + + verdict = ( + "EXCELLENT" if global_score >= 0.8 else + "ACCEPTABLE" if global_score >= 0.6 else + "INSUFFICIENT" + ) + + logger.info(f"[eval] FINISHED — global={global_score} verdict={verdict} " + f"elapsed={elapsed:.0f}s") + + return { + "status": "ok", + "questions_evaluated": len(rows["question"]), + "elapsed_seconds": round(elapsed, 1), + "judge_model": CLAUDE_MODEL, + "index": index_name, + "category_filter": category or "all", + "scores": scores, + "global_score": global_score, + "verdict": verdict, + "details": details, + } \ No newline at end of file diff --git a/Docker/src/graph.py b/Docker/src/graph.py index 0ee0cf5..b092e74 100644 --- a/Docker/src/graph.py +++ b/Docker/src/graph.py @@ -1,60 +1,391 @@ -# graph.py import logging - +from collections import defaultdict +from elasticsearch import Elasticsearch from langchain_core.documents import Document -from langchain_core.messages import SystemMessage +from langchain_core.messages import AIMessage, SystemMessage, HumanMessage, BaseMessage from langgraph.graph import END, StateGraph from langgraph.graph.state import CompiledStateGraph -from prompts import GENERATE_PROMPT, REFORMULATE_PROMPT + +from prompts import ( + CLASSIFY_PROMPT_TEMPLATE, + CODE_GENERATION_PROMPT, + CONVERSATIONAL_PROMPT, + GENERATE_PROMPT, + REFORMULATE_PROMPT, +) + from state import AgentState logger = 
logging.getLogger(__name__) +session_store: dict[str, list] = defaultdict(list) -def format_context(docs: list[Document]) -> str: +def format_context(docs): chunks = [] for i, doc in enumerate(docs, 1): - source = (doc.metadata or {}).get("source", "Untitled") - source_id = (doc.metadata or {}).get("id", f"chunk-{i}") - text = doc.page_content or "" - chunks.append(f"[{i}] id={source_id} source={source}\n{text}") + meta = doc.metadata or {} + chunk_id = meta.get("chunk_id", meta.get("id", f"chunk-{i}")) + source = meta.get("source_file", meta.get("source", "unknown")) + doc_type = meta.get("doc_type", "") + block_type = meta.get("block_type", "") + section = meta.get("section", "") + + text = (doc.page_content or "").strip() + if not text: + text = meta.get("content") or meta.get("text") or "" + + header_parts = [f"[{i}]", f"id={chunk_id}"] + if doc_type: header_parts.append(f"type={doc_type}") + if block_type: header_parts.append(f"block={block_type}") + if section: header_parts.append(f"section={section}") + header_parts.append(f"source={source}") + + if doc_type in ("code", "code_example", "bnf") or \ + block_type in ("function", "if", "startLoop", "try"): + header_parts.append("[AVAP CODE]") + + chunks.append(" ".join(header_parts) + "\n" + text) + return "\n\n".join(chunks) -def build_graph(llm, vector_store) -> CompiledStateGraph: +def format_history_for_classify(messages): + lines = [] + for msg in messages[-6:]: + if isinstance(msg, HumanMessage): + lines.append(f"User: {msg.content}") + elif isinstance(msg, AIMessage): + lines.append(f"Assistant: {msg.content[:300]}") + elif isinstance(msg, dict): + role = msg.get("role", "user") + content = msg.get("content", "")[:300] + lines.append(f"{role.capitalize()}: {content}") + return "\n".join(lines) if lines else "(no history)" + + +def hybrid_search_native(es_client, embeddings, query, index_name, k=8): + query_vector = None + try: + query_vector = embeddings.embed_query(query) + except Exception as e: + 
logger.warning(f"[hybrid] embed_query fails: {e}") + + bm25_hits = [] + try: + resp = es_client.search( + index=index_name, + body={ + "size": k, + "query": { + "multi_match": { + "query": query, + "fields": ["content^2", "text^2"], + "type": "best_fields", + "fuzziness": "AUTO", + } + }, + "_source": {"excludes": ["embedding"]}, + } + ) + bm25_hits = resp["hits"]["hits"] + logger.info(f"[hybrid] BM25 -> {len(bm25_hits)} hits") + except Exception as e: + logger.warning(f"[hybrid] BM25 fails: {e}") + + knn_hits = [] + if query_vector: + try: + resp = es_client.search( + index=index_name, + body={ + "size": k, + "knn": { + "field": "embedding", + "query_vector": query_vector, + "k": k, + "num_candidates": k * 5, + }, + "_source": {"excludes": ["embedding"]}, + } + ) + knn_hits = resp["hits"]["hits"] + logger.info(f"[hybrid] kNN -> {len(knn_hits)} hits") + except Exception as e: + logger.warning(f"[hybrid] kNN fails: {e}") + + rrf_scores: dict[str, float] = defaultdict(float) + hit_by_id: dict[str, dict] = {} + + for rank, hit in enumerate(bm25_hits): + doc_id = hit["_id"] + rrf_scores[doc_id] += 1.0 / (rank + 60) + hit_by_id[doc_id] = hit + + for rank, hit in enumerate(knn_hits): + doc_id = hit["_id"] + rrf_scores[doc_id] += 1.0 / (rank + 60) + if doc_id not in hit_by_id: + hit_by_id[doc_id] = hit + + ranked = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:k] + + docs = [] + for doc_id, score in ranked: + src = hit_by_id[doc_id]["_source"] + text = src.get("content") or src.get("text") or "" + meta = {k: v for k, v in src.items() + if k not in ("content", "text", "embedding")} + meta["id"]= doc_id + meta["rrf_score"] = score + docs.append(Document(page_content=text, metadata=meta)) + + logger.info(f"[hybrid] RRF -> {len(docs)} final docs") + return docs + +def build_graph(llm, embeddings, es_client, index_name): + + def _persist(state: AgentState, response: BaseMessage): + session_id = state.get("session_id", "") + if session_id: + 
session_store[session_id] = list(state["messages"]) + [response] + + def classify(state): + messages = state["messages"] + user_msg = messages[-1] + question = getattr(user_msg, "content", + user_msg.get("content", "") + if isinstance(user_msg, dict) else "") + history_msgs = messages[:-1] + + if not history_msgs: + prompt_content = ( + CLASSIFY_PROMPT_TEMPLATE + .replace("{history}", "(no history)") + .replace("{message}", question) + ) + resp = llm.invoke([SystemMessage(content=prompt_content)]) + raw = resp.content.strip().upper() + query_type = _parse_query_type(raw) + logger.info(f"[classify] no historic content raw='{raw}' -> {query_type}") + return {"query_type": query_type} + + history_text = format_history_for_classify(history_msgs) + prompt_content = ( + CLASSIFY_PROMPT_TEMPLATE + .replace("{history}", history_text) + .replace("{message}", question) + ) + resp = llm.invoke([SystemMessage(content=prompt_content)]) + raw = resp.content.strip().upper() + query_type = _parse_query_type(raw) + logger.info(f"[classify] raw='{raw}' -> {query_type}") + return {"query_type": query_type} + + def _parse_query_type(raw: str) -> str: + if raw.startswith("CODE_GENERATION") or "CODE" in raw: + return "CODE_GENERATION" + if raw.startswith("CONVERSATIONAL"): + return "CONVERSATIONAL" + return "RETRIEVAL" + def reformulate(state: AgentState) -> AgentState: user_msg = state["messages"][-1] resp = llm.invoke([REFORMULATE_PROMPT, user_msg]) reformulated = resp.content.strip() - logger.info(f"[reformulate] '{user_msg.content}' → '{reformulated}'") + logger.info(f"[reformulate] -> '{reformulated}'") return {"reformulated_query": reformulated} def retrieve(state: AgentState) -> AgentState: query = state["reformulated_query"] - docs = vector_store.as_retriever( - search_type="similarity", - search_kwargs={"k": 3}, - ).invoke(query) + docs = hybrid_search_native( + es_client=es_client, + embeddings=embeddings, + query=query, + index_name=index_name, + k=8, + ) context = 
format_context(docs) - logger.info(f"[retrieve] {len(docs)} docs fetched") - logger.info(context) + logger.info(f"[retrieve] {len(docs)} docs, context len={len(context)}") return {"context": context} - def generate(state: AgentState) -> AgentState: + def generate(state): prompt = SystemMessage( content=GENERATE_PROMPT.content.format(context=state["context"]) ) resp = llm.invoke([prompt] + state["messages"]) + logger.info(f"[generate] {len(resp.content)} chars") + _persist(state, resp) return {"messages": [resp]} + def generate_code(state): + prompt = SystemMessage( + content=CODE_GENERATION_PROMPT.content.format(context=state["context"]) + ) + resp = llm.invoke([prompt] + state["messages"]) + logger.info(f"[generate_code] {len(resp.content)} chars") + _persist(state, resp) + return {"messages": [resp]} + + def respond_conversational(state): + resp = llm.invoke([CONVERSATIONAL_PROMPT] + state["messages"]) + logger.info("[conversational] from comversation") + _persist(state, resp) + return {"messages": [resp]} + + def route_by_type(state): + return state.get("query_type", "RETRIEVAL") + + def route_after_retrieve(state): + qt = state.get("query_type", "RETRIEVAL") + return "generate_code" if qt == "CODE_GENERATION" else "generate" + graph_builder = StateGraph(AgentState) + + graph_builder.add_node("classify", classify) graph_builder.add_node("reformulate", reformulate) graph_builder.add_node("retrieve", retrieve) graph_builder.add_node("generate", generate) + graph_builder.add_node("generate_code", generate_code) + graph_builder.add_node("respond_conversational", respond_conversational) + + graph_builder.set_entry_point("classify") + + graph_builder.add_conditional_edges( + "classify", + route_by_type, + { + "RETRIEVAL": "reformulate", + "CODE_GENERATION": "reformulate", + "CONVERSATIONAL": "respond_conversational", + } + ) - graph_builder.set_entry_point("reformulate") graph_builder.add_edge("reformulate", "retrieve") - graph_builder.add_edge("retrieve", "generate") 
+ + graph_builder.add_conditional_edges( + "retrieve", + route_after_retrieve, + { + "generate": "generate", + "generate_code": "generate_code", + } + ) + graph_builder.add_edge("generate", END) + graph_builder.add_edge("generate_code", END) + graph_builder.add_edge("respond_conversational", END) return graph_builder.compile() + + +def build_prepare_graph(llm, embeddings, es_client, index_name): + + def classify(state): + messages = state["messages"] + user_msg = messages[-1] + question = getattr(user_msg, "content", + user_msg.get("content", "") + if isinstance(user_msg, dict) else "") + history_msgs = messages[:-1] + + if not history_msgs: + prompt_content = ( + CLASSIFY_PROMPT_TEMPLATE + .replace("{history}", "(no history)") + .replace("{message}", question) + ) + resp = llm.invoke([SystemMessage(content=prompt_content)]) + raw = resp.content.strip().upper() + query_type = _parse_query_type(raw) + logger.info(f"[prepare/classify] no history raw='{raw}' -> {query_type}") + return {"query_type": query_type} + + history_text = format_history_for_classify(history_msgs) + prompt_content = ( + CLASSIFY_PROMPT_TEMPLATE + .replace("{history}", history_text) + .replace("{message}", question) + ) + resp = llm.invoke([SystemMessage(content=prompt_content)]) + raw = resp.content.strip().upper() + query_type = _parse_query_type(raw) + logger.info(f"[prepare/classify] raw='{raw}' -> {query_type}") + return {"query_type": query_type} + + def _parse_query_type(raw: str) -> str: + if raw.startswith("CODE_GENERATION") or "CODE" in raw: + return "CODE_GENERATION" + if raw.startswith("CONVERSATIONAL"): + return "CONVERSATIONAL" + return "RETRIEVAL" + + def reformulate(state: AgentState) -> AgentState: + user_msg = state["messages"][-1] + resp = llm.invoke([REFORMULATE_PROMPT, user_msg]) + reformulated = resp.content.strip() + logger.info(f"[prepare/reformulate] -> '{reformulated}'") + return {"reformulated_query": reformulated} + + def retrieve(state: AgentState) -> AgentState: + 
query = state["reformulated_query"] + docs = hybrid_search_native( + es_client=es_client, + embeddings=embeddings, + query=query, + index_name=index_name, + k=8, + ) + context = format_context(docs) + logger.info(f"[prepare/retrieve] {len(docs)} docs, context len={len(context)}") + return {"context": context} + + def skip_retrieve(state: AgentState) -> AgentState: + return {"context": ""} + + def route_by_type(state): + return state.get("query_type", "RETRIEVAL") + + graph_builder = StateGraph(AgentState) + + graph_builder.add_node("classify", classify) + graph_builder.add_node("reformulate", reformulate) + graph_builder.add_node("retrieve", retrieve) + graph_builder.add_node("skip_retrieve", skip_retrieve) + + graph_builder.set_entry_point("classify") + + graph_builder.add_conditional_edges( + "classify", + route_by_type, + { + "RETRIEVAL": "reformulate", + "CODE_GENERATION": "reformulate", + "CONVERSATIONAL": "skip_retrieve", + } + ) + + graph_builder.add_edge("reformulate", "retrieve") + graph_builder.add_edge("retrieve", END) + graph_builder.add_edge("skip_retrieve",END) + + return graph_builder.compile() + + +def build_final_messages(state: AgentState) -> list: + query_type = state.get("query_type", "RETRIEVAL") + context = state.get("context", "") + messages = state.get("messages", []) + + if query_type == "CONVERSATIONAL": + return [CONVERSATIONAL_PROMPT] + messages + + if query_type == "CODE_GENERATION": + prompt = SystemMessage( + content=CODE_GENERATION_PROMPT.content.format(context=context) + ) + else: + prompt = SystemMessage( + content=GENERATE_PROMPT.content.format(context=context) + ) + + return [prompt] + messages \ No newline at end of file diff --git a/Docker/src/openai_proxy.py b/Docker/src/openai_proxy.py new file mode 100644 index 0000000..157f609 --- /dev/null +++ b/Docker/src/openai_proxy.py @@ -0,0 +1,420 @@ +import json +import os +import time +import uuid +import logging +import asyncio +import concurrent.futures +from typing import 
AsyncIterator, Optional, Any, Literal, Union + +import grpc +import brunix_pb2 +import brunix_pb2_grpc + +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse, StreamingResponse +from pydantic import BaseModel + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("openai-proxy") + +_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=int(os.getenv("PROXY_THREAD_WORKERS", "20")) +) + +GRPC_TARGET = os.getenv("BRUNIX_GRPC_TARGET", "localhost:50051") +PROXY_MODEL = os.getenv("PROXY_MODEL_ID", "brunix") + +_channel: Optional[grpc.Channel] = None +_stub: Optional[brunix_pb2_grpc.AssistanceEngineStub] = None + + +def get_stub() -> brunix_pb2_grpc.AssistanceEngineStub: + global _channel, _stub + if _stub is None: + _channel = grpc.insecure_channel(GRPC_TARGET) + _stub = brunix_pb2_grpc.AssistanceEngineStub(_channel) + logger.info(f"[gRPC] connected to {GRPC_TARGET}") + return _stub + + +app = FastAPI( + title="Brunix OpenAI-Compatible Proxy", + version="2.0.0", + description="stream:false → AskAgent | stream:true → AskAgentStream", +) + +class ChatMessage(BaseModel): + role: Literal["system", "user", "assistant", "function"] = "user" + content: str = "" + name: Optional[str] = None + + +class ChatCompletionRequest(BaseModel): + model: str = PROXY_MODEL + messages: list[ChatMessage] + stream: bool = False + temperature: Optional[float] = None + max_tokens: Optional[int] = None + session_id: Optional[str] = None # extensión Brunix + top_p: Optional[float] = None + n: Optional[int] = 1 + stop: Optional[Any] = None + presence_penalty: Optional[float] = None + frequency_penalty: Optional[float] = None + user: Optional[str] = None + + +class CompletionRequest(BaseModel): + model: str = PROXY_MODEL + prompt: Union[str, list[str]] = "" + stream: bool = False + temperature: Optional[float] = None + max_tokens: Optional[int] = None + session_id: Optional[str] = None + suffix: Optional[str] = None + top_p: 
Optional[float] = None + n: Optional[int] = 1 + stop: Optional[Any] = None + user: Optional[str] = None + + +# Ollama schemas +class OllamaChatMessage(BaseModel): + role: str = "user" + content: str = "" + + +class OllamaChatRequest(BaseModel): + model: str = PROXY_MODEL + messages: list[OllamaChatMessage] + stream: bool = True # Ollama hace stream por defecto + session_id: Optional[str] = None + + +class OllamaGenerateRequest(BaseModel): + model: str = PROXY_MODEL + prompt: str = "" + stream: bool = True + session_id: Optional[str] = None + + +def _ts() -> int: + return int(time.time()) + + +def _chat_response(content: str, req_id: str) -> dict: + return { + "id": req_id, "object": "chat.completion", "created": _ts(), + "model": PROXY_MODEL, + "choices": [{"index": 0, "message": {"role": "assistant", "content": content}, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + } + + +def _completion_response(text: str, req_id: str) -> dict: + return { + "id": req_id, "object": "text_completion", "created": _ts(), + "model": PROXY_MODEL, + "choices": [{"text": text, "index": 0, "logprobs": None, "finish_reason": "stop"}], + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + } + + +def _chat_chunk(delta: str, req_id: str, finish: Optional[str] = None) -> dict: + return { + "id": req_id, "object": "chat.completion.chunk", "created": _ts(), + "model": PROXY_MODEL, + "choices": [{"index": 0, + "delta": {"role": "assistant", "content": delta} if delta else {}, + "finish_reason": finish}], + } + + +def _completion_chunk(text: str, req_id: str, finish: Optional[str] = None) -> dict: + return { + "id": req_id, "object": "text_completion", "created": _ts(), + "model": PROXY_MODEL, + "choices": [{"text": text, "index": 0, "logprobs": None, "finish_reason": finish}], + } + + +def _sse(data: dict) -> str: + return f"data: {json.dumps(data)}\n\n" + + +def _sse_done() -> str: + return "data: [DONE]\n\n" + + 
def _query_from_messages(messages: list[ChatMessage]) -> str:
    """Return the content of the most recent 'user' message, or '' if none exists."""
    for m in reversed(messages):
        if m.role == "user":
            return m.content
    return ""


def _now_iso() -> str:
    """UTC timestamp in the ISO-8601 format Ollama clients expect."""
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())


async def _invoke_blocking(query: str, session_id: str) -> str:
    """Execute a blocking AskAgent gRPC call off the event loop.

    The response stream is drained in a worker thread and the text parts
    are concatenated into a single string.
    """
    # get_event_loop() is deprecated inside coroutines (3.10+); use the running loop.
    loop = asyncio.get_running_loop()

    def _call() -> str:
        stub = get_stub()
        req = brunix_pb2.AgentRequest(query=query, session_id=session_id)
        parts = []
        for resp in stub.AskAgent(req):
            if resp.text:
                parts.append(resp.text)
        return "".join(parts)

    return await loop.run_in_executor(_thread_pool, _call)


async def _iter_stream(query: str, session_id: str) -> AsyncIterator[brunix_pb2.AgentResponse]:
    """Bridge the blocking AskAgentStream gRPC stream into an async iterator.

    A producer thread reads the gRPC stream and hands items to the event
    loop via call_soon_threadsafe + put_nowait, which never blocks the
    producer. (The previous run_coroutine_threadsafe(...).result() pattern
    could deadlock the worker thread if the consumer was cancelled and
    stopped draining the queue.) ``None`` is the end-of-stream sentinel;
    exceptions travel through the queue and are re-raised in the consumer.
    """
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue = asyncio.Queue()

    def _producer() -> None:
        try:
            stub = get_stub()
            req = brunix_pb2.AgentRequest(query=query, session_id=session_id)
            for resp in stub.AskAgentStream(req):
                loop.call_soon_threadsafe(queue.put_nowait, resp)
        except Exception as e:
            # Forward the error to the async side instead of dying silently.
            loop.call_soon_threadsafe(queue.put_nowait, e)
        finally:
            loop.call_soon_threadsafe(queue.put_nowait, None)  # sentinel

    _thread_pool.submit(_producer)

    while True:
        item = await queue.get()
        if item is None:
            break
        if isinstance(item, Exception):
            raise item
        yield item


async def _stream_chat(query: str, session_id: str, req_id: str) -> AsyncIterator[str]:
    """Yield OpenAI chat-completion SSE chunks, always ending with [DONE]."""
    try:
        async for resp in _iter_stream(query, session_id):
            if resp.is_final:
                yield _sse(_chat_chunk("", req_id, finish="stop"))
                break
            if resp.text:
                yield _sse(_chat_chunk(resp.text, req_id))
    except Exception as e:
        logger.error(f"[stream_chat] error: {e}")
        yield _sse(_chat_chunk(f"[Error: {e}]", req_id, finish="stop"))

    yield _sse_done()


async def _stream_completion(query: str, session_id: str, req_id: str) -> AsyncIterator[str]:
    """Yield OpenAI text-completion SSE chunks, always ending with [DONE]."""
    try:
        async for resp in _iter_stream(query, session_id):
            if resp.is_final:
                yield _sse(_completion_chunk("", req_id, finish="stop"))
                break
            if resp.text:
                yield _sse(_completion_chunk(resp.text, req_id))
    except Exception as e:
        logger.error(f"[stream_completion] error: {e}")
        yield _sse(_completion_chunk(f"[Error: {e}]", req_id, finish="stop"))

    yield _sse_done()


def _ollama_chat_chunk(token: str, done: bool) -> str:
    """One NDJSON line in Ollama /api/chat streaming format."""
    return json.dumps({
        "model": PROXY_MODEL,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": token},
        "done": done,
    }) + "\n"


def _ollama_generate_chunk(token: str, done: bool) -> str:
    """One NDJSON line in Ollama /api/generate streaming format."""
    return json.dumps({
        "model": PROXY_MODEL,
        "created_at": _now_iso(),
        "response": token,
        "done": done,
    }) + "\n"


async def _stream_ollama_chat(query: str, session_id: str) -> AsyncIterator[str]:
    """Yield Ollama chat NDJSON chunks; the last chunk carries done=True."""
    try:
        async for resp in _iter_stream(query, session_id):
            if resp.is_final:
                yield _ollama_chat_chunk("", done=True)
                break
            if resp.text:
                yield _ollama_chat_chunk(resp.text, done=False)
    except Exception as e:
        logger.error(f"[ollama_chat] error: {e}")
        yield _ollama_chat_chunk(f"[Error: {e}]", done=True)


async def _stream_ollama_generate(query: str, session_id: str) -> AsyncIterator[str]:
    """Yield Ollama generate NDJSON chunks; the last chunk carries done=True."""
    try:
        async for resp in _iter_stream(query, session_id):
            if resp.is_final:
                yield _ollama_generate_chunk("", done=True)
                break
            if resp.text:
                yield _ollama_generate_chunk(resp.text, done=False)
    except Exception as e:
        logger.error(f"[ollama_generate] error: {e}")
        yield _ollama_generate_chunk(f"[Error: {e}]", done=True)


@app.get("/v1/models")
async def list_models():
    """OpenAI-compatible model listing: exposes the single virtual proxy model."""
    return {
        "object": "list",
        "data": [{
            "id": PROXY_MODEL, "object": "model", "created": 1700000000,
            "owned_by": "brunix", "permission": [], "root": PROXY_MODEL, "parent": None,
        }],
    }


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatCompletionRequest):
    """OpenAI-compatible chat endpoint (streaming SSE or blocking JSON)."""
    query = _query_from_messages(req.messages)
    session_id = req.session_id or req.user or "default"
    req_id = f"chatcmpl-{uuid.uuid4().hex}"

    logger.info(f"[chat] session={session_id} stream={req.stream} query='{query[:80]}'")

    if not query:
        raise HTTPException(status_code=400, detail="No user message found in messages.")

    if req.stream:
        return StreamingResponse(
            _stream_chat(query, session_id, req_id),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    try:
        text = await _invoke_blocking(query, session_id)
    except grpc.RpcError as e:
        raise HTTPException(status_code=502, detail=f"gRPC error: {e.details()}") from e

    return JSONResponse(_chat_response(text, req_id))


@app.post("/v1/completions")
async def completions(req: CompletionRequest):
    """OpenAI-compatible text-completion endpoint (streaming SSE or blocking JSON)."""
    # prompt may be a string or a list of strings per the OpenAI contract.
    query = req.prompt if isinstance(req.prompt, str) else " ".join(req.prompt)
    session_id = req.session_id or req.user or "default"
    req_id = f"cmpl-{uuid.uuid4().hex}"

    logger.info(f"[completion] session={session_id} stream={req.stream} prompt='{query[:80]}'")

    if not query:
        raise HTTPException(status_code=400, detail="prompt is required.")

    if req.stream:
        return StreamingResponse(
            _stream_completion(query, session_id, req_id),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    try:
        text = await _invoke_blocking(query, session_id)
    except grpc.RpcError as e:
        raise HTTPException(status_code=502, detail=f"gRPC error: {e.details()}") from e

    return JSONResponse(_completion_response(text, req_id))


@app.get("/health")
async def health():
    """Liveness probe; also reports the configured gRPC backend target."""
    return {"status": "ok", "grpc_target": GRPC_TARGET}


@app.get("/api/tags")
async def ollama_tags():
    """Ollama-compatible model listing (/api/tags)."""
    return {
        "models": [{
            "name": PROXY_MODEL,
            "model": PROXY_MODEL,
            "modified_at": "2024-01-01T00:00:00Z",
            "size": 0,
            "digest": "brunix",
            "details": {
                "format": "gguf",
                "family": "brunix",
                "parameter_size": "unknown",
                "quantization_level": "unknown",
            },
        }]
    }


@app.post("/api/chat")
async def ollama_chat(req: OllamaChatRequest):
    """Ollama-compatible chat endpoint (NDJSON streaming or blocking JSON)."""
    query = next((m.content for m in reversed(req.messages) if m.role == "user"), "")
    session_id = req.session_id or "default"

    logger.info(f"[ollama/chat] session={session_id} stream={req.stream} query='{query[:80]}'")

    if not query:
        raise HTTPException(status_code=400, detail="No user message found.")

    if req.stream:
        return StreamingResponse(
            _stream_ollama_chat(query, session_id),
            media_type="application/x-ndjson",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    try:
        text = await _invoke_blocking(query, session_id)
    except grpc.RpcError as e:
        raise HTTPException(status_code=502, detail=f"gRPC error: {e.details()}") from e

    return JSONResponse({
        "model": PROXY_MODEL,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": text},
        "done": True,
    })


@app.post("/api/generate")
async def ollama_generate(req: OllamaGenerateRequest):
    """Ollama-compatible generate endpoint (NDJSON streaming or blocking JSON)."""
    session_id = req.session_id or "default"

    logger.info(f"[ollama/generate] session={session_id} stream={req.stream} prompt='{req.prompt[:80]}'")

    if not req.prompt:
        raise HTTPException(status_code=400, detail="prompt is required.")

    if req.stream:
        return StreamingResponse(
            _stream_ollama_generate(req.prompt, session_id),
            media_type="application/x-ndjson",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    try:
        text = await _invoke_blocking(req.prompt, session_id)
    except grpc.RpcError as e:
        raise HTTPException(status_code=502, detail=f"gRPC error: {e.details()}") from e

    return JSONResponse({
        "model": PROXY_MODEL,
        "created_at": _now_iso(),
        "response": text,
        "done": True,
    })
import SystemMessage +CLASSIFY_PROMPT_TEMPLATE = ( + "\n" + "You are a query classifier for an AVAP language assistant. " + "Your only job is to classify the user message into one of three categories.\n" + "\n\n" + + "\n" + "RETRIEVAL — the user is asking about AVAP concepts, documentation, syntax rules, " + "or how something works. They want an explanation, not code.\n" + "Examples: 'What is addVar?', 'How does registerEndpoint work?', " + "'What is the difference between if() modes?'\n\n" + + "CODE_GENERATION — the user is asking to generate, write, create, build, or show " + "an example of an AVAP script, function, API, or code snippet. " + "They want working code as output.\n" + "Examples: 'Write an API that returns hello world', " + "'Generate a function that queries the DB', " + "'Show me how to create an endpoint', " + "'dame un ejemplo de codigo', 'escribeme un script', " + "'dime como seria un API', 'genera un API', 'como haria'\n\n" + + "CONVERSATIONAL — the user is following up on the previous answer. " + "They want a reformulation, summary, or elaboration of what was already said.\n" + "Examples: 'can you explain that?', 'en menos palabras', " + "'describe it in your own words', 'what did you mean?'\n" + "\n\n" + + "\n" + "Your entire response must be exactly one word: " + "RETRIEVAL, CODE_GENERATION, or CONVERSATIONAL. Nothing else.\n" + "\n\n" + + "\n" + "{history}\n" + "\n\n" + + "{message}" +) + REFORMULATE_PROMPT = SystemMessage( content=( - "You are a deterministic lexical query rewriter used for vector retrieval.\n" - "Your task is to rewrite user questions into optimized keyword search queries.\n\n" + "\n" + "You are a deterministic query rewriter whose sole purpose is to prepare " + "user questions for vector similarity retrieval against an AVAP language " + "knowledge base. You do not answer questions. 
You only transform phrasing " + "into keyword queries that will find the right AVAP documentation chunks.\n" + "\n\n" - "CRITICAL RULES (ABSOLUTE):\n" - "1. NEVER answer the question.\n" - "2. NEVER expand acronyms.\n" - "3. NEVER introduce new terms not present in the original query.\n" - "4. NEVER infer missing information.\n" - "5. NEVER add explanations, definitions, or interpretations.\n" - "6. Preserve all technical tokens exactly as written.\n" - "7. Only remove filler words (e.g., what, does, is, explain, tell me, please).\n" - "8. You may reorder terms for better retrieval.\n" - "9. Output must be a single-line plain keyword query.\n" - "10. If the query is already optimal, return it unchanged.\n\n" - "11. If you receive something that looks like code, do NOT attempt to rewrite it. Return it verbatim.\n\n" + "\n" + "Rewrite the user message into a compact keyword query for semantic search.\n\n" - "ALLOWED OPERATIONS:\n" - "- Remove interrogative phrasing.\n" - "- Remove stopwords.\n" - "- Reorder words.\n" - "- Convert to noun phrase form.\n\n" + "SPECIAL RULE for code generation requests:\n" + "When the user asks to generate/create/build/show AVAP code, expand the query " + "with the AVAP commands typically needed. Use this mapping:\n\n" - "FORBIDDEN OPERATIONS:\n" - "- Expanding abbreviations.\n" - "- Paraphrasing into unseen vocabulary.\n" - "- Adding definitions.\n" - "- Answering implicitly.\n\n" + "- API / endpoint / route / HTTP response\n" + " expand to: AVAP registerEndpoint addResult _status\n\n" - "Examples:\n" - "Input: What does AVAP stand for?\n" - "Output: AVAP stand for\n" + "- Read input / parameter\n" + " expand to: AVAP addParam getQueryParamList\n\n" - "Input: Hey, I'm trying to understand how AVAP handels a ZeroDivisionError when doing divison or modulus operatoins. 
Can you explane what situatoins cause a ZeroDivisionError to be raised and how I can catch it in my AVAP scripts?\n" - "Output: AVAP ZeroDivisionError division / modulus % catch try except\n" - - "Input: What does AVAP stand for?\n" - "Output: AVAP stand for\n" + "- Database / ORM / query\n" + " expand to: AVAP ormAccessSelect ormAccessInsert avapConnector\n\n" - "Input: Please explain how the import statement works in AVAP scripts.\n" - "Output: AVAP import statement syntax behavior\n\n" + "- Error handling\n" + " expand to: AVAP try exception end\n\n" - "Return only the rewritten query." + "- Loop / iterate\n" + " expand to: AVAP startLoop endLoop itemFromList getListLen\n\n" + + "- HTTP request / call external\n" + " expand to: AVAP RequestPost RequestGet\n" + "\n\n" + + "\n" + "- Preserve all AVAP identifiers verbatim.\n" + "- Remove filler words.\n" + "- Output a single line.\n" + "- Never answer the question.\n" + "\n\n" + + "\n" + "\n" + "What does AVAP stand for?\n" + "AVAP stand for\n" + "\n\n" + + "\n" + "dime como seria un API que devuelva hello world con AVAP\n" + "AVAP registerEndpoint addResult _status hello world example\n" + "\n\n" + + "\n" + "generate an AVAP script that reads a parameter and queries the DB\n" + "AVAP addParam ormAccessSelect avapConnector registerEndpoint addResult\n" + "\n" + "\n\n" + + "Return only the rewritten query. No labels, no prefixes, no explanation." ) ) +CONFIDENCE_PROMPT_TEMPLATE = ( + "\n" + "You are a relevance evaluator. Decide whether the context contains " + "useful information to address the user question.\n" + "\n\n" + + "\n" + "Answer YES if the context contains at least one relevant passage. " + "Answer NO only if context is empty or completely unrelated.\n" + "\n\n" + + "\n" + "Exactly one word: YES or NO.\n" + "\n\n" + + "{question}\n\n" + "{context}" +) + + +CODE_GENERATION_PROMPT = SystemMessage( + content=( + "\n" + "You are an expert AVAP programmer. 
AVAP (Advanced Virtual API Programming) " + "is a domain-specific language for orchestrating microservices and HTTP I/O. " + "Write correct, minimal, working AVAP code.\n" + "\n\n" + + "\n" + "1. AVAP is line-oriented: every statement on a single line.\n" + "2. Use ONLY commands from or explicitly described in .\n" + "3. Do NOT copy code examples from that solve a DIFFERENT problem. " + "Context examples are syntax references only — ignore them if unrelated.\n" + "4. Write the MINIMUM code needed. No extra connectors, no unrelated variables.\n" + "5. Add brief inline comments explaining each part.\n" + "6. Answer in the same language the user used.\n" + "\n\n" + + "\n" + "// Register an HTTP endpoint\n" + "registerEndpoint(\"GET\", \"/path\", [], \"scope\", handlerFn, \"\")\n\n" + "// Declare a function — uses curly braces, NOT end()\n" + "function handlerFn() {{\n" + " msg = \"Hello World\"\n" + " addResult(msg)\n" + "}}\n\n" + "// Assign a value to a variable\n" + "addVar(varName, \"value\") // or: varName = \"value\"\n\n" + "// Add variable to HTTP JSON response body\n" + "addResult(varName)\n\n" + "// Set HTTP response status code\n" + "_status = 200 // or: addVar(_status, 200)\n\n" + "// Read a request parameter (URL, body, or form)\n" + "addParam(\"paramName\", targetVar)\n\n" + "// Conditional\n" + "if(var, value, \"==\")\n" + " // ...\n" + "end()\n\n" + "// Loop\n" + "startLoop(i, 0, length)\n" + " // ...\n" + "endLoop()\n\n" + "// Error handling\n" + "try()\n" + " // ...\n" + "exception(errVar)\n" + " // handle\n" + "end()\n" + "\n\n" + + "\n" + "Generate a minimal, complete AVAP example for the user's request.\n\n" + "Structure:\n" + "1. One sentence describing what the code does.\n" + "2. The AVAP code block — clean, minimal, with inline comments.\n" + "3. 
Two or three lines explaining the key commands used.\n" + "\n\n" + + "\n" + "{context}\n" + "" + ) +) + +CONVERSATIONAL_PROMPT = SystemMessage( + content=( + "\n" + "You are a helpful AVAP assistant continuing an ongoing conversation.\n" + "\n\n" + + "\n" + "The user is following up on something already discussed. " + "Rephrase, summarize, or elaborate using the conversation history.\n" + "\n\n" + + "\n" + "- Base your answer on the conversation history.\n" + "- Do not introduce new AVAP facts not in the history.\n" + "- Keep the same language the user is using.\n" + "- No Answer/Evidence format. Just answer naturally.\n" + "" + ) +) + + GENERATE_PROMPT = SystemMessage( content=( - """You are a retrieval-grounded assistant. + "\n" + "You are a precise, retrieval-grounded assistant specialized in AVAP. " + "Answers are honest, calibrated to evidence, and clearly structured.\n" + "\n\n" - CRITICAL SCOPE RULES - - AVAP (Advanced Virtual API Programming) is a new/unknown programming language. - - If the user message is AVAP-related (explicitly mentions "AVAP" case-insensitive OR the provided context is AVAP code/docs), you MUST answer using ONLY . - - You MUST NOT use general programming knowledge to interpret AVAP. - - You MUST NOT assume AVAP behaves like any other language even if syntax looks similar. - - You MUST NOT infer missing details. Only state what is explicitly present in . + "\n" + "AVAP is a new proprietary language. Use ONLY content inside . " + "Treat any AVAP knowledge outside as unreliable.\n" + "\n\n" - GROUNDING REQUIREMENTS (AVAP-RELATED) - 1) Every non-trivial factual claim MUST be directly supported by an EXACT QUOTE from . - 2) If a claim is not supported by a quote, DO NOT include it. - 3) If does not contain enough information to answer, reply with EXACTLY: - "I don't have enough information in the provided context to answer that." 
+ "\n" + "Answer using exclusively the information in .\n" + "\n\n" - WORKFLOW (AVAP-RELATED) — FOLLOW IN ORDER - A) Identify the specific question(s) being asked. - B) Extract the minimum necessary quotes from that answer those question(s). - C) Write the answer using ONLY those quotes (paraphrase is allowed, but every statement must be backed by at least one quote). - D) Verify: for EACH sentence in your answer, confirm there is a supporting quote. If any sentence lacks a quote, delete it or refuse. + "\n" + "Step 1 — Find relevant passages in .\n" + "Step 2 — Assess if question can be fully or partially answered.\n" + "Step 3 — Write a clear answer backed by those passages.\n" + "Step 4 — If context contains relevant AVAP code, include it exactly.\n" + "\n\n" - OUTPUT FORMAT (AVAP-RELATED ONLY) - Answer: - + "\n" + "Answer:\n" + "\n\n" - Evidence: - - "" - - "" - (Include only quotes you actually used. Prefer the smallest quotes that fully support the statements.) + "Evidence:\n" + "- \"\"\n" + "(only quotes you actually used)\n\n" - NON-AVAP QUESTIONS - - If the question is clearly not AVAP-related, answer normally using general knowledge. 
+ "If context has no relevant information reply with exactly:\n" + "\"I don't have enough information in the provided context to answer that.\"\n" + "\n\n" - - {context} - """ + "\n" + "{context}\n" + "" ) ) \ No newline at end of file diff --git a/Docker/src/server.py b/Docker/src/server.py index 310d5bc..b233527 100644 --- a/Docker/src/server.py +++ b/Docker/src/server.py @@ -8,18 +8,28 @@ import brunix_pb2 import brunix_pb2_grpc import grpc from grpc_reflection.v1alpha import reflection -from langchain_elasticsearch import ElasticsearchStore +from elasticsearch import Elasticsearch +from langchain_core.messages import AIMessage from utils.llm_factory import create_chat_model from utils.emb_factory import create_embedding_model -from graph import build_graph +from graph import build_graph, build_prepare_graph, build_final_messages, session_store +from evaluate import run_evaluation logging.basicConfig(level=logging.INFO) logger = logging.getLogger("brunix-engine") + class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer): + def __init__(self): + es_url = os.getenv("ELASTICSEARCH_URL", "http://localhost:9200") + es_user = os.getenv("ELASTICSEARCH_USER") + es_pass = os.getenv("ELASTICSEARCH_PASSWORD") + es_apikey = os.getenv("ELASTICSEARCH_API_KEY") + index = os.getenv("ELASTICSEARCH_INDEX", "avap-knowledge-v1") + self.llm = create_chat_model( provider="ollama", model=os.getenv("OLLAMA_MODEL_NAME"), @@ -27,56 +37,194 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer): temperature=0, validate_model_on_init=True, ) + self.embeddings = create_embedding_model( provider="ollama", model=os.getenv("OLLAMA_EMB_MODEL_NAME"), base_url=os.getenv("OLLAMA_URL"), ) - self.vector_store = ElasticsearchStore( - es_url=os.getenv("ELASTICSEARCH_URL"), - index_name=os.getenv("ELASTICSEARCH_INDEX"), - embedding=self.embeddings, - query_field="text", - vector_query_field="embedding", - ) + + es_kwargs: dict = {"hosts": [es_url], "request_timeout": 60} + if es_apikey: + 
es_kwargs["api_key"] = es_apikey + elif es_user and es_pass: + es_kwargs["basic_auth"] = (es_user, es_pass) + + self.es_client = Elasticsearch(**es_kwargs) + self.index_name = index + + if self.es_client.ping(): + info = self.es_client.info() + logger.info(f"[ESEARCH] Connected: {info['version']['number']} — index: {index}") + else: + logger.error("[ESEARCH] Cant Connect") + self.graph = build_graph( - llm=self.llm, - vector_store=self.vector_store + llm = self.llm, + embeddings = self.embeddings, + es_client = self.es_client, + index_name = self.index_name, ) - logger.info("Brunix Engine initializing.") + + self.prepare_graph = build_prepare_graph( + llm = self.llm, + embeddings = self.embeddings, + es_client = self.es_client, + index_name = self.index_name, + ) + + logger.info("Brunix Engine initialized.") def AskAgent(self, request, context): - logger.info(f"request {request.session_id}): {request.query[:50]}.") + session_id = request.session_id or "default" + query = request.query + logger.info(f"[AskAgent] session={session_id} query='{query[:80]}'") try: - final_state = self.graph.invoke({"messages": [{"role": "user", - "content": request.query}]}) + history = list(session_store.get(session_id, [])) + logger.info(f"[AskAgent] conversation: {len(history)} previous messages.") + initial_state = { + "messages": history + [{"role": "user", "content": query}], + "session_id": session_id, + "reformulated_query": "", + "context": "", + "query_type": "", + } + + final_state = self.graph.invoke(initial_state) messages = final_state.get("messages", []) last_msg = messages[-1] if messages else None - result_text = getattr(last_msg, "content", str(last_msg)) if last_msg else "" + result_text = getattr(last_msg, "content", str(last_msg)) \ + if last_msg else "" + + logger.info(f"[AskAgent] query_type={final_state.get('query_type')} " + f"answer='{result_text[:100]}'") yield brunix_pb2.AgentResponse( - text=result_text, - avap_code="AVAP-2026", - is_final=True, + text = 
result_text, + avap_code= "AVAP-2026", + is_final = True, ) - yield brunix_pb2.AgentResponse(text="", avap_code="", is_final=True) - except Exception as e: - logger.error(f"Error in AskAgent: {str(e)}", exc_info=True) + logger.error(f"[AskAgent] Error: {e}", exc_info=True) yield brunix_pb2.AgentResponse( - text=f"[Error Motor]: {str(e)}", - is_final=True, + text = f"[ENG] Error: {str(e)}", + is_final = True, ) + def AskAgentStream(self, request, context): + session_id = request.session_id or "default" + query = request.query + logger.info(f"[AskAgentStream] session={session_id} query='{query[:80]}'") + + try: + history = list(session_store.get(session_id, [])) + logger.info(f"[AskAgentStream] conversation: {len(history)} previous messages.") + + initial_state = { + "messages": history + [{"role": "user", "content": query}], + "session_id": session_id, + "reformulated_query": "", + "context": "", + "query_type": "", + } + + prepared = self.prepare_graph.invoke(initial_state) + logger.info( + f"[AskAgentStream] query_type={prepared.get('query_type')} " + f"context_len={len(prepared.get('context', ''))}" + ) + + final_messages = build_final_messages(prepared) + full_response = [] + + for chunk in self.llm.stream(final_messages): + token = chunk.content + if token: + full_response.append(token) + yield brunix_pb2.AgentResponse( + text = token, + is_final = False, + ) + + complete_text = "".join(full_response) + if session_id: + session_store[session_id] = ( + list(prepared["messages"]) + [AIMessage(content=complete_text)] + ) + + logger.info( + f"[AskAgentStream] done — " + f"chunks={len(full_response)} total_chars={len(complete_text)}" + ) + + yield brunix_pb2.AgentResponse(text="", is_final=True) + + except Exception as e: + logger.error(f"[AskAgentStream] Error: {e}", exc_info=True) + yield brunix_pb2.AgentResponse( + text = f"[ENG] Error: {str(e)}", + is_final = True, + ) + + + def EvaluateRAG(self, request, context): + category = request.category or None + limit = 
request.limit or None + index = request.index or self.index_name + + logger.info(f"[EvaluateRAG] category={category} limit={limit} index={index}") + + try: + result = run_evaluation( + es_client = self.es_client, + llm = self.llm, + embeddings = self.embeddings, + index_name = index, + category = category, + limit = limit, + ) + except Exception as e: + logger.error(f"[EvaluateRAG] Error: {e}", exc_info=True) + return brunix_pb2.EvalResponse(status=f"error: {e}") + + if result.get("status") != "ok": + return brunix_pb2.EvalResponse(status=result.get("error", "unknown error")) + + details = [ + brunix_pb2.QuestionDetail( + id = d["id"], + category = d["category"], + question = d["question"], + answer_preview = d["answer_preview"], + n_chunks = d["n_chunks"], + ) + for d in result.get("details", []) + ] + + scores = result["scores"] + return brunix_pb2.EvalResponse( + status = "ok", + questions_evaluated = result["questions_evaluated"], + elapsed_seconds = result["elapsed_seconds"], + judge_model = result["judge_model"], + index = result["index"], + faithfulness = scores["faithfulness"], + answer_relevancy = scores["answer_relevancy"], + context_recall = scores["context_recall"], + context_precision = scores["context_precision"], + global_score = result["global_score"], + verdict= result["verdict"], + details= details, + ) + + def serve(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - brunix_pb2_grpc.add_AssistanceEngineServicer_to_server(BrunixEngine(), server) SERVICE_NAMES = ( @@ -86,7 +234,7 @@ def serve(): reflection.enable_server_reflection(SERVICE_NAMES, server) server.add_insecure_port("[::]:50051") - logger.info("Brunix Engine on port 50051") + logger.info("[ENGINE] listen on 50051 (gRPC)") server.start() server.wait_for_termination() diff --git a/Docker/src/state.py b/Docker/src/state.py index 2e04d99..1d4d0ce 100644 --- a/Docker/src/state.py +++ b/Docker/src/state.py @@ -1,9 +1,11 @@ +# state.py from typing import TypedDict, 
class AgentState(TypedDict):
    """Shared LangGraph state threaded through every node of the RAG graphs."""
    # Conversation history; the add_messages reducer appends/merges new messages.
    messages: Annotated[list, add_messages]
    # Keyword query rewritten for vector retrieval (see REFORMULATE_PROMPT).
    reformulated_query: str
    # Retrieved documentation text injected into the generation prompt.
    context: str
    # Classifier verdict — presumably RETRIEVAL, CODE_GENERATION, or
    # CONVERSATIONAL per CLASSIFY_PROMPT_TEMPLATE in prompts.py.
    query_type: str
    # Conversation identifier used to persist history in session_store.
    session_id: str
scripts/pipelines/ingestion/ingestion/chunks.jsonl create mode 100644 scripts/pipelines/ingestion/requirements.txt create mode 100644 scripts/pipelines/ingestion/test.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b490e01..03f60c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,7 +14,8 @@ 6. [Environment Variables Policy](#6-environment-variables-policy) 7. [Changelog Policy](#7-changelog-policy) 8. [Documentation Policy](#8-documentation-policy) -9. [Incident & Blockage Reporting](#9-incident--blockage-reporting) +9. [Architecture Decision Records (ADRs)](#9-architecture-decision-records-adrs) +10. [Incident & Blockage Reporting](#10-incident--blockage-reporting) --- @@ -95,9 +96,10 @@ A PR is not ready for review unless **all applicable items** in the following ch - [ ] No changelog entry required (internal refactor, comment/typo fix, zero behavioral change) - [ ] Changelog updated with correct version bump and date -**Documentation** *(see [Section 7](#7-documentation-policy))* +**Documentation** *(see [Section 8](#8-documentation-policy))* - [ ] No documentation update required (internal change, no impact on setup or API) - [ ] `README.md` or relevant docs updated to reflect this change +- [ ] If a significant architectural decision was made, an ADR was created in `docs/adr/` --- @@ -206,11 +208,87 @@ Update `README.md` (or the relevant doc file) if the PR includes any of the foll - Internal implementation changes with no impact on setup, usage, or API - Fixes that do not alter any documented behavior +### Documentation files in this repository + +| File | Purpose | +|---|---| +| `README.md` | Setup guide, env vars reference, quick start | +| `CONTRIBUTING.md` | Contribution standards (this file) | +| `SECURITY.md` | Security policy and vulnerability reporting | +| `docs/ARCHITECTURE.md` | Deep technical architecture reference | +| `docs/API_REFERENCE.md` | Complete gRPC API contract and examples | +| `docs/RUNBOOK.md` | Operational playbooks 
and incident response | +| `docs/AVAP_CHUNKER_CONFIG.md` | `avap_config.json` reference — blocks, statements, semantic tags | +| `docs/adr/` | Architecture Decision Records | + > **PRs that change user-facing behavior or setup without updating documentation will be rejected.** --- -## 9. Incident & Blockage Reporting +## 9. Architecture Decision Records (ADRs) + +Architecture Decision Records document **significant technical decisions** — choices that have lasting consequences on the codebase, infrastructure, or development process. + +### When to write an ADR + +Write an ADR when a PR introduces or changes: + +- A fundamental technology choice (communication protocol, storage backend, framework) +- A design pattern that other components will follow +- A deliberate trade-off with known consequences +- A decision that future engineers might otherwise reverse without understanding the rationale + +### When NOT to write an ADR + +- Implementation details within a single module +- Bug fixes +- Dependency version bumps +- Configuration changes + +### ADR format + +ADRs live in `docs/adr/` and follow this naming convention: + +``` +ADR-XXXX-short-title.md +``` + +Where `XXXX` is a zero-padded sequential number (e.g., `ADR-0005-new-decision.md`). + +Each ADR must contain: + +```markdown +# ADR-XXXX: Title + +**Date:** YYYY-MM-DD +**Status:** Proposed | Accepted | Deprecated | Superseded by ADR-YYYY +**Deciders:** Names or roles + +## Context +What problem are we solving? What forces are at play? + +## Decision +What did we decide? + +## Rationale +Why this option over alternatives? Include a trade-off analysis. + +## Consequences +What are the positive and negative results of this decision? 
+```
+
+### Existing ADRs
+
+| ADR | Title | Status |
+|---|---|---|
+| [ADR-0001](docs/ADR/ADR-0001-grpc-primary-interface.md) | gRPC as the Primary Communication Interface | Accepted |
+| [ADR-0002](docs/ADR/ADR-0002-two-phase-streaming.md) | Two-Phase Streaming Design for AskAgentStream | Accepted |
+| [ADR-0003](docs/ADR/ADR-0003-hybrid-retrieval-rrf.md) | Hybrid Retrieval (BM25 + kNN) with RRF Fusion | Accepted |
+| [ADR-0004](docs/ADR/ADR-0004-claude-eval-judge.md) | Claude as the RAGAS Evaluation Judge | Accepted |
+
+---
+
+## 10. Incident & Blockage Reporting
 
 If you encounter a technical blockage (connection timeouts, service downtime, tunnel failures):
@@ -221,6 +299,8 @@ If you encounter a technical blockage (connection timeouts, service downtime, tu
 - Current status of all `kubectl` tunnels
 3. **Resolution** — If the error is not reproducible by the CTO/DevOps team, a 5-minute live debugging session will be scheduled to identify local network or configuration issues.
+See [`docs/RUNBOOK.md`](docs/RUNBOOK.md) for full incident playbooks and escalation paths.
+
 ---
 
 *These standards exist to protect the integrity of the Brunix Assistance Engine and to ensure every member of the team can work confidently and efficiently.
They are not bureaucratic overhead — they are the foundation of a reliable, scalable engineering practice.* diff --git a/README.md b/README.md index 5daf4c1..8931418 100644 --- a/README.md +++ b/README.md @@ -42,39 +42,75 @@ graph TD ## Project Structure ```text - -├── README.md # System documentation & Dev guide +├── README.md # Setup guide & dev reference (this file) +├── CONTRIBUTING.md # Contribution standards, GitFlow, PR process +├── SECURITY.md # Security policy and vulnerability reporting ├── changelog # Version tracking and release history -├── pyproject.toml # Python project configuration -├── Docker/ +├── pyproject.toml # Python project configuration (uv) +├── uv.lock # Locked dependency graph +│ +├── Docker/ # Production container │ ├── protos/ -│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API +│ │ └── brunix.proto # gRPC API contract (source of truth) │ ├── src/ -│ │ ├── graph.py # Workflow graph orchestration -│ │ ├── prompts.py # Centralized prompt definitions -│ │ ├── server.py # gRPC Server & RAG Orchestration -│ │ ├── state.py # Shared state management -│ │ └── utils/ # Utility modules -│ ├── Dockerfile # Container definition for the Engine -│ ├── docker-compose.yaml # Local orchestration for dev environment -│ ├── requirements.txt # Python dependencies for Docker -│ ├── protos/ -│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API -│ └── src/ -│ ├── graph.py # Workflow graph orchestration -│ ├── prompts.py # Centralized prompt definitions -│ ├── server.py # gRPC Server & RAG Orchestration -│ ├── state.py # Shared state management -│ └── utils/ # Utility modules +│ │ ├── server.py # gRPC server — AskAgent, AskAgentStream, EvaluateRAG +│ │ ├── openai_proxy.py # OpenAI & Ollama-compatible HTTP proxy (port 8000) +│ │ ├── graph.py # LangGraph orchestration — build_graph, build_prepare_graph +│ │ ├── prompts.py # Centralized prompt definitions (CLASSIFY, GENERATE, etc.) 
+│ │ ├── state.py # AgentState TypedDict (shared across graph nodes) +│ │ ├── evaluate.py # RAGAS evaluation pipeline (Claude as judge) +│ │ ├── golden_dataset.json # Ground-truth Q&A dataset for EvaluateRAG +│ │ └── utils/ +│ │ ├── emb_factory.py # Provider-agnostic embedding model factory +│ │ └── llm_factory.py # Provider-agnostic LLM factory +│ ├── Dockerfile # Multi-stage container build +│ ├── docker-compose.yaml # Local dev orchestration +│ ├── entrypoint.sh # Starts gRPC server + HTTP proxy in parallel +│ ├── requirements.txt # Pinned production dependencies (exported by uv) +│ ├── .env # Local secrets (never commit — see .gitignore) +│ └── .dockerignore # Excludes dev artifacts from image build context +│ +├── docs/ # Knowledge base & project documentation +│ ├── ARCHITECTURE.md # Deep technical architecture reference +│ ├── API_REFERENCE.md # Complete gRPC & HTTP API contract with examples +│ ├── RUNBOOK.md # Operational playbooks and incident response +│ ├── AVAP_CHUNKER_CONFIG.md # avap_config.json reference — blocks, statements, semantic tags +│ ├── adr/ # Architecture Decision Records +│ │ ├── ADR-0001-grpc-primary-interface.md +│ │ ├── ADR-0002-two-phase-streaming.md +│ │ ├── ADR-0003-hybrid-retrieval-rrf.md +│ │ └── ADR-0004-claude-eval-judge.md +│ ├── avap_language_github_docs/ # AVAP language reference docs (GitHub source) +│ ├── developer.avapframework.com/ # AVAP developer portal docs +│ ├── LRM/ +│ │ └── avap.md # AVAP Language Reference Manual (LRM) +│ └── samples/ # AVAP code samples (.avap) used for ingestion +│ ├── ingestion/ -│ └── docs/ # AVAP documentation chunks -├── kubernetes/ -│ └── kubeconfig.yaml # Kubernetes cluster configuration +│ └── chunks.json # Last export of ingested chunks (ES bulk output) +│ ├── scripts/ │ └── pipelines/ -│ └── flows/ # Data processing flows -└── src/ - ├── config.py # Environment variables configuration file +│ │ +│ ├── flows/ # Executable pipeline entry points (Typer CLI) +│ │ ├── 
elasticsearch_ingestion.py # [PIPELINE A] Chonkie-based ingestion flow +│ │ ├── generate_mbap.py # Synthetic MBPP-AVAP dataset generator (Claude) +│ │ └── translate_mbpp.py # MBPP→AVAP dataset translation pipeline +│ │ +│ ├── tasks/ # Reusable task modules for Pipeline A +│ │ ├── chunk.py # Document fetching, Chonkie chunking & ES bulk write +│ │ ├── embeddings.py # OllamaEmbeddings adapter (Chonkie-compatible) +│ │ └── prompts.py # Prompt templates for pipeline LLM calls +│ │ +│ └── ingestion/ # [PIPELINE B] AVAP-native classic ingestion +│ ├── avap_chunker.py # Custom AVAP lexer + chunker (MinHash dedup, overlaps) +│ ├── avap_ingestor.py # Async ES ingestor with DLQ (producer/consumer pattern) +│ ├── avap_config.json # AVAP language config (blocks, statements, semantic tags) +│ └── ingestion/ +│ └── chunks.jsonl # JSONL output from avap_chunker.py +│ +└── src/ # Shared library (used by both Docker and scripts) + ├── config.py # Pydantic settings — reads all environment variables └── utils/ ├── emb_factory.py # Embedding model factory └── llm_factory.py # LLM model factory @@ -116,6 +152,146 @@ sequenceDiagram --- +## Knowledge Base Ingestion + +The Elasticsearch vector index is populated via one of two independent pipelines. Both pipelines require the Elasticsearch tunnel to be active (`localhost:9200`) and the Ollama embedding model (`OLLAMA_EMB_MODEL_NAME`) to be available. + +### Pipeline A — Chonkie (recommended for markdown + .avap) + +Uses the [Chonkie](https://github.com/chonkie-ai/chonkie) library for semantic chunking. Supports `.md` (via `MarkdownChef`) and `.avap` (via `TextChef` + `TokenChunker`). Chunks are embedded with Ollama and bulk-indexed into Elasticsearch via `ElasticHandshakeWithMetadata`. 
+ +**Entry point:** `scripts/pipelines/flows/elasticsearch_ingestion.py` + +```bash +# Index all markdown and AVAP files from docs/LRM +python -m scripts.pipelines.flows.elasticsearch_ingestion \ + --docs-folder-path docs/LRM \ + --output ingestion/chunks.json \ + --docs-extension .md .avap \ + --es-index avap-docs-test \ + --delete-es-index + +# Index the AVAP code samples +python -m scripts.pipelines.flows.elasticsearch_ingestion \ + --docs-folder-path docs/samples \ + --output ingestion/chunks.json \ + --docs-extension .avap \ + --es-index avap-docs-test +``` + +**How it works:** + +``` +docs/**/*.md + docs/**/*.avap + │ + ▼ FileFetcher (Chonkie) + │ + ├─ .md → MarkdownChef → merge code blocks + tables into chunks + │ ↓ + │ TokenChunker (HuggingFace tokenizer: HF_EMB_MODEL_NAME) + │ + └─ .avap → TextChef → TokenChunker + │ + ▼ OllamaEmbeddings.embed_batch() (OLLAMA_EMB_MODEL_NAME) + │ + ▼ ElasticHandshakeWithMetadata.write() + bulk index → {text, embedding, file, start_index, end_index, token_count} + │ + ▼ export_documents() → ingestion/chunks.json +``` + +| Chunk field | Source | +|---|---| +| `text` | Raw chunk text | +| `embedding` | Ollama dense vector | +| `start_index` / `end_index` | Character offsets in source file | +| `token_count` | HuggingFace tokenizer count | +| `file` | Source filename | + +--- + +### Pipeline B — AVAP Native (classic, for .avap files with full semantic analysis) + +A custom lexer-based chunker purpose-built for the AVAP language using `avap_config.json` as its grammar definition. Produces richer metadata (block type, section, semantic tags, complexity score) and includes **MinHash LSH deduplication** and **semantic overlap** between chunks. 
+ +**Entry point:** `scripts/pipelines/ingestion/avap_chunker.py` +**Grammar config:** `scripts/pipelines/ingestion/avap_config.json` — see [`docs/AVAP_CHUNKER_CONFIG.md`](./docs/AVAP_CHUNKER_CONFIG.md) for the full reference on blocks, statements, semantic tags, and how to extend the grammar. + +```bash +python scripts/pipelines/ingestion/avap_chunker.py \ + --lang-config scripts/pipelines/ingestion/avap_config.json \ + --docs-path docs/samples \ + --output scripts/pipelines/ingestion/ingestion/chunks.jsonl \ + --workers 4 +``` + +**Step 2 — Ingest:** `scripts/pipelines/ingestion/avap_ingestor.py` + +```bash +# Ingest from existing JSONL +python scripts/pipelines/ingestion/avap_ingestor.py \ + --chunks scripts/pipelines/ingestion/ingestion/chunks.jsonl \ + --index avap-knowledge-v1 \ + --delete + +# Check model embedding dimensions first +python scripts/pipelines/ingestion/avap_ingestor.py --probe-dim +``` + +**How it works:** + +``` +docs/**/*.avap + docs/**/*.md + │ + ▼ avap_chunker.py (GenericLexer + LanguageConfig) + │ ├─ .avap: block detection (function/if/startLoop/try), statement classification + │ │ semantic tags enrichment, function signature extraction + │ │ semantic overlap injection (OVERLAP_LINES=3) + │ └─ .md: H1/H2/H3 sectioning, fenced code extraction, table isolation, + │ narrative split by token budget (MAX_NARRATIVE_TOKENS=400) + │ ├─ MinHash LSH deduplication (threshold=0.85, 128 permutations) + │ └─ parallel workers (ProcessPoolExecutor) + │ + ▼ chunks.jsonl (one JSON per line) + │ + ▼ avap_ingestor.py (async producer/consumer) + │ ├─ OllamaAsyncEmbedder — batch embed (BATCH_SIZE_EMBED=8) + │ ├─ asyncio.Queue (backpressure, QUEUE_MAXSIZE=5) + │ ├─ ES async_bulk (BATCH_SIZE_ES=50) + │ └─ DeadLetterQueue — failed chunks saved to failed_chunks_.jsonl + │ + ▼ Elasticsearch index + {chunk_id, content, embedding, doc_type, block_type, section, + source_file, start_line, end_line, token_estimate, metadata{...}} +``` + +**Chunk types produced:** + +| 
`doc_type` | `block_type` | Description | +|---|---|---| +| `code` | `function` | Complete AVAP function block | +| `code` | `if` / `startLoop` / `try` | Control flow blocks | +| `function_signature` | `function_signature` | Extracted function signature only (for fast lookup) | +| `code` | `registerEndpoint` / `addVar` / … | Statement-level chunks by AVAP command category | +| `spec` | `narrative` | Markdown prose sections | +| `code_example` | language tag | Fenced code blocks from markdown | +| `bnf` | `bnf` | BNF grammar blocks from markdown | +| `spec` | `table` | Markdown tables | + +**Semantic tags** (automatically detected, stored in `metadata`): + +`uses_orm` · `uses_http` · `uses_connector` · `uses_async` · `uses_crypto` · `uses_auth` · `uses_error_handling` · `uses_loop` · `uses_json` · `uses_list` · `uses_regex` · `uses_datetime` · `returns_result` · `registers_endpoint` + +**Ingestor environment variables:** + +| Variable | Default | Description | +|---|---|---| +| `OLLAMA_URL` | `http://localhost:11434` | Ollama base URL for embeddings | +| `OLLAMA_MODEL` | `qwen3-0.6B-emb:latest` | Embedding model name | +| `OLLAMA_EMBEDDING_DIM` | `1024` | Expected embedding dimension (must match model) | + +--- + ## Development Setup ### 1. Prerequisites @@ -140,6 +316,9 @@ PYTHONPATH=${PYTHONPATH}:/home/... ELASTICSEARCH_URL=http://host.docker.internal:9200 ELASTICSEARCH_LOCAL_URL=http://localhost:9200 ELASTICSEARCH_INDEX=avap-docs-test +ELASTICSEARCH_USER=elastic +ELASTICSEARCH_PASSWORD=changeme +ELASTICSEARCH_API_KEY= POSTGRES_URL=postgresql://postgres:postgres@localhost:5432/langfuse LANGFUSE_HOST=http://45.77.119.180 LANGFUSE_PUBLIC_KEY=pk-lf-... @@ -150,6 +329,8 @@ OLLAMA_MODEL_NAME=qwen2.5:1.5b OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest HF_TOKEN=hf_... HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B +ANTHROPIC_API_KEY=sk-ant-... 
+ANTHROPIC_MODEL=claude-sonnet-4-20250514 ``` | Variable | Required | Description | Example | @@ -158,6 +339,9 @@ HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B | `ELASTICSEARCH_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in Docker | `http://host.docker.internal:9200` | | `ELASTICSEARCH_LOCAL_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in local | `http://localhost:9200` | | `ELASTICSEARCH_INDEX` | Yes | Elasticsearch index name used by the engine | `avap-docs-test` | +| `ELASTICSEARCH_USER` | No | Elasticsearch username (used when API key is not set) | `elastic` | +| `ELASTICSEARCH_PASSWORD` | No | Elasticsearch password (used when API key is not set) | `changeme` | +| `ELASTICSEARCH_API_KEY` | No | Elasticsearch API key (takes precedence over user/password auth) | `abc123...` | | `POSTGRES_URL` | Yes | PostgreSQL connection string used by the service | `postgresql://postgres:postgres@localhost:5432/langfuse` | | `LANGFUSE_HOST` | Yes | Langfuse server endpoint (Devaron Cluster) | `http://45.77.119.180` | | `LANGFUSE_PUBLIC_KEY` | Yes | Langfuse project public key for tracing and observability | `pk-lf-...` | @@ -166,8 +350,10 @@ HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B | `OLLAMA_LOCAL_URL` | Yes | Ollama endpoint used for text generation/embeddings in local | `http://localhost:11434` | | `OLLAMA_MODEL_NAME` | Yes | Ollama model name for generation | `qwen2.5:1.5b` | | `OLLAMA_EMB_MODEL_NAME` | Yes | Ollama embeddings model name | `qwen3-0.6B-emb:latest` | -| `HF_TOKEN` | Yes | Hugginface secret token | `hf_...` | -| `HF_EMB_MODEL_NAME` | Yes | Hugginface embeddings model name | `Qwen/Qwen3-Embedding-0.6B` | +| `HF_TOKEN` | Yes | HuggingFace secret token | `hf_...` | +| `HF_EMB_MODEL_NAME` | Yes | HuggingFace embeddings model name | `Qwen/Qwen3-Embedding-0.6B` | +| `ANTHROPIC_API_KEY` | Yes* | Anthropic API key — required for the `EvaluateRAG` endpoint | `sk-ant-...` | +| `ANTHROPIC_MODEL` | No | Claude model used 
by the RAG evaluation suite | `claude-sonnet-4-20250514` | > Never commit real secret values. Use placeholder values when sharing configuration examples. @@ -194,25 +380,186 @@ docker-compose up -d --build ## Testing & Debugging -The service is exposed on port `50052` with **gRPC Reflection** enabled. +The gRPC service is exposed on port `50052` with **gRPC Reflection** enabled — introspect it at any time without needing the `.proto` file. + +```bash +# List available services +grpcurl -plaintext localhost:50052 list + +# Describe the full service contract +grpcurl -plaintext localhost:50052 describe brunix.AssistanceEngine +``` + +### `AskAgent` — complete response (non-streaming) + +Returns the full answer as a single message with `is_final: true`. Suitable for clients that do not support streaming. -### Streaming Query Example ```bash grpcurl -plaintext \ - -d '{"query": "Hola Brunix, ¿qué es AVAP?", "session_id": "dev-test-123"}' \ + -d '{"query": "What is addVar in AVAP?", "session_id": "dev-001"}' \ localhost:50052 \ brunix.AssistanceEngine/AskAgent ``` +Expected response: +```json +{ + "text": "addVar is an AVAP command used to declare a variable...", + "avap_code": "AVAP-2026", + "is_final": true +} +``` + +### `AskAgentStream` — real token streaming + +Emits one `AgentResponse` per token from Ollama. The final message has `is_final: true` and empty `text` — it is a termination signal, not part of the answer. + +```bash +grpcurl -plaintext \ + -d '{"query": "Write an AVAP API that returns hello world", "session_id": "dev-001"}' \ + localhost:50052 \ + brunix.AssistanceEngine/AskAgentStream +``` + +Expected response stream: +```json +{"text": "Here", "is_final": false} +{"text": " is", "is_final": false} +... +{"text": "", "is_final": true} +``` + +**Multi-turn conversation:** send subsequent requests with the same `session_id` to maintain context. 
+ +```bash +# Turn 1 +grpcurl -plaintext \ + -d '{"query": "What is registerEndpoint?", "session_id": "user-abc"}' \ + localhost:50052 brunix.AssistanceEngine/AskAgentStream + +# Turn 2 — engine has Turn 1 history +grpcurl -plaintext \ + -d '{"query": "Show me a code example", "session_id": "user-abc"}' \ + localhost:50052 brunix.AssistanceEngine/AskAgentStream +``` + +### `EvaluateRAG` — quality evaluation + +Runs the RAGAS evaluation pipeline against the golden dataset using Claude as the judge. Requires `ANTHROPIC_API_KEY` to be set. + +```bash +# Full evaluation +grpcurl -plaintext -d '{}' localhost:50052 brunix.AssistanceEngine/EvaluateRAG + +# Filtered: first 10 questions of category "core_syntax" +grpcurl -plaintext \ + -d '{"category": "core_syntax", "limit": 10, "index": "avap-docs-test"}' \ + localhost:50052 \ + brunix.AssistanceEngine/EvaluateRAG +``` + +Expected response: +```json +{ + "status": "ok", + "questions_evaluated": 10, + "elapsed_seconds": 142.3, + "judge_model": "claude-sonnet-4-20250514", + "faithfulness": 0.8421, + "answer_relevancy": 0.7913, + "context_recall": 0.7234, + "context_precision": 0.6891, + "global_score": 0.7615, + "verdict": "ACCEPTABLE" +} +``` + +Verdict thresholds: `EXCELLENT` ≥ 0.80 · `ACCEPTABLE` ≥ 0.60 · `INSUFFICIENT` < 0.60 + +--- + +## HTTP Proxy (OpenAI & Ollama Compatible) + +The container also runs an **OpenAI-compatible HTTP proxy** on port `8000` (`openai_proxy.py`). It wraps the gRPC engine transparently — `stream: false` routes to `AskAgent`, `stream: true` routes to `AskAgentStream`. + +This enables integration with any tool that supports the OpenAI or Ollama API (continue.dev, LiteLLM, Open WebUI, etc.) without code changes. 
+ +### OpenAI endpoints + +| Method | Endpoint | Description | +|---|---|---| +| `GET` | `/v1/models` | List available models | +| `POST` | `/v1/chat/completions` | Chat completion — streaming and non-streaming | +| `POST` | `/v1/completions` | Legacy text completion — streaming and non-streaming | +| `GET` | `/health` | Health check — returns gRPC target and status | + +**Non-streaming chat:** +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "brunix", + "messages": [{"role": "user", "content": "What is AVAP?"}], + "stream": false + }' +``` + +**Streaming chat (SSE):** +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "brunix", + "messages": [{"role": "user", "content": "Write an AVAP hello world API"}], + "stream": true, + "session_id": "user-xyz" + }' +``` + +> **Brunix extension:** `session_id` is a non-standard field added to the OpenAI schema. Use it to maintain multi-turn conversation context across HTTP requests. If omitted, all requests share the `"default"` session. 
+ +### Ollama endpoints + +| Method | Endpoint | Description | +|---|---|---| +| `GET` | `/api/tags` | List models (Ollama format) | +| `POST` | `/api/chat` | Chat — NDJSON stream, `stream: true` by default | +| `POST` | `/api/generate` | Text generation — NDJSON stream, `stream: true` by default | + +```bash +curl http://localhost:8000/api/chat \ + -H "Content-Type: application/json" \ + -d '{ + "model": "brunix", + "messages": [{"role": "user", "content": "Explain AVAP loops"}], + "stream": true + }' +``` + +### Proxy environment variables + +| Variable | Default | Description | +|---|---|---| +| `BRUNIX_GRPC_TARGET` | `localhost:50051` | gRPC engine address the proxy connects to | +| `PROXY_MODEL_ID` | `brunix` | Model name returned in API responses | +| `PROXY_THREAD_WORKERS` | `20` | Thread pool size for concurrent gRPC calls | + --- ## API Contract (Protobuf) -To update the communication interface, modify `protos/brunix.proto` and re-generate the stubs: + +The source of truth for the gRPC interface is `Docker/protos/brunix.proto`. After modifying it, regenerate the stubs: ```bash -python -m grpc_tools.protoc -I./protos --python_out=./src --grpc_python_out=./src ./protos/brunix.proto +python -m grpc_tools.protoc \ + -I./Docker/protos \ + --python_out=./Docker/src \ + --grpc_python_out=./Docker/src \ + ./Docker/protos/brunix.proto ``` +For the full API reference — message types, field descriptions, error handling, and all client examples — see [`docs/API_REFERENCE.md`](./docs/API_REFERENCE.md). + --- ## Dataset Generation & Evaluation @@ -220,7 +567,7 @@ python -m grpc_tools.protoc -I./protos --python_out=./src --grpc_python_out=./sr The engine includes a specialized benchmarking suite to evaluate the model's proficiency in **AVAP syntax**. This is achieved through a synthetic data generator that creates problems in the MBPP (Mostly Basic Python Problems) style, but tailored for the AVAP Language Reference Manual (LRM). ### 1. 
Synthetic Data Generator -The script `scripts/generate_mbpp_avap.py` leverages Claude 3.5 Sonnet to produce high-quality, executable code examples and validation tests. +The script `scripts/pipelines/flows/generate_mbap.py` leverages Claude to produce high-quality, executable code examples and validation tests. **Key Features:** * **LRM Grounding:** Uses the provided `avap.md` as the source of truth for syntax and logic. @@ -238,8 +585,8 @@ export ANTHROPIC_API_KEY="your-sk-ant-key" Run the generator specifying the path to your LRM and the desired output: ```bash -python scripts/generate_mbpp_avap.py \ - --lrm ingestion/docs/avap.md \ +python scripts/pipelines/flows/generate_mbap.py \ + --lrm docs/LRM/avap.md \ --output evaluation/mbpp_avap.json \ --problems 300 ``` @@ -277,6 +624,21 @@ For the full set of contribution standards, see [CONTRIBUTING.md](./CONTRIBUTING --- +## Documentation Index + +| Document | Purpose | +|---|---| +| [README.md](./README.md) | Setup guide, env vars reference, quick start (this file) | +| [CONTRIBUTING.md](./CONTRIBUTING.md) | Contribution standards, GitFlow, PR process | +| [SECURITY.md](./SECURITY.md) | Security policy, vulnerability reporting, known limitations | +| [docs/ARCHITECTURE.md](./docs/ARCHITECTURE.md) | Deep technical architecture, component inventory, data flows | +| [docs/API_REFERENCE.md](./docs/API_REFERENCE.md) | Complete gRPC API contract, message types, client examples | +| [docs/RUNBOOK.md](./docs/RUNBOOK.md) | Operational playbooks, health checks, incident response | +| [docs/AVAP_CHUNKER_CONFIG.md](./docs/AVAP_CHUNKER_CONFIG.md) | `avap_config.json` reference — blocks, statements, semantic tags, how to extend | +| [docs/adr/](./docs/adr/) | Architecture Decision Records | + +--- + ## Security & Intellectual Property * **Data Privacy:** All LLM processing and vector searches are conducted within a private Kubernetes environment. 
* **Proprietary Technology:** This repository contains the **AVAP Technology** stack (101OBEX) and specialized training logic (MrHouston). Unauthorized distribution is prohibited. diff --git a/changelog b/changelog index 3d90407..2b55fff 100644 --- a/changelog +++ b/changelog @@ -4,6 +4,28 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th --- +## [1.5.1] - 2026-03-18 + +### Added +- DOCS: Created `docs/ARCHITECTURE.md` — full technical architecture reference covering component inventory, request lifecycle, LangGraph workflow, hybrid RAG pipeline, streaming design, evaluation pipeline, infrastructure layout, session memory, observability, and security boundaries. +- DOCS: Created `docs/API_REFERENCE.md` — complete gRPC API contract documentation with method descriptions, message type tables, error handling, and `grpcurl` client examples for all three RPCs (`AskAgent`, `AskAgentStream`, `EvaluateRAG`). +- DOCS: Created `docs/RUNBOOK.md` — operational playbook with health checks, startup/shutdown procedures, tunnel management, and incident playbooks for all known failure modes. +- DOCS: Created `SECURITY.md` — security policy covering transport security, authentication, secrets management, container security, data privacy, known limitations table, and vulnerability reporting process. +- DOCS: Created `docs/AVAP_CHUNKER_CONFIG.md` — full reference for `avap_config.json`: lexer fields, all 4 block definitions with regex breakdown, all 10 statement categories with ordering rationale, all 14 semantic tags with detection patterns, a worked example showing chunks produced from real AVAP code, and a step-by-step guide for adding new constructs. 
+ +### Changed +- DOCS: Fully rewrote `README.md` project structure tree — now reflects all files accurately including `openai_proxy.py`, `entrypoint.sh`, `golden_dataset.json`, `SECURITY.md`, `docs/ARCHITECTURE.md`, `docs/API_REFERENCE.md`, `docs/RUNBOOK.md`, `docs/adr/`, `avap_chunker.py`, `avap_config.json`, `ingestion/chunks.jsonl`, and `src/config.py`. +- DOCS: Added `Knowledge Base Ingestion` section to `README.md` documenting both ingestion pipelines in full: Pipeline A (Chonkie — `elasticsearch_ingestion.py`) with flow diagram, CLI usage, and chunk field table; Pipeline B (AVAP Native — `avap_chunker.py` + `avap_ingestor.py`) with flow diagram, chunk type table, semantic tags reference, and ingestor env vars. +- DOCS: Replaced minimal `Testing & Debugging` section with complete documentation of all three gRPC methods (`AskAgent`, `AskAgentStream`, `EvaluateRAG`) including expected responses, multi-turn example, and verdict thresholds. +- DOCS: Added `HTTP Proxy` section documenting all 7 HTTP endpoints (4 OpenAI + 3 Ollama), streaming vs non-streaming routing, `session_id` extension, and proxy env vars table. +- DOCS: Fixed `API Contract (Protobuf)` section — corrected `grpc_tools.protoc` paths and added reference to `docs/API_REFERENCE.md`. +- DOCS: Fixed remaining stale reference to `scripts/generate_mbpp_avap.py` in Dataset Generation section. +- DOCS: Added Documentation Index table to `README.md` linking all documentation files. +- DOCS: Updated `CONTRIBUTING.md` — added Section 9 (Architecture Decision Records) and updated PR checklist and doc policy table. +- ENV: Added missing variable documentation to `README.md`: `ELASTICSEARCH_USER`, `ELASTICSEARCH_PASSWORD`, `ELASTICSEARCH_API_KEY`, `ANTHROPIC_API_KEY`, `ANTHROPIC_MODEL`. 
+ +--- + ## [1.5.0] - 2026-03-12 ### Added diff --git a/docs/ADR/ADR-0001-grpc-primary-interface.md b/docs/ADR/ADR-0001-grpc-primary-interface.md new file mode 100644 index 0000000..6ffe5fa --- /dev/null +++ b/docs/ADR/ADR-0001-grpc-primary-interface.md @@ -0,0 +1,54 @@ +# ADR-0001: gRPC as the Primary Communication Interface + +**Date:** 2026-02-09 +**Status:** Accepted +**Deciders:** Rafael Ruiz (CTO, AVAP Technology), MrHouston Engineering + +--- + +## Context + +The Brunix Assistance Engine needs a communication protocol to serve AI completions from internal backend services and client applications. The primary requirement is **real-time token streaming** — the engine must forward Ollama's token output to clients with minimal latency, not buffer the full response. + +Secondary requirements: +- Strict API contract enforcement (no schema drift) +- High throughput for potential multi-client scenarios +- Easy introspection and testing in development + +Candidates evaluated: REST/HTTP+JSON, gRPC, WebSockets, GraphQL subscriptions. + +--- + +## Decision + +Use **gRPC with Protocol Buffers (proto3)** as the primary interface, exposed on port `50051` (container) / `50052` (host). + +The API contract is defined in a single source of truth: `Docker/protos/brunix.proto`. + +An **OpenAI-compatible HTTP proxy** (`openai_proxy.py`, port `8000`) is provided as a secondary interface to enable integration with standard tooling (continue.dev, LiteLLM, etc.) without modifying the core engine. 
+ +--- + +## Rationale + +| Criterion | REST+JSON | **gRPC** | WebSockets | +|---|---|---|---| +| Streaming support | Requires SSE or chunked | ✅ Native server-side streaming | ✅ Bidirectional | +| Schema enforcement | ❌ Optional (OpenAPI) | ✅ Enforced by protobuf | ❌ None | +| Code generation | Manual or OpenAPI tooling | ✅ Automatic stub generation | Manual | +| Performance | Good | ✅ Better (binary framing) | Good | +| Dev tooling | Excellent | Good (`grpcurl`, reflection) | Limited | +| Browser-native | ✅ Yes | ❌ Requires grpc-web proxy | ✅ Yes | + +gRPC was chosen because: (1) streaming is a first-class citizen, not bolted on; (2) the proto contract makes API evolution explicit and breaking changes detectable at compile time; (3) stub generation eliminates a class of integration bugs. + +The lack of browser-native support is not a concern — all current clients are server-side services or CLI tools. + +--- + +## Consequences + +- All API changes require modifying `brunix.proto` and regenerating stubs (`grpc_tools.protoc`). +- Client libraries must use the generated stubs or `grpcurl` — no curl-based ad-hoc testing of the main API. +- The OpenAI proxy adds a second entry point that must be kept in sync with the gRPC interface behavior. +- gRPC reflection is enabled in development. It should be evaluated for disabling in production to reduce the attack surface. diff --git a/docs/ADR/ADR-0002-two-phase-streaming.md b/docs/ADR/ADR-0002-two-phase-streaming.md new file mode 100644 index 0000000..39f7c39 --- /dev/null +++ b/docs/ADR/ADR-0002-two-phase-streaming.md @@ -0,0 +1,61 @@ +# ADR-0002: Two-Phase Streaming Design for `AskAgentStream` + +**Date:** 2026-03-05 +**Status:** Accepted +**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering + +--- + +## Context + +The initial `AskAgent` implementation calls `graph.invoke()` — LangGraph's synchronous execution — and returns the complete answer as a single gRPC message. 
This blocks the gRPC connection for the full generation time (typically 3–15 seconds) with no intermediate feedback to the client. + +A streaming variant is required that forwards Ollama's token output to the client as tokens are produced, enabling real-time rendering in client UIs. + +The straightforward approach would be to use LangGraph's own `graph.stream()` method. + +--- + +## Decision + +Implement `AskAgentStream` using a **two-phase design**: + +**Phase 1 — Graph-managed preparation:** +Run `build_prepare_graph()` (classify → reformulate → retrieve) via `prepare_graph.invoke()`. This phase runs synchronously and produces the full classified, reformulated query and retrieved context. It does **not** call the LLM for generation. + +**Phase 2 — Manual LLM streaming:** +Call `build_final_messages()` to reconstruct the exact prompt that the full graph would have used, then call `llm.stream(final_messages)` directly. Each token chunk is yielded immediately as an `AgentResponse`. + +A separate `build_prepare_graph()` function mirrors the routing logic of `build_graph()` but terminates at `END` before any generation node. A `build_final_messages()` function replicates the prompt-building logic of `generate`, `generate_code`, and `respond_conversational`. + +--- + +## Rationale + +### Why not use `graph.stream()`? + +LangGraph's `stream()` yields **state snapshots** at node boundaries, not LLM tokens. When using `llm.invoke()` inside a graph node, the invocation is atomic — there are no intermediate yields. To get per-token streaming from `llm.stream()`, the call must happen outside the graph. + +### Why not inline the streaming call inside a graph node? + +Yielding from inside a LangGraph node to an outer generator is architecturally complex and not idiomatic to LangGraph. It requires either a callback mechanism or breaking the node abstraction. 
+ +### Trade-offs + +| Concern | Two-phase design | Alternative (streaming inside graph) | +|---|---|---| +| Code duplication | Medium — routing logic exists in both graphs | Low | +| Architectural clarity | High — phases are clearly separated | Low | +| LangGraph compatibility | High — standard usage | Low — requires framework internals | +| Maintainability | Requires keeping `build_prepare_graph` and `build_final_messages` in sync with `build_graph` | Single source of routing truth | + +The duplication risk is accepted because: (1) the routing logic is simple (3 branches), (2) the prepare graph is strictly a subset of the full graph, and (3) both are tested via the same integration test queries. + +--- + +## Consequences + +- `graph.py` now exports three functions: `build_graph`, `build_prepare_graph`, `build_final_messages`. +- Any change to query routing logic in `build_graph` must be mirrored in `build_prepare_graph`. +- Any change to prompt selection in `generate` / `generate_code` / `respond_conversational` must be mirrored in `build_final_messages`. +- Session history persistence happens **after the stream ends**, not mid-stream. A client that disconnects early will cause history to not be saved for that turn. diff --git a/docs/ADR/ADR-0003-hybrid-retrieval-rrf.md b/docs/ADR/ADR-0003-hybrid-retrieval-rrf.md new file mode 100644 index 0000000..f9cfad9 --- /dev/null +++ b/docs/ADR/ADR-0003-hybrid-retrieval-rrf.md @@ -0,0 +1,63 @@ +# ADR-0003: Hybrid Retrieval (BM25 + kNN) with RRF Fusion + +**Date:** 2026-03-05 +**Status:** Accepted +**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering + +--- + +## Context + +The RAG pipeline needs a retrieval strategy for finding relevant AVAP documentation chunks from Elasticsearch. 
The knowledge base contains a mix of: + +- **Prose documentation** (explanations of AVAP concepts, commands, parameters) — benefits from semantic (dense) retrieval +- **Code examples and BNF grammar** (exact syntax patterns, function signatures) — benefits from lexical (sparse) retrieval, where exact token matches are critical + +A single retrieval strategy will underperform for one of these document types. + +--- + +## Decision + +Implement **hybrid retrieval** combining: +- **BM25** (Elasticsearch `multi_match` on `content^2` and `text^2` fields) for lexical relevance +- **kNN** (Elasticsearch `knn` on the `embedding` field) for semantic relevance +- **RRF (Reciprocal Rank Fusion)** with constant `k=60` to fuse rankings from both systems + +The fused top-8 documents are passed to the generation node as context. + +Query reformulation (`reformulate` node) runs before retrieval and rewrites the user query into keyword-optimized form to improve BM25 recall for AVAP-specific terminology. + +--- + +## Rationale + +### Why hybrid over pure semantic? + +AVAP is a domain-specific language with precise, non-negotiable syntax. For queries like "how does `addVar` work", exact lexical matching on the function name `addVar` is more reliable than semantic similarity, which may confuse similar-sounding functions or return contextually related but syntactically different commands. + +### Why hybrid over pure BM25? + +Conversational queries ("explain how loops work in AVAP", "what's the difference between addVar and setVar") benefit from semantic search that captures meaning beyond exact keyword overlap. + +### Why RRF over score normalization? + +BM25 and kNN scores are on different scales and distributions. Normalizing them requires careful calibration per index. RRF operates on ranks — not scores — making it robust to distribution differences and requiring no per-deployment tuning. The `k=60` constant is the standard literature value. 
+ +### Retrieval parameters + +| Parameter | Value | Rationale | +|---|---|---| +| `k` (top documents) | 8 | Balances context richness vs. context window length | +| `num_candidates` (kNN) | `k × 5 = 40` | Standard ES kNN oversampling ratio | +| BM25 fields | `content^2, text^2` | Boost content/text fields; `^2` emphasizes them over metadata | +| Fuzziness (BM25) | `AUTO` | Handles minor typos in AVAP function names | + +--- + +## Consequences + +- Retrieval requires two ES queries per request (BM25 + kNN). This is acceptable given the tunnel latency baseline already incurred. +- If either BM25 or kNN fails (e.g., embedding model unavailable), the system degrades gracefully: the failing component logs a warning and returns an empty list; RRF fusion proceeds with the available rankings. +- Context length grows with `k`. At `k=8` with typical chunk sizes (~300 tokens each), context is ~2400 tokens — within the `qwen2.5:1.5b` context window. +- Changing `k` has a direct impact on both retrieval quality and generation latency. Any change must be evaluated with `EvaluateRAG` before merging. diff --git a/docs/ADR/ADR-0004-claude-eval-judge.md b/docs/ADR/ADR-0004-claude-eval-judge.md new file mode 100644 index 0000000..b92a867 --- /dev/null +++ b/docs/ADR/ADR-0004-claude-eval-judge.md @@ -0,0 +1,54 @@ +# ADR-0004: Claude as the RAGAS Evaluation Judge + +**Date:** 2026-03-10 +**Status:** Accepted +**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering + +--- + +## Context + +The `EvaluateRAG` endpoint runs RAGAS metrics to measure the quality of the RAG pipeline. RAGAS metrics (`faithfulness`, `answer_relevancy`, `context_recall`, `context_precision`) require an LLM judge to score answers against ground truth and context. + +The production LLM is Ollama `qwen2.5:1.5b` — a small, locally-hosted model optimized for AVAP code generation speed. 
Using it as the evaluation judge creates a conflict of interest (scoring answers with the same model that generated them) and a quality concern (small models produce unreliable evaluation scores).
+- The `judge_model` field in `EvalResponse` records which Claude version was used, enabling score comparisons across model versions over time. +- If Anthropic's API is unreachable or rate-limited, `EvaluateRAG` will fail. This is acceptable since evaluation is a batch operation, not a real-time user-facing feature. +- Any change to `ANTHROPIC_MODEL` may alter scoring distributions. Historical eval scores are only comparable when the same judge model was used. diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md new file mode 100644 index 0000000..d6d3675 --- /dev/null +++ b/docs/API_REFERENCE.md @@ -0,0 +1,339 @@ +# Brunix Assistance Engine — API Reference + +> **Protocol:** gRPC (proto3) +> **Port:** `50052` (host) → `50051` (container) +> **Reflection:** Enabled — service introspection available via `grpcurl` +> **Source of truth:** `Docker/protos/brunix.proto` + +--- + +## Table of Contents + +1. [Service Definition](#1-service-definition) +2. [Methods](#2-methods) + - [AskAgent](#21-askagent) + - [AskAgentStream](#22-askagentstream) + - [EvaluateRAG](#23-evaluaterag) +3. [Message Types](#3-message-types) +4. [Error Handling](#4-error-handling) +5. [Client Examples](#5-client-examples) +6. [OpenAI-Compatible Proxy](#6-openai-compatible-proxy) + +--- + +## 1. Service Definition + +```protobuf +package brunix; + +service AssistanceEngine { + rpc AskAgent (AgentRequest) returns (stream AgentResponse); + rpc AskAgentStream (AgentRequest) returns (stream AgentResponse); + rpc EvaluateRAG (EvalRequest) returns (EvalResponse); +} +``` + +Both `AskAgent` and `AskAgentStream` return a **server-side stream** of `AgentResponse` messages. They differ in how they produce and deliver the response — see [§2.1](#21-askagent) and [§2.2](#22-askagentstream). + +--- + +## 2. Methods + +### 2.1 `AskAgent` + +**Behaviour:** Runs the full LangGraph pipeline (classify → reformulate → retrieve → generate) using `llm.invoke()`. 
Returns the complete answer as a **single** `AgentResponse` message with `is_final = true`. + +**Use case:** Clients that do not support streaming or need a single atomic response. + +**Request:** + +```protobuf +message AgentRequest { + string query = 1; // The user's question. Required. Max recommended: 4096 chars. + string session_id = 2; // Conversation session identifier. Optional. + // If empty, defaults to "default" (shared session). + // Use a UUID per user/conversation for isolation. +} +``` + +**Response stream:** + +| Message # | `text` | `avap_code` | `is_final` | +|---|---|---|---| +| 1 (only) | Full answer text | `"AVAP-2026"` | `true` | + +**Latency characteristics:** Depends on LLM generation time (non-streaming). Typically 3–15 seconds for `qwen2.5:1.5b` on the Devaron cluster. + +--- + +### 2.2 `AskAgentStream` + +**Behaviour:** Runs `prepare_graph` (classify → reformulate → retrieve), then calls `llm.stream()` directly. Emits one `AgentResponse` per token from Ollama, followed by a terminal message. + +**Use case:** Interactive clients (chat UIs, terminal tools) that need progressive rendering. + +**Request:** Same `AgentRequest` as `AskAgent`. + +**Response stream:** + +| Message # | `text` | `avap_code` | `is_final` | +|---|---|---|---| +| 1…N | Single token | `""` | `false` | +| N+1 (final) | `""` | `""` | `true` | + +**Client contract:** +- Accumulate `text` from all messages where `is_final == false` to reconstruct the full answer. +- The `is_final == true` message signals end-of-stream. Its `text` is always empty and should be discarded. +- Do not close the stream early — the engine will fail to persist conversation history if the stream is interrupted. + +--- + +### 2.3 `EvaluateRAG` + +**Behaviour:** Runs the RAGAS evaluation pipeline against the golden dataset. Uses the production Ollama LLM for answer generation and Claude as the evaluation judge. + +> **Requirement:** `ANTHROPIC_API_KEY` must be configured in the environment. 
This endpoint will return an error response if it is missing. + +**Request:** + +```protobuf +message EvalRequest { + string category = 1; // Optional. Filter golden dataset by category name. + // If empty, all categories are evaluated. + int32 limit = 2; // Optional. Evaluate only the first N questions. + // If 0, all matching questions are evaluated. + string index = 3; // Optional. Elasticsearch index to evaluate against. + // If empty, uses the server's configured ELASTICSEARCH_INDEX. +} +``` + +**Response (single, non-streaming):** + +```protobuf +message EvalResponse { + string status = 1; // "ok" or error description + int32 questions_evaluated = 2; // Number of questions actually processed + float elapsed_seconds = 3; // Total wall-clock time + string judge_model = 4; // Claude model used as judge + string index = 5; // Elasticsearch index evaluated + + // RAGAS metric scores (0.0 – 1.0) + float faithfulness = 6; + float answer_relevancy = 7; + float context_recall = 8; + float context_precision = 9; + + float global_score = 10; // Mean of non-zero metric scores + string verdict = 11; // "EXCELLENT" | "ACCEPTABLE" | "INSUFFICIENT" + + repeated QuestionDetail details = 12; +} + +message QuestionDetail { + string id = 1; // Question ID from golden dataset + string category = 2; // Question category + string question = 3; // Question text + string answer_preview = 4; // First 300 chars of generated answer + int32 n_chunks = 5; // Number of context chunks retrieved +} +``` + +**Verdict thresholds:** + +| Score | Verdict | +|---|---| +| ≥ 0.80 | `EXCELLENT` | +| ≥ 0.60 | `ACCEPTABLE` | +| < 0.60 | `INSUFFICIENT` | + +--- + +## 3. Message Types + +### `AgentRequest` + +| Field | Type | Required | Description | +|---|---|---|---| +| `query` | `string` | Yes | User's natural language question | +| `session_id` | `string` | No | Conversation identifier for multi-turn context. Use a stable UUID per user session. 
| + +### `AgentResponse` + +| Field | Type | Description | +|---|---|---| +| `text` | `string` | Token text (streaming) or full answer text (non-streaming) | +| `avap_code` | `string` | Currently always `"AVAP-2026"` in non-streaming mode, empty in streaming | +| `is_final` | `bool` | `true` only on the last message of the stream | + +### `EvalRequest` + +| Field | Type | Required | Default | Description | +|---|---|---|---|---| +| `category` | `string` | No | `""` (all) | Filter golden dataset by category | +| `limit` | `int32` | No | `0` (all) | Max questions to evaluate | +| `index` | `string` | No | `$ELASTICSEARCH_INDEX` | ES index to evaluate | + +### `EvalResponse` + +See full definition in [§2.3](#23-evaluaterag). + +--- + +## 4. Error Handling + +The engine catches all exceptions and returns them as terminal `AgentResponse` messages rather than gRPC status errors. This means: + +- The stream will **not** be terminated with a non-OK gRPC status code on application-level errors. +- Check for error strings in the `text` field that begin with `[ENG] Error:`. +- The stream will still end with `is_final = true`. + +**Example error response:** +```json +{"text": "[ENG] Error: Connection refused connecting to Ollama", "is_final": true} +``` + +**`EvaluateRAG` error response:** +Returned as a single `EvalResponse` with `status` set to the error description: +```json +{"status": "ANTHROPIC_API_KEY no configurada en .env", ...} +``` + +--- + +## 5. 
Client Examples + +### Introspect the service + +```bash +grpcurl -plaintext localhost:50052 list +# Output: brunix.AssistanceEngine + +grpcurl -plaintext localhost:50052 describe brunix.AssistanceEngine +``` + +### `AskAgent` — full response + +```bash +grpcurl -plaintext \ + -d '{"query": "What is addVar in AVAP?", "session_id": "dev-001"}' \ + localhost:50052 \ + brunix.AssistanceEngine/AskAgent +``` + +Expected response: +```json +{ + "text": "addVar is an AVAP command that declares a new variable...", + "avap_code": "AVAP-2026", + "is_final": true +} +``` + +### `AskAgentStream` — token streaming + +```bash +grpcurl -plaintext \ + -d '{"query": "Write an AVAP API that returns hello world", "session_id": "dev-001"}' \ + localhost:50052 \ + brunix.AssistanceEngine/AskAgentStream +``` + +Expected response (truncated): +```json +{"text": "Here", "is_final": false} +{"text": " is", "is_final": false} +{"text": " a", "is_final": false} +... +{"text": "", "is_final": true} +``` + +### `EvaluateRAG` — run evaluation + +```bash +# Evaluate first 10 questions from the "core_syntax" category +grpcurl -plaintext \ + -d '{"category": "core_syntax", "limit": 10}' \ + localhost:50052 \ + brunix.AssistanceEngine/EvaluateRAG +``` + +Expected response: +```json +{ + "status": "ok", + "questions_evaluated": 10, + "elapsed_seconds": 142.3, + "judge_model": "claude-sonnet-4-20250514", + "index": "avap-docs-test", + "faithfulness": 0.8421, + "answer_relevancy": 0.7913, + "context_recall": 0.7234, + "context_precision": 0.6891, + "global_score": 0.7615, + "verdict": "ACCEPTABLE", + "details": [...] 
+} +``` + +### Multi-turn conversation example + +```bash +# Turn 1 +grpcurl -plaintext \ + -d '{"query": "What is registerEndpoint?", "session_id": "user-abc"}' \ + localhost:50052 brunix.AssistanceEngine/AskAgentStream + +# Turn 2 — the engine has history from Turn 1 +grpcurl -plaintext \ + -d '{"query": "Can you show me an example?", "session_id": "user-abc"}' \ + localhost:50052 brunix.AssistanceEngine/AskAgentStream +``` + +### Regenerate gRPC stubs after modifying `brunix.proto` + +```bash +python -m grpc_tools.protoc \ + -I./Docker/protos \ + --python_out=./Docker/src \ + --grpc_python_out=./Docker/src \ + ./Docker/protos/brunix.proto +``` + +--- + +## 6. OpenAI-Compatible Proxy + +The container also exposes an HTTP server on port `8000` (`openai_proxy.py`) that wraps `AskAgentStream` under an OpenAI-compatible endpoint. This allows integration with any tool that supports the OpenAI Chat Completions API. + +**Base URL:** `http://localhost:8000` + +### `POST /v1/chat/completions` + +**Request body:** + +```json +{ + "model": "brunix", + "messages": [ + {"role": "user", "content": "What is addVar in AVAP?"} + ], + "stream": true +} +``` + +**Notes:** +- The `model` field is ignored; the engine always uses the configured `OLLAMA_MODEL_NAME`. +- Session management is handled internally by the proxy. Conversation continuity across separate HTTP requests is not guaranteed. +- Only `stream: true` is fully supported. Non-streaming mode may be available but is not the primary use case. 
+ +**Example with curl:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "brunix", + "messages": [{"role": "user", "content": "Explain AVAP loops"}], + "stream": true + }' +``` diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..4408927 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,463 @@ +# Brunix Assistance Engine — Architecture Reference + +> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment. +> **Last updated:** 2026-03-18 +> **Version:** 1.5.x + +--- + +## Table of Contents + +1. [System Overview](#1-system-overview) +2. [Component Inventory](#2-component-inventory) +3. [Request Lifecycle](#3-request-lifecycle) +4. [LangGraph Workflow](#4-langgraph-workflow) +5. [RAG Pipeline — Hybrid Search](#5-rag-pipeline--hybrid-search) +6. [Streaming Architecture (AskAgentStream)](#6-streaming-architecture-askagentstream) +7. [Evaluation Pipeline (EvaluateRAG)](#7-evaluation-pipeline-evaluaterag) +8. [Data Ingestion Pipeline](#8-data-ingestion-pipeline) +9. [Infrastructure Layout](#9-infrastructure-layout) +10. [Session State & Conversation Memory](#10-session-state--conversation-memory) +11. [Observability Stack](#11-observability-stack) +12. [Security Boundaries](#12-security-boundaries) +13. [Known Limitations & Future Work](#13-known-limitations--future-work) + +--- + +## 1. System Overview + +The **Brunix Assistance Engine** is a stateful, streaming-capable AI service that answers questions about the AVAP programming language. 
It combines: + +- **gRPC** as the primary communication interface (port `50051` inside container, `50052` on host) +- **LangGraph** for deterministic, multi-step agentic orchestration +- **Hybrid RAG** (BM25 + kNN with RRF fusion) over an Elasticsearch vector index +- **Ollama** as the local LLM and embedding backend +- **RAGAS + Claude** as the automated evaluation judge + +A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI/Uvicorn, enabling integration with tools that expect the OpenAI API format. + +``` +┌─────────────────────────────────────────────────────────────┐ +│ External Clients │ +│ grpcurl / App SDK │ OpenAI-compatible client │ +└────────────┬────────────────┴──────────────┬────────────────┘ + │ gRPC :50052 │ HTTP :8000 + ▼ ▼ +┌────────────────────────────────────────────────────────────┐ +│ Docker Container │ +│ │ +│ ┌─────────────────────┐ ┌──────────────────────────┐ │ +│ │ server.py (gRPC) │ │ openai_proxy.py (HTTP) │ │ +│ │ BrunixEngine │ │ FastAPI / Uvicorn │ │ +│ └──────────┬──────────┘ └──────────────────────────┘ │ +│ │ │ +│ ┌──────────▼──────────────────────────────────────────┐ │ +│ │ LangGraph Orchestration │ │ +│ │ classify → reformulate → retrieve → generate │ │ +│ └──────────────────────────┬───────────────────────────┘ │ +│ │ │ +│ ┌───────────────────┼────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ Ollama (LLM) Ollama (Embed) Elasticsearch │ +│ via tunnel via tunnel via tunnel │ +└────────────────────────────────────────────────────────────┘ + │ kubectl port-forward tunnels │ + ▼ ▼ + Devaron Cluster (Vultr Kubernetes) + ollama-light-service:11434 brunix-vector-db:9200 + brunix-postgres:5432 Langfuse UI +``` + +--- + +## 2. Component Inventory + +| Component | File / Service | Responsibility | +|---|---|---| +| **gRPC Server** | `Docker/src/server.py` | Entry point. Implements the `AssistanceEngine` servicer. Initializes LLM, embeddings, ES client, and both graphs. 
| +| **Full Graph** | `Docker/src/graph.py` → `build_graph()` | Complete workflow: classify → reformulate → retrieve → generate. Used by `AskAgent` and `EvaluateRAG`. | +| **Prepare Graph** | `Docker/src/graph.py` → `build_prepare_graph()` | Partial workflow: classify → reformulate → retrieve. Does **not** call the LLM for generation. Used by `AskAgentStream` to enable manual token streaming. | +| **Message Builder** | `Docker/src/graph.py` → `build_final_messages()` | Reconstructs the final prompt list from prepared state for `llm.stream()`. | +| **Prompt Library** | `Docker/src/prompts.py` | Centralized definitions for `CLASSIFY`, `REFORMULATE`, `GENERATE`, `CODE_GENERATION`, and `CONVERSATIONAL` prompts. | +| **Agent State** | `Docker/src/state.py` | `AgentState` TypedDict shared across all graph nodes. | +| **Evaluation Suite** | `Docker/src/evaluate.py` | RAGAS-based pipeline. Uses the production retriever + Ollama LLM for generation, and Claude as the impartial judge. | +| **OpenAI Proxy** | `Docker/src/openai_proxy.py` | FastAPI application that wraps `AskAgentStream` under an `/v1/chat/completions` endpoint. | +| **LLM Factory** | `Docker/src/utils/llm_factory.py` | Provider-agnostic factory for chat models (Ollama, AWS Bedrock). | +| **Embedding Factory** | `Docker/src/utils/emb_factory.py` | Provider-agnostic factory for embedding models (Ollama, HuggingFace). | +| **Ingestion Pipeline** | `scripts/pipelines/flows/elasticsearch_ingestion.py` | Chunks and ingests AVAP documents into Elasticsearch with embeddings. | +| **Dataset Generator** | `scripts/pipelines/flows/generate_mbap.py` | Generates synthetic MBPP-style AVAP problems using Claude. | +| **MBPP Translator** | `scripts/pipelines/flows/translate_mbpp.py` | Translates MBPP Python dataset into AVAP equivalents. | + +--- + +## 3. 
Request Lifecycle + +### 3.1 `AskAgent` (non-streaming) + +``` +Client → gRPC AgentRequest{query, session_id} + │ + ├─ Load conversation history from session_store[session_id] + ├─ Build initial_state = {messages: history + [user_msg], ...} + │ + └─ graph.invoke(initial_state) + ├─ classify → query_type ∈ {RETRIEVAL, CODE_GENERATION, CONVERSATIONAL} + ├─ reformulate → reformulated_query (keyword-optimized for semantic search) + ├─ retrieve → context (top-8 hybrid RRF chunks from Elasticsearch) + └─ generate → final AIMessage (llm.invoke) + │ + ├─ Persist updated history to session_store[session_id] + └─ yield AgentResponse{text, avap_code="AVAP-2026", is_final=True} +``` + +### 3.2 `AskAgentStream` (token streaming) + +``` +Client → gRPC AgentRequest{query, session_id} + │ + ├─ Load history from session_store[session_id] + ├─ Build initial_state + │ + ├─ prepare_graph.invoke(initial_state) ← Phase 1: no LLM generation + │ ├─ classify + │ ├─ reformulate + │ └─ retrieve (or skip_retrieve if CONVERSATIONAL) + │ + ├─ build_final_messages(prepared_state) ← Reconstruct prompt list + │ + └─ for chunk in llm.stream(final_messages): + └─ yield AgentResponse{text=token, is_final=False} + │ + ├─ Persist full assembled response to session_store + └─ yield AgentResponse{text="", is_final=True} +``` + +### 3.3 `EvaluateRAG` + +``` +Client → gRPC EvalRequest{category?, limit?, index?} + │ + └─ evaluate.run_evaluation(...) + ├─ Load golden_dataset.json + ├─ Filter by category / limit + ├─ For each question: + │ ├─ retrieve_context (hybrid BM25+kNN, same as production) + │ └─ generate_answer (Ollama LLM + GENERATE_PROMPT) + ├─ Build RAGAS Dataset + ├─ Run RAGAS metrics with Claude as judge: + │ faithfulness / answer_relevancy / context_recall / context_precision + └─ Compute global_score + verdict (EXCELLENT / ACCEPTABLE / INSUFFICIENT) + │ + └─ return EvalResponse{scores, global_score, verdict, details[]} +``` + +--- + +## 4. 
LangGraph Workflow + +### 4.1 Full Graph (`build_graph`) + +``` + ┌─────────────┐ + │ classify │ + └──────┬──────┘ + │ + ┌────────────────┼──────────────────┐ + ▼ ▼ ▼ + RETRIEVAL CODE_GENERATION CONVERSATIONAL + │ │ │ + └────────┬───────┘ │ + ▼ ▼ + ┌──────────────┐ ┌────────────────────────┐ + │ reformulate │ │ respond_conversational │ + └──────┬───────┘ └───────────┬────────────┘ + ▼ │ + ┌──────────────┐ │ + │ retrieve │ │ + └──────┬───────┘ │ + │ │ + ┌────────┴───────────┐ │ + ▼ ▼ │ + ┌──────────┐ ┌───────────────┐ │ + │ generate │ │ generate_code │ │ + └────┬─────┘ └───────┬───────┘ │ + │ │ │ + └────────────────────┴────────────────┘ + │ + END +``` + +### 4.2 Prepare Graph (`build_prepare_graph`) + +Identical routing for classify, but generation nodes are replaced by `END`. The `CONVERSATIONAL` branch uses `skip_retrieve` (returns empty context without querying Elasticsearch). + +### 4.3 Query Type Routing + +| `query_type` | Triggers retrieve? | Generation prompt | +|---|---|---| +| `RETRIEVAL` | Yes | `GENERATE_PROMPT` (explanation-focused) | +| `CODE_GENERATION` | Yes | `CODE_GENERATION_PROMPT` (code-focused, returns AVAP blocks) | +| `CONVERSATIONAL` | No | `CONVERSATIONAL_PROMPT` (reformulation of prior answer) | + +--- + +## 5. RAG Pipeline — Hybrid Search + +The retrieval system (`hybrid_search_native`) fuses BM25 lexical search and kNN dense vector search using **Reciprocal Rank Fusion (RRF)**. + +``` +User query + │ + ├─ embeddings.embed_query(query) → query_vector [768-dim] + │ + ├─ ES multi_match (BM25) on fields [content^2, text^2] + │ └─ top-k BM25 hits + │ + └─ ES knn on field [embedding], num_candidates = k×5 + └─ top-k kNN hits + │ + ├─ RRF fusion: score(doc) = Σ 1/(rank + 60) + │ + └─ Top-8 documents → format_context() → context string +``` + +**RRF constant:** `60` (standard value; prevents high-rank documents from dominating while still rewarding consensus between both retrieval modes). 
+ +**Chunk metadata** attached to each retrieved document: + +| Field | Description | +|---|---| +| `chunk_id` | Unique identifier within the index | +| `source_file` | Origin document filename | +| `doc_type` | `prose`, `code`, `code_example`, `bnf` | +| `block_type` | AVAP block type: `function`, `if`, `startLoop`, `try` | +| `section` | Document section/chapter heading | + +Documents of type `code`, `code_example`, `bnf`, or block type `function / if / startLoop / try` are tagged as `[AVAP CODE]` in the formatted context, signaling the LLM to treat them as executable syntax rather than prose. + +--- + +## 6. Streaming Architecture (AskAgentStream) + +The two-phase streaming design is critical to understand: + +**Why not stream through LangGraph?** +LangGraph's `stream()` method yields full state snapshots per node, not individual tokens. To achieve true per-token streaming to the gRPC client, the generation step is deliberately extracted from the graph and called directly via `llm.stream()`. + +**Phase 1 — Deterministic preparation (graph-managed):** +- Classification, query reformulation, and retrieval run through `prepare_graph.invoke()`. +- This phase runs synchronously and produces the complete context before any token is emitted to the client. + +**Phase 2 — Token streaming (manual):** +- `build_final_messages()` reconstructs the exact prompt that `generate` / `generate_code` / `respond_conversational` would have used. +- `llm.stream(final_messages)` yields one `AIMessageChunk` per token from Ollama. +- Each token is immediately forwarded to the gRPC client as `AgentResponse{text=token, is_final=False}`. +- After the stream ends, the full assembled text is persisted to `session_store`. + +**Backpressure:** gRPC streaming is flow-controlled by the client. If the client stops reading, the Ollama token stream will block at the `yield` point. No explicit buffer overflow protection is implemented (acceptable for the current single-client dev mode). 
+ +--- + +## 7. Evaluation Pipeline (EvaluateRAG) + +The evaluation suite implements an **offline RAG evaluation** pattern using RAGAS metrics. + +### Judge model separation + +The production LLM (Ollama `qwen2.5:1.5b`) is used for **answer generation** — the same pipeline as production to measure real-world quality. Claude (`claude-sonnet-4-20250514`) is used as the **evaluation judge** — an independent, high-capability model that scores the generated answers against ground truth. + +### RAGAS metrics + +| Metric | Measures | Input | +|---|---|---| +| `faithfulness` | Are claims in the answer supported by the retrieved context? | answer + contexts | +| `answer_relevancy` | Is the answer relevant to the question? | answer + question | +| `context_recall` | Does the retrieved context cover the ground truth? | contexts + ground_truth | +| `context_precision` | Are the retrieved chunks useful (signal-to-noise)? | contexts + ground_truth | + +### Global score & verdict + +``` +global_score = mean(non-zero metric scores) + +verdict: + ≥ 0.80 → EXCELLENT + ≥ 0.60 → ACCEPTABLE + < 0.60 → INSUFFICIENT +``` + +### Golden dataset + +Located at `Docker/src/golden_dataset.json`. Each entry follows this schema: + +```json +{ + "id": "avap-001", + "category": "core_syntax", + "question": "How do you declare a variable in AVAP?", + "ground_truth": "Use addVar to declare a variable..." +} +``` + +--- + +## 8. 
Data Ingestion Pipeline + +Documents flow into the Elasticsearch index through two paths: + +### Path A — AVAP documentation (structured markdown) + +``` +docs/LRM/avap.md +docs/avap_language_github_docs/*.md +docs/developer.avapframework.com/*.md + │ + ▼ +scripts/pipelines/flows/elasticsearch_ingestion.py + │ + ├─ Load markdown files + ├─ Chunk using scripts/pipelines/tasks/chunk.py + │ (semantic chunking via Chonkie library) + ├─ Generate embeddings via scripts/pipelines/tasks/embeddings.py + │ (Ollama or HuggingFace embedding model) + └─ Bulk index into Elasticsearch + index: avap-docs-* (configurable via ELASTICSEARCH_INDEX) + mapping: {content, embedding, source_file, doc_type, section, ...} +``` + +### Path B — Synthetic AVAP code samples + +``` +docs/samples/*.avap + │ + ▼ +scripts/pipelines/flows/generate_mbap.py + │ + ├─ Read AVAP LRM (docs/LRM/avap.md) + ├─ Call Claude API to generate MBPP-style problems + └─ Output synthetic_datasets/mbpp_avap.json + (used for fine-tuning and few-shot examples) +``` + +--- + +## 9. 
Infrastructure Layout + +### Devaron Cluster (Vultr Kubernetes) + +| Service | K8s Name | Port | Purpose | +|---|---|---|---| +| LLM inference | `ollama-light-service` | `11434` | Text generation + embeddings | +| Vector database | `brunix-vector-db` | `9200` | Elasticsearch 8.x | +| Observability DB | `brunix-postgres` | `5432` | PostgreSQL for Langfuse | +| Langfuse UI | — | `80` | `http://45.77.119.180` | + +### Kubernetes tunnel commands + +```bash +# Terminal 1 — LLM +kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml + +# Terminal 2 — Elasticsearch +kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml + +# Terminal 3 — PostgreSQL (Langfuse) +kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml +``` + +### Port map summary + +| Port | Protocol | Service | Scope | +|---|---|---|---| +| `50051` | gRPC | Brunix Engine (inside container) | Internal | +| `50052` | gRPC | Brunix Engine (host-mapped) | External | +| `8000` | HTTP | OpenAI proxy | External | +| `11434` | HTTP | Ollama (via tunnel) | Tunnel | +| `9200` | HTTP | Elasticsearch (via tunnel) | Tunnel | +| `5432` | TCP | PostgreSQL/Langfuse (via tunnel) | Tunnel | + +--- + +## 10. Session State & Conversation Memory + +Conversation history is managed via an in-process dictionary: + +```python +session_store: dict[str, list] = defaultdict(list) +# key: session_id (string, provided by client) +# value: list of LangChain BaseMessage objects +``` + +**Characteristics:** +- **In-memory only.** History is lost on container restart. +- **No TTL or eviction.** Sessions grow unbounded for the lifetime of the process. 
+- **Thread safety:** Python's GIL provides basic safety for the `ThreadPoolExecutor(max_workers=10)` gRPC server, but concurrent writes to the same `session_id` from two simultaneous requests are not explicitly protected. +- **History window:** `format_history_for_classify()` uses only the last 6 messages for query classification to keep the classify prompt short and deterministic. + +> **Future work:** Replace `session_store` with a Redis-backed persistent store to survive restarts and support horizontal scaling. + +--- + +## 11. Observability Stack + +### Langfuse tracing + +The server integrates Langfuse for end-to-end LLM tracing. Every `AskAgent` / `AskAgentStream` request creates a trace that captures: +- Input query and session ID +- Each LangGraph node execution (classify, reformulate, retrieve, generate) +- LLM token counts, latency, and cost +- Final response + +**Access:** `http://45.77.119.180` — requires a project API key configured via `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`. + +### Logging + +Structured logging via Python's `logging` module, configured at `INFO` level. Log format: + +``` +[MODULE] context_info — key=value key=value +``` + +Key log markers: + +| Marker | Module | Meaning | +|---|---|---| +| `[ESEARCH]` | `server.py` | Elasticsearch connection status | +| `[classify]` | `graph.py` | Query type decision + raw LLM output | +| `[reformulate]` | `graph.py` | Reformulated query string | +| `[hybrid]` | `graph.py` | BM25 / kNN hit counts and RRF result count | +| `[retrieve]` | `graph.py` | Number of docs retrieved and context length | +| `[generate]` | `graph.py` | Response character count | +| `[AskAgentStream]` | `server.py` | Token count and total chars per stream | +| `[eval]` | `evaluate.py` | Per-question retrieval and generation status | + +--- + +## 12. Security Boundaries + +| Boundary | Current state | Risk | +|---|---|---| +| gRPC transport | **Insecure** (`add_insecure_port`) | Network interception possible. 
Acceptable in dev/tunnel setup; requires mTLS for production. | +| Elasticsearch auth | Optional (user/pass or API key via env vars) | Index is accessible without auth if `ELASTICSEARCH_USER` and `ELASTICSEARCH_API_KEY` are unset. | +| Container user | Non-root (`python:3.11-slim` default) | Low risk. Do not override with `root`. | +| Secrets in env | Via `.env` / `docker-compose` env injection | Never commit real values. See [CONTRIBUTING.md](../CONTRIBUTING.md#6-environment-variables-policy). | +| Session store | In-memory, no auth | Any caller with access to the gRPC port can read/write any session by guessing its ID. | +| Kubeconfig | `./kubernetes/kubeconfig.yaml` (local only) | Grants cluster access. Never commit. Listed in `.gitignore`. | + +--- + +## 13. Known Limitations & Future Work + +| Area | Limitation | Proposed solution | +|---|---|---| +| Session persistence | In-memory, lost on restart | Redis-backed `session_store` | +| Horizontal scaling | `session_store` is per-process | Sticky sessions or external session store | +| gRPC security | Insecure port | Add TLS + optional mTLS | +| Elasticsearch auth | Not enforced if vars unset | Make auth required; fail-fast on startup | +| Context window | Full history passed to generate; no truncation | Sliding window or summarization for long sessions | +| Evaluation | Golden dataset must be manually maintained | Automated golden dataset refresh pipeline | +| Rate limiting | None on gRPC server | Add interceptor-based rate limiter | +| Health check | No gRPC health protocol | Implement `grpc.health.v1` | diff --git a/docs/AVAP_CHUNKER_CONFIG.md b/docs/AVAP_CHUNKER_CONFIG.md new file mode 100644 index 0000000..011e0ec --- /dev/null +++ b/docs/AVAP_CHUNKER_CONFIG.md @@ -0,0 +1,372 @@ +# AVAP Chunker — Language Configuration Reference + +> **File:** `scripts/pipelines/ingestion/avap_config.json` +> **Used by:** `avap_chunker.py` (Pipeline B) +> **Last updated:** 2026-03-18 + +This file is the **grammar 
definition** for the AVAP language chunker. It tells `avap_chunker.py` how to tokenize, parse, and semantically classify `.avap` source files before they are embedded and ingested into Elasticsearch. Modifying this file changes what the chunker recognises as a block, a statement, or a semantic feature — and therefore what metadata every chunk in the knowledge base carries. + +--- + +## Table of Contents + +1. [Top-Level Fields](#1-top-level-fields) +2. [Lexer](#2-lexer) +3. [Blocks](#3-blocks) +4. [Statements](#4-statements) +5. [Semantic Tags](#5-semantic-tags) +6. [How They Work Together](#6-how-they-work-together) +7. [Adding New Constructs](#7-adding-new-constructs) +8. [Full Annotated Example](#8-full-annotated-example) + +--- + +## 1. Top-Level Fields + +```json +{ + "language": "avap", + "version": "1.0", + "file_extensions": [".avap"] +} +``` + +| Field | Type | Description | +|---|---|---| +| `language` | string | Human-readable language name. Used in chunker progress reports. | +| `version` | string | Config schema version. Increment when making breaking changes. | +| `file_extensions` | array of strings | File extensions the chunker will process. `.md` files are always processed regardless of this setting. | + +--- + +## 2. Lexer + +The lexer section controls how raw source lines are stripped of comments and string literals before pattern matching is applied. + +```json +"lexer": { + "string_delimiters": ["\"", "'"], + "escape_char": "\\", + "comment_line": ["///", "//"], + "comment_block": { "open": "/*", "close": "*/" }, + "line_oriented": true +} +``` + +| Field | Type | Description | +|---|---|---| +| `string_delimiters` | array of strings | Characters that open and close string literals. Content inside strings is ignored during pattern matching. | +| `escape_char` | string | Character used to escape the next character inside a string. Prevents `\"` from closing the string. 
| +| `comment_line` | array of strings | Line comment prefixes, evaluated longest-first. Everything after the matched prefix is stripped. AVAP supports both `///` (documentation comments) and `//` (inline comments). | +| `comment_block.open` | string | Block comment opening delimiter. | +| `comment_block.close` | string | Block comment closing delimiter. Content between `/*` and `*/` is stripped before pattern matching. | +| `line_oriented` | bool | When `true`, the lexer processes one line at a time. Should always be `true` for AVAP. | + +**Important:** Comment stripping and string boundary detection happen before any block or statement pattern is evaluated. A keyword inside a string literal or a comment will never trigger a block or statement match. + +--- + +## 3. Blocks + +Blocks are **multi-line constructs** with a defined opener and closer. The chunker tracks nesting depth — each opener increments depth, each closer decrements it, and the block ends when depth returns to zero. This correctly handles nested `if()` inside `function{}` and similar cases. + +Each block definition produces a chunk with `doc_type` as specified and `block_type` equal to the block `name`. + +```json +"blocks": [ + { + "name": "function", + "doc_type": "code", + "opener_pattern": "^\\s*function\\s+(\\w+)\\s*\\(([^)]*)", + "closer_pattern": "^\\s*\\}\\s*$", + "extract_signature": true, + "signature_template": "function {group1}({group2})" + }, + ... +] +``` + +### Block fields + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | string | Yes | Identifier for this block type. Used as `block_type` in the chunk metadata and in the `semantic_overlap` context header. | +| `doc_type` | string | Yes | Elasticsearch `doc_type` field value for chunks from this block. | +| `opener_pattern` | regex string | Yes | Pattern matched against the clean (comment-stripped) line to detect the start of this block. Must be anchored at the start (`^`). 
| +| `closer_pattern` | regex string | Yes | Pattern matched to detect the end of this block. Checked at every line after the opener. | +| `extract_signature` | bool | No (default: `false`) | When `true`, the chunker extracts a compact signature string from the opener line using capture groups, and creates an additional `function_signature` chunk alongside the full block chunk. | +| `signature_template` | string | No | Template for the signature string. Uses `{group1}`, `{group2}`, etc. as placeholders for the regex capture groups from `opener_pattern`. | + +### Current block definitions + +#### `function` + +``` +opener: ^\\s*function\\s+(\\w+)\\s*\\(([^)]*) +closer: ^\\s*\\}\\s*$ +``` + +Matches any top-level or nested AVAP function declaration. The two capture groups extract the function name (`group1`) and parameter list (`group2`), which are combined into the signature template `function {group1}({group2})`. + +Because `extract_signature: true`, every function produces **two chunks**: +1. A `doc_type: "code"`, `block_type: "function"` chunk containing the full function body. +2. A `doc_type: "function_signature"`, `block_type: "function_signature"` chunk containing only the signature string (e.g. `function validateAccess(userId, token)`). This lightweight chunk is indexed separately to enable fast function-name lookup without retrieving the entire body. + +Additionally, the function signature is registered in the `SemanticOverlapBuffer`. Subsequent non-function chunks in the same file will receive the current function signature prepended as a context comment (`// contexto: function validateAccess(userId, token)`), keeping the surrounding code semantically grounded. + +#### `if` + +``` +opener: ^\\s*if\\s*\\( +closer: ^\\s*end\\s*\\(\\s*\\) +``` + +Matches AVAP conditional blocks. Note: AVAP uses `end()` as the closer, not `}`. + +#### `startLoop` + +``` +opener: ^\\s*startLoop\\s*\\( +closer: ^\\s*endLoop\\s*\\(\\s*\\) +``` + +Matches AVAP iteration blocks. 
The closer is `endLoop()`. + +#### `try` + +``` +opener: ^\\s*try\\s*\\(\\s*\\) +closer: ^\\s*end\\s*\\(\\s*\\) +``` + +Matches AVAP error-handling blocks (`try()` … `end()`). + +--- + +## 4. Statements + +Statements are **single-line constructs**. Lines that are not part of any block opener or closer are classified against the statement patterns in order. The first match wins. If no pattern matches, the statement is classified as `"statement"` (the fallback). + +Consecutive lines with the same statement type are **grouped into a single chunk**, keeping semantically related statements together. When the statement type changes, the current group is flushed as a chunk. + +```json +"statements": [ + { "name": "registerEndpoint", "pattern": "^\\s*registerEndpoint\\s*\\(" }, + { "name": "addVar", "pattern": "^\\s*addVar\\s*\\(" }, + ... +] +``` + +### Statement fields + +| Field | Type | Description | +|---|---|---| +| `name` | string | Used as `block_type` in the chunk metadata. | +| `pattern` | regex string | Matched against the clean line. First match wins — order matters. 
| + +### Current statement definitions + +| Name | Matches | AVAP commands | +|---|---|---| +| `registerEndpoint` | API route registration | `registerEndpoint(...)` | +| `addVar` | Variable declaration | `addVar(...)` | +| `io_command` | Input/output operations | `addParam`, `getListLen`, `addResult`, `getQueryParamList` | +| `http_command` | HTTP client calls | `RequestPost`, `RequestGet` | +| `orm_command` | Database ORM operations | `ormDirect`, `ormCheckTable`, `ormCreateTable`, `ormAccessSelect`, `ormAccessInsert`, `ormAccessUpdate` | +| `util_command` | Utility and helper functions | `variableToList`, `itemFromList`, `variableFromJSON`, `AddVariableToJSON`, `encodeSHA256`, `encodeMD5`, `getRegex`, `getDateTime`, `stampToDatetime`, `getTimeStamp`, `randomString`, `replace` | +| `async_command` | Concurrency primitives | `x = go funcName(`, `gather(` | +| `connector` | External service connector | `x = avapConnector(` | +| `modularity` | Module imports | `import`, `include` | +| `assignment` | Variable assignment (catch-all before fallback) | `x = ...` | + +**Ordering note:** `registerEndpoint`, `addVar`, and the specific command categories are listed before `assignment` intentionally. `assignment` would match many of them (they all contain `=` or are function calls that could follow an assignment), so the more specific patterns must come first. + +--- + +## 5. Semantic Tags + +Semantic tags are **boolean metadata flags** applied to every chunk (both blocks and statements) by scanning the entire chunk content with a regex. A chunk can have multiple tags simultaneously. + +The `complexity` field is automatically computed as the count of `true` tags in a chunk's metadata, providing a rough signal of how much AVAP functionality a given chunk exercises. + +```json +"semantic_tags": [ + { "tag": "uses_orm", "pattern": "\\b(ormDirect|ormAccessSelect|...)\\s*\\(" }, + ... 
+] +``` + +### Tag fields + +| Field | Description | +|---|---| +| `tag` | Key name in the `metadata` object stored in Elasticsearch. Value is always `true` when present. | +| `pattern` | Regex searched (not matched) across the full chunk text. Uses `\b` word boundaries to avoid false positives. | + +### Current semantic tags + +| Tag | Detected when chunk contains | +|---|---| +| `uses_orm` | Any ORM command: `ormDirect`, `ormCheckTable`, `ormCreateTable`, `ormAccessSelect`, `ormAccessInsert`, `ormAccessUpdate` | +| `uses_http` | HTTP client calls: `RequestPost`, `RequestGet` | +| `uses_connector` | External connector: `avapConnector(` | +| `uses_async` | Concurrency: `go funcName(` or `gather(` | +| `uses_crypto` | Hashing/encoding: `encodeSHA256(`, `encodeMD5(` | +| `uses_auth` | Auth-related commands: `addParam`, `_status` | +| `uses_error_handling` | Error handling block: `try()` | +| `uses_loop` | Loop construct: `startLoop(` | +| `uses_json` | JSON operations: `variableFromJSON(`, `AddVariableToJSON(` | +| `uses_list` | List operations: `variableToList(`, `itemFromList(`, `getListLen(` | +| `uses_regex` | Regular expressions: `getRegex(` | +| `uses_datetime` | Date/time operations: `getDateTime(`, `getTimeStamp(`, `stampToDatetime(` | +| `returns_result` | Returns data to the API caller: `addResult(` | +| `registers_endpoint` | Defines an API route: `registerEndpoint(` | + +**How tags are used at retrieval time:** The Elasticsearch mapping stores each tag as a `boolean` field under the `metadata` object. This enables filtered retrieval — for example, a future retrieval enhancement could boost chunks with `metadata.uses_orm: true` for queries that contain ORM-related keywords, improving precision for database-related questions. + +--- + +## 6. 
How They Work Together + +The following example shows how `avap_chunker.py` processes a real `.avap` file using this config: + +```avap +// Validate user session +function validateAccess(userId, token) { + addVar(isValid = false) + addParam(userId) + try() + ormAccessSelect(users, id = userId) + addVar(isValid = true) + end() + addResult(isValid) +} + +registerEndpoint(POST, /validate) +``` + +**Chunks produced:** + +| # | `doc_type` | `block_type` | Content | Tags | +|---|---|---|---|---| +| 1 | `code` | `function` | Full function body (lines 2–10) | `uses_auth`, `uses_orm`, `uses_error_handling`, `returns_result` · `complexity: 4` | +| 2 | `function_signature` | `function_signature` | `function validateAccess(userId, token)` | — | +| 3 | `code` | `registerEndpoint` | `registerEndpoint(POST, /validate)` | `registers_endpoint` · `complexity: 1` | + +Chunk 3 also receives the function signature as a semantic overlap header because the `SemanticOverlapBuffer` tracks `validateAccess` and injects it as context into any subsequent non-function chunks in the same file. + +--- + +## 7. Adding New Constructs + +### Adding a new block type + +1. Identify the opener and closer patterns from the AVAP LRM (`docs/LRM/avap.md`). +2. Add an entry to `"blocks"` in `avap_config.json`. +3. If the block introduces a named construct worth indexing independently (like functions), set `"extract_signature": true` and define a `"signature_template"`. +4. Run a smoke test on a representative `.avap` file: + ```bash + python scripts/pipelines/ingestion/avap_chunker.py \ + --lang-config scripts/pipelines/ingestion/avap_config.json \ + --docs-path docs/samples \ + --output /tmp/test_chunks.jsonl \ + --no-dedup + ``` +5. Inspect `/tmp/test_chunks.jsonl` and verify the new `block_type` appears with the expected content. +6. Re-run the ingestion pipeline to rebuild the index. + +### Adding a new statement category + +1. Add an entry to `"statements"` **before** the `assignment` catch-all. +2. 
Use `^\\s*` to anchor the pattern at the start of the line. +3. Test as above — verify the new `block_type` appears in the JSONL output. + +### Adding a new semantic tag + +1. Add an entry to `"semantic_tags"`. +2. Use `\\b` word boundaries to prevent false positives on substrings. +3. Add the new tag as a `boolean` field to the Elasticsearch index mapping in `avap_ingestor.py` (`build_index_mapping()`). +4. **Re-index from scratch** — existing documents will not have the new tag unless the index is rebuilt (`--delete` flag). + +--- + +## 8. Full Annotated Example + +```jsonc +{ + // Identifies this config as the AVAP v1.0 grammar + "language": "avap", + "version": "1.0", + "file_extensions": [".avap"], // Only .avap files; .md is always included + + "lexer": { + "string_delimiters": ["\"", "'"], // Both quote styles used in AVAP + "escape_char": "\\", + "comment_line": ["///", "//"], // /// first — longest match wins + "comment_block": { "open": "/*", "close": "*/" }, + "line_oriented": true + }, + + "blocks": [ + { + "name": "function", + "doc_type": "code", + // Captures: group1=name, group2=params + "opener_pattern": "^\\s*function\\s+(\\w+)\\s*\\(([^)]*)", + "closer_pattern": "^\\s*\\}\\s*$", // AVAP functions close with } + "extract_signature": true, + "signature_template": "function {group1}({group2})" + }, + { + "name": "if", + "doc_type": "code", + "opener_pattern": "^\\s*if\\s*\\(", + "closer_pattern": "^\\s*end\\s*\\(\\s*\\)" // AVAP if closes with end() + }, + { + "name": "startLoop", + "doc_type": "code", + "opener_pattern": "^\\s*startLoop\\s*\\(", + "closer_pattern": "^\\s*endLoop\\s*\\(\\s*\\)" + }, + { + "name": "try", + "doc_type": "code", + "opener_pattern": "^\\s*try\\s*\\(\\s*\\)", + "closer_pattern": "^\\s*end\\s*\\(\\s*\\)" // try also closes with end() + } + ], + + "statements": [ + // Specific patterns first — must come before the generic "assignment" catch-all + { "name": "registerEndpoint", "pattern": "^\\s*registerEndpoint\\s*\\(" }, + { 
"name": "addVar", "pattern": "^\\s*addVar\\s*\\(" }, + { "name": "io_command", "pattern": "^\\s*(addParam|getListLen|addResult|getQueryParamList)\\s*\\(" }, + { "name": "http_command", "pattern": "^\\s*(RequestPost|RequestGet)\\s*\\(" }, + { "name": "orm_command", "pattern": "^\\s*(ormDirect|ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" }, + { "name": "util_command", "pattern": "^\\s*(variableToList|itemFromList|variableFromJSON|AddVariableToJSON|encodeSHA256|encodeMD5|getRegex|getDateTime|stampToDatetime|getTimeStamp|randomString|replace)\\s*\\(" }, + { "name": "async_command", "pattern": "^\\s*(\\w+\\s*=\\s*go\\s+|gather\\s*\\()" }, + { "name": "connector", "pattern": "^\\s*\\w+\\s*=\\s*avapConnector\\s*\\(" }, + { "name": "modularity", "pattern": "^\\s*(import|include)\\s+" }, + { "name": "assignment", "pattern": "^\\s*\\w+\\s*=\\s*" } // catch-all + ], + + "semantic_tags": [ + // Applied to every chunk by full-content regex search (not line-by-line) + { "tag": "uses_orm", "pattern": "\\b(ormDirect|ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" }, + { "tag": "uses_http", "pattern": "\\b(RequestPost|RequestGet)\\s*\\(" }, + { "tag": "uses_connector", "pattern": "\\bavapConnector\\s*\\(" }, + { "tag": "uses_async", "pattern": "\\bgo\\s+\\w+\\s*\\(|\\bgather\\s*\\(" }, + { "tag": "uses_crypto", "pattern": "\\b(encodeSHA256|encodeMD5)\\s*\\(" }, + { "tag": "uses_auth", "pattern": "\\b(addParam|_status)\\b" }, + { "tag": "uses_error_handling", "pattern": "\\btry\\s*\\(\\s*\\)" }, + { "tag": "uses_loop", "pattern": "\\bstartLoop\\s*\\(" }, + { "tag": "uses_json", "pattern": "\\b(variableFromJSON|AddVariableToJSON)\\s*\\(" }, + { "tag": "uses_list", "pattern": "\\b(variableToList|itemFromList|getListLen)\\s*\\(" }, + { "tag": "uses_regex", "pattern": "\\bgetRegex\\s*\\(" }, + { "tag": "uses_datetime", "pattern": "\\b(getDateTime|getTimeStamp|stampToDatetime)\\s*\\(" }, + { "tag": 
"returns_result", "pattern": "\\baddResult\\s*\\(" }, + { "tag": "registers_endpoint", "pattern": "\\bregisterEndpoint\\s*\\(" } + ] +} +``` diff --git a/docs/LRM/avap.md b/docs/LRM/avap.md index 1805d2e..37950b5 100644 --- a/docs/LRM/avap.md +++ b/docs/LRM/avap.md @@ -1,6 +1,6 @@ ### Prefacio Arquitectónico -**AVAP es un DSL (Domain-Specific Language) Turing Completo, diseñado arquitectónicamente para la orquestación segura, concurrente y determinista de microservicios e I/O.** No es un lenguaje de propósito general; su motor híbrido y su gramática estricta están optimizados para el procesamiento rápido de transacciones HTTP, la manipulación de datos en memoria y la persistencia, minimizando los efectos secundarios no deseados. +**AVAP (Advanced Virtual API Programming) es un DSL (Domain-Specific Language) Turing Completo, diseñado arquitectónicamente para la orquestación segura, concurrente y determinista de microservicios e I/O.** No es un lenguaje de propósito general; su motor híbrido y su gramática estricta están optimizados para el procesamiento rápido de transacciones HTTP, la manipulación de datos en memoria y la persistencia, minimizando los efectos secundarios no deseados. --- @@ -388,7 +388,7 @@ AVAP provee tres comandos complementarios para cubrir todas las conversiones pos /* Expresiones regulares */ ::= "getRegex(" "," "," ")" -/* Fecha/hora actual → string */ +/* Fecha/hora actual -> string */ ::= "getDateTime(" "," "," "," ")" /* Argumentos: formato_salida, timedelta, zona_horaria, destino */ diff --git a/docs/RUNBOOK.md b/docs/RUNBOOK.md new file mode 100644 index 0000000..85a5dac --- /dev/null +++ b/docs/RUNBOOK.md @@ -0,0 +1,389 @@ +# Brunix Assistance Engine — Operations Runbook + +> **Audience:** Engineers on-call, DevOps, and anyone debugging the Brunix Engine in a live environment. +> **Last updated:** 2026-03-18 + +--- + +## Table of Contents + +1. [Health Checks](#1-health-checks) +2. [Starting the Engine](#2-starting-the-engine) +3. 
[Stopping & Restarting](#3-stopping--restarting) +4. [Tunnel Management](#4-tunnel-management) +5. [Incident Playbooks](#5-incident-playbooks) + - [Engine fails to start](#51-engine-fails-to-start) + - [Elasticsearch unreachable](#52-elasticsearch-unreachable) + - [Ollama unreachable / model not found](#53-ollama-unreachable--model-not-found) + - [AskAgent returns `[ENG] Error`](#54-askagent-returns-eng-error) + - [EvaluateRAG returns ANTHROPIC_API_KEY error](#55-evaluaterag-returns-anthropic_api_key-error) + - [Container memory / OOM](#56-container-memory--oom) + - [Session history not persisting between requests](#57-session-history-not-persisting-between-requests) +6. [Log Reference](#6-log-reference) +7. [Useful Commands](#7-useful-commands) +8. [Escalation Path](#8-escalation-path) + +--- + +## 1. Health Checks + +### Is the gRPC server up? + +```bash +grpcurl -plaintext localhost:50052 list +# Expected: brunix.AssistanceEngine +``` + +If `grpcurl` hangs or returns a connection error, the container is not running or the port is not mapped. + +### Is Elasticsearch reachable? + +```bash +curl -s http://localhost:9200/_cluster/health | python3 -m json.tool +# Expected: "status": "green" or "yellow" +``` + +### Is Ollama reachable? + +```bash +curl -s http://localhost:11434/api/tags | python3 -m json.tool +# Expected: list of available models including qwen2.5:1.5b +``` + +### Is the embedding model loaded? + +```bash +curl -s http://localhost:11434/api/tags | grep qwen3-0.6B-emb +# Expected: model entry present +``` + +### Is Langfuse reachable? + +```bash +curl -s http://45.77.119.180/api/public/health +# Expected: {"status":"ok"} +``` + +--- + +## 2. 
Starting the Engine + +### Prerequisites checklist + +- [ ] Kubeconfig present at `./kubernetes/kubeconfig.yaml` +- [ ] `.env` file populated with all required variables (see `README.md`) +- [ ] All three kubectl tunnels active (see [§4](#4-tunnel-management)) +- [ ] Docker daemon running + +### Start command + +```bash +cd Docker/ +docker-compose up -d --build +``` + +### Verify startup + +```bash +# Watch logs until you see "Brunix Engine initialized." +docker logs -f brunix-assistance-engine + +# Expected log sequence: +# [ESEARCH] Connected: 8.x.x — index: avap-docs-test +# [ENGINE] listen on 50051 (gRPC) +# Brunix Engine initialized. +# [entrypoint] Starting OpenAI Proxy (HTTP :8000)... +``` + +**Startup typically takes 20–60 seconds** depending on Ollama model loading time. + +--- + +## 3. Stopping & Restarting + +```bash +# Graceful stop +docker-compose down + +# Hard stop (if container is unresponsive) +docker stop brunix-assistance-engine +docker rm brunix-assistance-engine + +# Restart only the engine (no rebuild) +docker-compose restart brunix-engine + +# Rebuild and restart (after code changes) +docker-compose up -d --build +``` + +> ⚠️ **Restart clears all in-memory session history.** All active conversations will lose context. + +--- + +## 4. Tunnel Management + +All three tunnels must be active for the engine to function. Run each in a separate terminal or as a background process. 
+ +```bash +# Tunnel 1 — Ollama (LLM + embeddings) +kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml + +# Tunnel 2 — Elasticsearch (vector knowledge base) +kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml + +# Tunnel 3 — PostgreSQL (Langfuse observability) +kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml +``` + +### Check tunnel status + +```bash +# List active port-forwards +ps aux | grep "kubectl port-forward" + +# Alternatively +lsof -i :11434 +lsof -i :9200 +lsof -i :5432 +``` + +### Tunnel dropped? + +kubectl tunnels drop silently. Symptoms: +- Elasticsearch: `[ESEARCH] Cant Connect` in engine logs +- Ollama: requests timeout or return connection errors +- Langfuse: tracing data stops appearing in the dashboard + +**Fix:** Re-run the affected tunnel command. The engine will reconnect automatically on the next request. + +--- + +## 5. Incident Playbooks + +### 5.1 Engine fails to start + +**Symptom:** `docker-compose up` exits immediately, or container restarts in a loop. 
+ +**Diagnosis:** +```bash +docker logs brunix-assistance-engine 2>&1 | head -50 +``` + +**Common causes and fixes:** + +| Log message | Cause | Fix | +|---|---|---| +| `Cannot connect to Ollama` | Ollama tunnel not running | Start Tunnel 1 | +| `model 'qwen2.5:1.5b' not found` | Model not loaded in Ollama | See [§5.3](#53-ollama-unreachable--model-not-found) | +| `ELASTICSEARCH_URL not set` | Missing `.env` | Check `.env` file exists and is complete | +| `No module named 'brunix_pb2'` | Proto stubs not generated | Run `docker-compose up --build` | +| `Port 50051 already in use` | Another instance running | `docker stop brunix-assistance-engine && docker rm brunix-assistance-engine` | + +--- + +### 5.2 Elasticsearch unreachable + +**Symptom:** Log shows `[ESEARCH] Cant Connect`. Queries return empty context. + +**Step 1 — Verify tunnel:** +```bash +curl -s http://localhost:9200/_cluster/health +``` + +**Step 2 — Restart tunnel if down:** +```bash +kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 \ + -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml +``` + +**Step 3 — Check index exists:** +```bash +curl -s http://localhost:9200/_cat/indices?v | grep avap +``` + +If the index is missing, the knowledge base has not been ingested. Run: +```bash +cd scripts/pipelines/flows/ +python elasticsearch_ingestion.py +``` + +**Step 4 — Verify authentication:** +If your cluster uses authentication, confirm `ELASTICSEARCH_USER` + `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY` are set in `.env`. + +--- + +### 5.3 Ollama unreachable / model not found + +**Symptom:** Engine logs show connection errors to `http://host.docker.internal:11434`, or `validate_model_on_init=True` raises a model-not-found error on startup. 
+ +**Step 1 — Verify Ollama tunnel is active:** +```bash +curl -s http://localhost:11434/api/tags +``` + +**Step 2 — List available models:** +```bash +curl -s http://localhost:11434/api/tags | python3 -c " +import json, sys +data = json.load(sys.stdin) +for m in data.get('models', []): + print(m['name']) +" +``` + +**Step 3 — Pull missing models if needed:** +```bash +# On the Devaron cluster (via kubectl exec or direct access): +ollama pull qwen2.5:1.5b +ollama pull qwen3-0.6B-emb:latest +``` + +**Step 4 — Restart engine** after models are available: +```bash +docker-compose restart brunix-engine +``` + +--- + +### 5.4 AskAgent returns `[ENG] Error` + +**Symptom:** Client receives `{"text": "[ENG] Error: ...", "is_final": true}`. + +**Diagnosis:** +```bash +docker logs brunix-assistance-engine 2>&1 | grep -A 10 "Error" +``` + +| Error substring | Cause | Fix | +|---|---|---| +| `Connection refused` to `11434` | Ollama tunnel down | Restart Tunnel 1 | +| `Connection refused` to `9200` | ES tunnel down | Restart Tunnel 2 | +| `Index not found` | ES index missing | Run ingestion pipeline | +| `context length exceeded` | Query + history too long for model | Reduce session history or use a larger context model | +| `Traceback` / `KeyError` | Code bug | Check full traceback, open GitHub Issue | + +--- + +### 5.5 EvaluateRAG returns ANTHROPIC_API_KEY error + +**Symptom:** `EvalResponse.status` = `"ANTHROPIC_API_KEY no configurada en .env"`. + +**Fix:** +1. Add `ANTHROPIC_API_KEY=sk-ant-...` to your `.env` file. +2. Add `ANTHROPIC_MODEL=claude-sonnet-4-20250514` (optional, has default). +3. Restart the engine: `docker-compose restart brunix-engine`. + +--- + +### 5.6 Container memory / OOM + +**Symptom:** Container is killed by the OOM killer. `docker inspect brunix-assistance-engine` shows `OOMKilled: true`. 
+ +**Diagnosis:** +```bash +docker stats brunix-assistance-engine +``` + +**Common causes:** +- Large context window being passed to Ollama (many retrieved chunks × long document). +- Session history growing unbounded over a long-running session. + +**Mitigation:** +- Set `mem_limit` in `docker-compose.yaml`: + ```yaml + services: + brunix-engine: + mem_limit: 4g + ``` +- Restart the container to clear session store. +- Consider reducing `k=8` in `hybrid_search_native` to limit context size. + +--- + +### 5.7 Session history not persisting between requests + +**Expected behaviour:** Sending two requests with the same `session_id` should maintain context. + +**If Turn 2 does not seem to know about Turn 1:** + +1. Confirm both requests use **identical** `session_id` strings (case-sensitive, no trailing spaces). +2. Confirm the engine was **not restarted** between the two requests (restart wipes `session_store`). +3. Check logs for `[AskAgentStream] conversation: N previous messages.` — if `N=0` on Turn 2, the session was not found. +4. Confirm the stream for Turn 1 was **fully consumed** (client read all messages including `is_final=true`) — the engine only persists history after the stream ends. + +--- + +## 6. Log Reference + +| Log prefix | Module | What it means | +|---|---|---| +| `[ESEARCH] Connected` | `server.py` | Elasticsearch OK on startup | +| `[ESEARCH] Cant Connect` | `server.py` | Elasticsearch unreachable on startup | +| `[ENGINE] listen on 50051` | `server.py` | gRPC server ready | +| `[AskAgent] session=... query=...` | `server.py` | New non-streaming request | +| `[AskAgent] conversation: N messages` | `server.py` | History loaded for session | +| `[AskAgentStream] done — chunks=N` | `server.py` | Stream completed, history saved | +| `[classify] raw=... 
-> TYPE` | `graph.py` | Query classification result | +| `[reformulate] -> '...'` | `graph.py` | Reformulated query | +| `[hybrid] BM25 -> N hits` | `graph.py` | BM25 retrieval result | +| `[hybrid] kNN -> N hits` | `graph.py` | kNN retrieval result | +| `[hybrid] RRF -> N final docs` | `graph.py` | After RRF fusion | +| `[retrieve] N docs, context len=X` | `graph.py` | Context assembled | +| `[generate] X chars` | `graph.py` | Non-streaming answer generated | +| `[eval] Iniciando: N preguntas` | `evaluate.py` | Evaluation started | +| `[eval] Completado — global=X` | `evaluate.py` | Evaluation finished | + +--- + +## 7. Useful Commands + +```bash +# Real-time log streaming +docker logs -f brunix-assistance-engine + +# Filter for errors only +docker logs brunix-assistance-engine 2>&1 | grep -i error + +# Check container resource usage +docker stats brunix-assistance-engine --no-stream + +# Enter container for debugging +docker exec -it brunix-assistance-engine /bin/bash + +# Send a test query +grpcurl -plaintext \ + -d '{"query": "What is AVAP?", "session_id": "test"}' \ + localhost:50052 brunix.AssistanceEngine/AskAgent + +# Check ES index document count +curl -s "http://localhost:9200/avap-docs-test/_count" | python3 -m json.tool + +# Check ES index mapping +curl -s "http://localhost:9200/avap-docs-test/_mapping" | python3 -m json.tool + +# List active containers +docker ps --filter name=brunix + +# Check port bindings +docker port brunix-assistance-engine +``` + +--- + +## 8. Escalation Path + +| Severity | Condition | Action | +|---|---|---| +| P1 | Engine completely down, not recoverable in 15 min | Notify via Slack `#brunix-incidents` immediately. Tag CTO. | +| P2 | Degraded quality (bad answers) or evaluation score drops below 0.60 | Open GitHub Issue with full log output and evaluation report. | +| P3 | Tunnel instability, intermittent errors | Report in daily standup. Document in GitHub Issue within 24h. 
| +| P4 | Documentation gap or non-critical config issue | Open GitHub Issue with label `documentation` or `improvement`. | + +**For all P1/P2 incidents, the GitHub Issue must include:** +1. Exact command that triggered the failure +2. Full terminal output / error log +3. Status of all three kubectl tunnels at the time of failure +4. Docker container status (`docker inspect brunix-assistance-engine`) diff --git a/docs/SECURITY.md b/docs/SECURITY.md new file mode 100644 index 0000000..5e80239 --- /dev/null +++ b/docs/SECURITY.md @@ -0,0 +1,102 @@ +# Security Policy + +## Supported Versions + +| Version | Security patches | +|---|---| +| 1.5.x | ✅ Active | +| 1.4.x | ⚠️ Critical fixes only | +| < 1.4 | ❌ Not supported | + +--- + +## Reporting a Vulnerability + +**Do not open a public GitHub Issue for security vulnerabilities.** + +Report security issues directly to the CTO via the private Slack channel `#brunix-security` or by email to the address on file. Include: + +1. A clear description of the vulnerability and its potential impact. +2. Steps to reproduce (proof-of-concept if applicable). +3. Affected component(s) and version(s). +4. Suggested remediation, if known. + +You will receive an acknowledgement within **48 hours** and a resolution timeline within **7 business days** for confirmed issues. + +--- + +## Security Model + +### Transport + +The gRPC server currently runs with `add_insecure_port` — **there is no TLS in the current dev configuration.** This is intentional for the local development setup where all traffic flows through authenticated kubectl tunnels. + +**For any production or internet-exposed deployment, TLS must be enabled.** See ADR-0003 for context. + +### Authentication & Authorization + +The current version has **no authentication layer** on the gRPC API. Any client with network access to port `50052` can call any RPC method and access any session by session ID. 
+ +Acceptable risk boundaries for the current deployment: +- Port `50052` must be accessible **only** to authorized developers via firewall rules or VPN. +- Do not expose port `50052` on a public IP without an authenticating reverse proxy. + +### Secrets Management + +All secrets (API keys, database credentials) are managed exclusively via environment variables. The following rules are enforced: + +- **Never commit real secret values** to any branch, including feature branches. +- Use placeholder values (e.g., `sk-ant-...`, `pk-lf-...`) in documentation and examples. +- The `.env` file is listed in `.gitignore` and must never be committed. +- The `kubernetes/kubeconfig.yaml` file grants cluster-level access and must never be committed. +- PRs containing secrets or committed `.env` / kubeconfig files will be **immediately closed** and the committer will be required to rotate all exposed credentials before resubmission. + +**Environment variables that contain secrets:** + +| Variable | Type | +|---|---| +| `LANGFUSE_PUBLIC_KEY` | API key | +| `LANGFUSE_SECRET_KEY` | API key | +| `ANTHROPIC_API_KEY` | API key | +| `ELASTICSEARCH_PASSWORD` | Credential | +| `ELASTICSEARCH_API_KEY` | API key | +| `HF_TOKEN` | API key | + +### Container Security + +- The container runs as a **non-root user** (Python 3.11 slim base image default). +- Using `root` as the container user is explicitly prohibited (see `CONTRIBUTING.md` §3). +- The `/workspace` directory is deprecated. All application code runs from `/app`. +- The `.dockerignore` ensures that development artifacts (`.git`, `.env`, `tests/`, `docs/`) are excluded from the production image. + +### Data Privacy + +- All LLM inference (text generation and embeddings) is performed within the **private Devaron Kubernetes cluster** on Vultr infrastructure. No user query data is sent to external third-party APIs during normal operation. 
+- The exception is the `EvaluateRAG` endpoint, which sends **golden dataset questions and generated answers** to the Anthropic API (Claude) for evaluation scoring. No real user queries from production sessions are used in evaluation. +- Conversation history is stored **in-memory only** and is never persisted to disk or an external database. + +### Dependency Security + +- Dependencies are pinned via `uv.lock` and exported to `Docker/requirements.txt`. +- Dependency updates should be reviewed for security advisories before merging. +- Run `pip audit` or `safety check` against `Docker/requirements.txt` before major releases. + +```bash +pip install pip-audit +pip-audit -r Docker/requirements.txt +``` + +--- + +## Known Security Limitations + +These are acknowledged risks accepted for the current development phase. They must be addressed before any production internet-facing deployment. + +| ID | Limitation | Risk | Mitigation required | +|---|---|---|---| +| SEC-001 | No gRPC TLS | Traffic interception | Enable TLS with server certificate | +| SEC-002 | No API authentication | Unauthorized access | Add JWT / mutual TLS authentication | +| SEC-003 | Session IDs are guessable | Session hijacking | Enforce UUIDs; validate ownership | +| SEC-004 | No rate limiting | DoS / cost amplification | Add gRPC interceptor rate limiter | +| SEC-005 | In-memory session store | Data loss on restart | Acceptable for dev; requires Redis for prod | +| SEC-006 | `ELASTICSEARCH_USER/PASS` optional | Unauthenticated ES access | Make auth required in prod; fail-fast if absent | diff --git a/scripts/pipelines/ingestion/avap_chunker.py b/scripts/pipelines/ingestion/avap_chunker.py new file mode 100644 index 0000000..ef2cd85 --- /dev/null +++ b/scripts/pipelines/ingestion/avap_chunker.py @@ -0,0 +1,794 @@ +""" +chunker.py v1.0 + +Uso: + python chunker.py --lang-config avap_config.json --docs-path ./docs/samples + python chunker.py --lang-config avap_config.json --docs-path ./docs/samples 
--workers 8 + python chunker.py --lang-config avap_config.json --docs-path ./docs/samples --redis-url redis://localhost:6379 + python chunker.py --lang-config avap_config.json --docs-path ./docs/samples --no-dedup +""" + +import re +import os +import json +import hashlib +import argparse +import tempfile +import warnings as py_warnings +from pathlib import Path +from dataclasses import dataclass, asdict, field +from typing import Optional, Generator, IO +from concurrent.futures import ProcessPoolExecutor, as_completed + + +try: + import tiktoken + _ENC = tiktoken.get_encoding("cl100k_base") + def count_tokens(text: str) -> int: + return len(_ENC.encode(text)) + TOKEN_BACKEND = "tiktoken/cl100k_base" +except ImportError: + py_warnings.warn("tiktoken no instalado — usando word-count. pip install tiktoken", + stacklevel=2) + def count_tokens(text: str) -> int: # type: ignore[misc] + return len(text.split()) + TOKEN_BACKEND = "word-count (estimación)" + +try: + from datasketch import MinHash, MinHashLSH + MINHASH_AVAILABLE = True +except ImportError: + MINHASH_AVAILABLE = False + py_warnings.warn("datasketch no instalado — dedup desactivada. 
pip install datasketch", + stacklevel=2) + +try: + from tqdm import tqdm +except ImportError: + def tqdm(x, **kwargs): return x # type: ignore[misc] + + +MAX_NARRATIVE_TOKENS = 400 +OVERLAP_LINES = 3 +DEDUP_THRESHOLD = 0.85 +MINHASH_NUM_PERM = 128 +MINHASH_SHINGLE_SIZE = 3 +DEFAULT_WORKERS = max(1, (os.cpu_count() or 4) - 1) + + +@dataclass +class BlockDef: + name: str + doc_type: str + opener_re: re.Pattern + closer_re: re.Pattern + extract_signature:bool = False + signature_template:str = "" + + def extract_sig(self, clean_line): + if not self.extract_signature: + return None + m = self.opener_re.match(clean_line) + if not m: + return None + tpl = self.signature_template + for i, g in enumerate(m.groups(), start=1): + tpl = tpl.replace(f"{{group{i}}}", (g or "").strip()) + return tpl + + +@dataclass +class StatementDef: + name: str + re: re.Pattern + + +@dataclass +class SemanticTag: + tag: str + re: re.Pattern + + +class LanguageConfig: + + def __init__(self, config_path: str): + raw = json.loads(Path(config_path).read_text(encoding="utf-8")) + + self.language = raw.get("language", "unknown") + self.version = raw.get("version", "1.0") + self.extensions = set(raw.get("file_extensions", [])) + + lex = raw.get("lexer", {}) + self.string_delimiters = lex.get("string_delimiters", ['"', "'"]) + self.escape_char = lex.get("escape_char", "\\") + self.comment_line = sorted(lex.get("comment_line", ["#"]), key=len, reverse=True) + cb = lex.get("comment_block", {}) + self.comment_block_open = cb.get("open", "") + self.comment_block_close = cb.get("close", "") + self.line_oriented = lex.get("line_oriented", True) + + self.blocks: list[BlockDef] = [] + for b in raw.get("blocks", []): + self.blocks.append(BlockDef( + name = b["name"], + doc_type = b.get("doc_type", "code"), + opener_re = re.compile(b["opener_pattern"]), + closer_re = re.compile(b["closer_pattern"]), + extract_signature = b.get("extract_signature", False), + signature_template = b.get("signature_template", ""), 
+ )) + + self.statements: list[StatementDef] = [ + StatementDef(name=s["name"], re=re.compile(s["pattern"])) + for s in raw.get("statements", []) + ] + + self.semantic_tags: list[SemanticTag] = [ + SemanticTag(tag=t["tag"], re=re.compile(t["pattern"])) + for t in raw.get("semantic_tags", []) + ] + + def match_opener(self, clean_line): + for block in self.blocks: + if block.opener_re.match(clean_line): + return block + return None + + def match_closer(self, clean_line): + for block in self.blocks: + if block.closer_re.match(clean_line): + return True + return False + + def classify_statement(self, clean_line): + for stmt in self.statements: + if stmt.re.match(clean_line): + return stmt.name + return "statement" + + def enrich_metadata(self, content): + meta: dict = {} + for tag in self.semantic_tags: + if tag.re.search(content): + meta[tag.tag] = True + meta["complexity"] = sum(1 for v in meta.values() if v is True) + return meta + + +@dataclass +class Chunk: + chunk_id: str + source_file: str + doc_type: str + block_type: str + section: str + start_line: int + end_line: int + content: str + metadata: dict = field(default_factory=dict) + + def token_count(self): + return count_tokens(self.content) + + def to_dict(self): + d = asdict(self) + d["token_estimate"] = self.token_count() + return d + + +def make_chunk_id(filepath, start, end, content): + return hashlib.sha1( + f"{filepath.name}:{start}:{end}:{content[:60]}".encode() + ).hexdigest()[:16] + + +def make_chunk(filepath: Path, doc_type, block_type,section, start, end, content, cfg, extra_meta = None): + content = content.strip() + meta = cfg.enrich_metadata(content) + if extra_meta: + meta.update(extra_meta) + return Chunk( + chunk_id=make_chunk_id(filepath, start, end, content), + source_file=str(filepath), + doc_type=doc_type, block_type=block_type, + section=section, start_line=start, end_line=end, + content=content, metadata=meta, + ) + +class GenericLexer: + + def __init__(self, cfg: LanguageConfig): + 
self.cfg = cfg + self.in_block_comment = False + + def process_line(self, raw): + if self.in_block_comment: + if self.cfg.comment_block_close and \ + self.cfg.comment_block_close in raw: + self.in_block_comment = False + return False, "" + + cb_open = self.cfg.comment_block_open + cb_close = self.cfg.comment_block_close + if cb_open and cb_open in raw: + idx_open = raw.index(cb_open) + rest = raw[idx_open + len(cb_open):] + if cb_close and cb_close in rest: + idx_close = raw.index(cb_close, idx_open) + code_part = raw[:idx_open] + raw[idx_close + len(cb_close):] + return self._strip_line_comments(code_part) + else: + self.in_block_comment = True + return self._strip_line_comments(raw[:idx_open]) + + return self._strip_line_comments(raw) + + def _strip_line_comments(self, raw): + + in_str: Optional[str] = None + result = [] + i = 0 + + while i < len(raw): + ch = raw[i] + if in_str and ch == self.cfg.escape_char: + result.append(ch) + if i + 1 < len(raw): + result.append(raw[i + 1]) + i += 2 + else: + i += 1 + continue + + if in_str and ch == in_str: + in_str = None + result.append(ch); i += 1; continue + + if not in_str and ch in self.cfg.string_delimiters: + in_str = ch + result.append(ch); i += 1; continue + + if not in_str: + matched = False + for prefix in self.cfg.comment_line: + if raw[i:].startswith(prefix): + matched = True + break + if matched: + break + + result.append(ch); i += 1 + + code = "".join(result).strip() + return bool(code), code + + +class SemanticOverlapBuffer: + def __init__(self, overlap_lines = OVERLAP_LINES): + self.overlap_lines = overlap_lines + self._prev = None + self._current_fn_sig = None + self._current_fn_file = None + + def notify_function(self, sig, source_file): + self._current_fn_sig = sig + self._current_fn_file = source_file + + def notify_file_change(self, source_file): + if self._current_fn_file != source_file: + self._current_fn_sig = None + self._current_fn_file = source_file + self._prev = None + + def apply(self, 
chunk):
        """Return *chunk*, possibly prefixed with overlap context.

        Two kinds of context can be prepended:
        - the enclosing function's signature (when the chunk lives inside a
          function but is not itself the function block), or
        - the last ``overlap_lines`` lines of the previous chunk from the
          same file and same doc_type.
        The returned Chunk keeps the same chunk_id; only content and
        metadata change when overlap is applied.
        """
        # Overlap disabled: only remember the chunk for bookkeeping.
        if self.overlap_lines <= 0:
            self._prev = chunk
            return chunk

        # Crossing a file boundary invalidates any previous-chunk context.
        if self._prev and self._prev.source_file != chunk.source_file:
            self.notify_file_change(chunk.source_file)

        context_header = None

        # Function-signature context takes priority over raw line tails.
        if (self._current_fn_sig
            and self._current_fn_file == chunk.source_file
            and chunk.block_type not in ("function", "function_signature")):
            context_header = f"// contexto: {self._current_fn_sig}"
            overlap_type = "function_sig"
        elif (self._prev
              and self._prev.source_file == chunk.source_file
              and self._prev.doc_type == chunk.doc_type):
            context_header = "\n".join(
                self._prev.content.splitlines()[-self.overlap_lines:])
            overlap_type = "line_tail"
        else:
            overlap_type = "none"

        self._prev = chunk

        if context_header:
            # Rebuild the chunk rather than mutate it in place. NOTE(review):
            # chunk_id is NOT recomputed for the enriched content — dedup and
            # ES indexing therefore key on the pre-overlap identity.
            new_content = (context_header + "\n" + chunk.content).strip()
            return Chunk(
                chunk_id=chunk.chunk_id, source_file=chunk.source_file,
                doc_type=chunk.doc_type, block_type=chunk.block_type,
                section=chunk.section, start_line=chunk.start_line,
                end_line=chunk.end_line, content=new_content,
                metadata={**chunk.metadata,
                          "has_overlap": True,
                          "overlap_type": overlap_type},
            )
        return chunk


def _shingles(text, k = MINHASH_SHINGLE_SIZE):
    """Split *text* into lowercased word k-shingles, encoded as bytes.

    Texts shorter than k words collapse into a single shingle so they still
    produce a usable MinHash signature.
    """
    words = text.lower().split()
    if len(words) < k:
        return [" ".join(words).encode()]
    return [" ".join(words[i:i+k]).encode() for i in range(len(words) - k + 1)]


def _build_minhash(text):
    """Build a MinHash signature over the word shingles of *text*."""
    m = MinHash(num_perm=MINHASH_NUM_PERM)
    for s in _shingles(text):
        m.update(s)
    return m


class StreamingDeduplicator:
    """Near-duplicate filter backed by one MinHash LSH index per doc_type."""

    def __init__(self, threshold: float = DEDUP_THRESHOLD ):
        self.threshold = threshold
        # One lazily-created LSH index per doc_type so e.g. code chunks are
        # never deduplicated against spec/narrative chunks.
        self._lsh: dict[str, "MinHashLSH"] = {}
        # Count of chunks rejected as near-duplicates.
        self.removed = 0


    def _get_lsh(self, doc_type):
        # Create the per-doc_type index on first use.
        if doc_type not in self._lsh:
            self._lsh[doc_type] = MinHashLSH(
                threshold=self.threshold, num_perm=MINHASH_NUM_PERM)
        return self._lsh[doc_type]

    def is_duplicate(self, chunk):
        # Without datasketch installed, dedup is a no-op (see import guard).
        if not MINHASH_AVAILABLE:
            return False
        lsh = self._get_lsh(chunk.doc_type)
        m = 
_build_minhash(chunk.content) + try: + if lsh.query(m): + self.removed += 1 + return True + except Exception: + pass + try: + lsh.insert(chunk.chunk_id, m) + except Exception as e: + print(e) + pass + return False + +class JsonlWriter: + def __init__(self, path): + out = Path(path) + if out.suffix.lower() == ".json": + out = out.with_suffix(".jsonl") + out.parent.mkdir(parents=True, exist_ok=True) + self.path = out + self._handle: IO = open(out, "w", encoding="utf-8") + self.written = 0 + + def write(self, chunk): + self._handle.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") + self.written += 1 + + def close(self): + if self._handle: + self._handle.close() + +def validate_syntax(lines, filepath, cfg ): + warnings_out = [] + stack = [] + lexer = GenericLexer(cfg) + for i, raw in enumerate(lines): + line_no = i + 1 + is_code, clean = lexer.process_line(raw) + if not is_code or not clean: + continue + block = cfg.match_opener(clean) + if block: + stack.append((block.name, line_no)) + continue + if cfg.match_closer(clean): + if stack: + stack.pop() + else: + warnings_out.append( + f"{filepath.name}:{line_no} — close without open") + for bt, ln in stack: + warnings_out.append( + f"{filepath.name}:{ln} — not closed block '{bt}'") + return warnings_out + +def iter_code_chunks(filepath, cfg, overlap_buf): + lines = filepath.read_text(encoding="utf-8").splitlines() + warnings = validate_syntax(lines, filepath, cfg) + overlap_buf.notify_file_change(str(filepath)) + + lexer = GenericLexer(cfg) + i = 0 + pending_raw = [] + loose_buffer = [] + loose_type = None + + def flush_loose(): + nonlocal loose_buffer, loose_type + if not loose_buffer: + return + start = loose_buffer[0][0] + end = loose_buffer[-1][0] + content = "\n".join(t for _, t in loose_buffer) + chunk = make_chunk(filepath, "code", loose_type or "statement", + "", start, end, content, cfg) + chunk = overlap_buf.apply(chunk) + loose_buffer.clear(); loose_type = None + yield chunk + + while i < 
len(lines): + raw = lines[i] + line_no = i + 1 + is_code, clean = lexer.process_line(raw) + + if not is_code or not clean: + pending_raw.append(raw); i += 1; continue + + block_def = cfg.match_opener(clean) + + if block_def: + yield from flush_loose() + block_start = line_no + block_lines = list(pending_raw) + [raw] + pending_raw.clear() + + sig = block_def.extract_sig(clean) + if sig: + overlap_buf.notify_function(sig, str(filepath)) + + depth = 1; i += 1 + while i < len(lines) and depth > 0: + inner_raw = lines[i] + _, inner_clean = lexer.process_line(inner_raw) + block_lines.append(inner_raw) + if inner_clean: + if cfg.match_opener(inner_clean): + depth += 1 + elif cfg.match_closer(inner_clean): + depth -= 1 + i += 1 + + chunk = make_chunk(filepath, block_def.doc_type, block_def.name, "", block_start, i, "\n".join(block_lines), cfg) + chunk = overlap_buf.apply(chunk) + yield chunk + + if sig: + yield make_chunk( + filepath, "function_signature", "function_signature", "", block_start, block_start, sig, cfg, + extra_meta={"full_block_start": block_start, + "full_block_end": i} + ) + continue + + stmt_type = cfg.classify_statement(clean) + if loose_type and stmt_type != loose_type: + yield from flush_loose() + if pending_raw and not loose_buffer: + for pc in pending_raw: + loose_buffer.append((line_no, pc)) + pending_raw.clear() + loose_type = stmt_type + loose_buffer.append((line_no, raw)) + i += 1 + + yield from flush_loose() + + if warnings: + yield (None, warnings) + + +RE_MD_H1 = re.compile(r"^# (.+)") +RE_MD_H2 = re.compile(r"^## (.+)") +RE_MD_H3 = re.compile(r"^### (.+)") +RE_FENCE_OPEN = re.compile(r"^```(\w*)") +RE_FENCE_CLOSE = re.compile(r"^```\s*$") +RE_TABLE_ROW = re.compile(r"^\|") + + +def split_narrative_by_tokens(text, max_tokens): + paragraphs = re.split(r"\n\s*\n", text) + result = []; current = []; current_tokens = 0 + for para in paragraphs: + pt = count_tokens(para) + if current_tokens + pt > max_tokens and current: + 
result.append("\n\n".join(current)) + current = [para]; current_tokens = pt + else: + current.append(para); current_tokens += pt + if current: + result.append("\n\n".join(current)) + return [t for t in result if t.strip()] + + +def iter_markdown_chunks(filepath, cfg, max_tokens = MAX_NARRATIVE_TOKENS): + + lines = filepath.read_text(encoding="utf-8").splitlines() + current_h1 = current_h2 = current_h3 = "" + + def section_label() -> str: + return " > ".join(p for p in [current_h1, current_h2, current_h3] if p) + + def make_md_chunk(doc_type, block_type, start, end, content) -> Chunk: + return make_chunk(filepath, doc_type, block_type, + section_label(), start, end, content, cfg) + + i = 0 + narrative_start = 1; narrative_lines: list[str] = [] + + def flush_narrative() -> Generator: + nonlocal narrative_lines, narrative_start + text = "\n".join(narrative_lines).strip() + if not text: + narrative_lines.clear(); return + for sub in split_narrative_by_tokens(text, max_tokens): + sl = sub.count("\n") + 1 + yield make_md_chunk("spec", "narrative", + narrative_start, narrative_start + sl - 1, sub) + narrative_lines.clear() + + while i < len(lines): + raw = lines[i]; line_no = i + 1 + m1 = RE_MD_H1.match(raw); m2 = RE_MD_H2.match(raw); m3 = RE_MD_H3.match(raw) + + if m1: + yield from flush_narrative() + current_h1 = m1.group(1).strip(); current_h2 = current_h3 = "" + narrative_start = line_no + 1; i += 1; continue + if m2: + yield from flush_narrative() + current_h2 = m2.group(1).strip(); current_h3 = "" + narrative_start = line_no + 1; i += 1; continue + if m3: + yield from flush_narrative() + current_h3 = m3.group(1).strip() + narrative_start = line_no + 1; i += 1; continue + + fm = RE_FENCE_OPEN.match(raw) + if fm and not RE_FENCE_CLOSE.match(raw): + yield from flush_narrative() + lang = fm.group(1).lower() or "code" + doc_type = "bnf" if lang == "bnf" else "code_example" + fence_start = line_no + fence_lines = [raw]; i += 1 + while i < len(lines): + 
fence_lines.append(lines[i]) + if RE_FENCE_CLOSE.match(lines[i]) and len(fence_lines) > 1: + i += 1; break + i += 1 + yield make_md_chunk(doc_type, lang, + fence_start, fence_start + len(fence_lines) - 1, + "\n".join(fence_lines)) + narrative_start = i + 1 + continue + + if RE_TABLE_ROW.match(raw): + yield from flush_narrative() + ts = line_no; tl = [] + while i < len(lines) and RE_TABLE_ROW.match(lines[i]): + tl.append(lines[i]); i += 1 + yield make_md_chunk("spec", "table", ts, ts + len(tl) - 1, "\n".join(tl)) + narrative_start = i + 1 + continue + + if not narrative_lines: + narrative_start = line_no + narrative_lines.append(raw) + i += 1 + + yield from flush_narrative() + + +def _worker(args): + + paths, config_path, overlap_lines, max_tokens = args + + cfg = LanguageConfig(config_path) + overlap_buf = SemanticOverlapBuffer(overlap_lines) + stats = {t: 0 for t in ["code", "function_signature", "spec", "bnf", "code_example", "unknown", "total"]} + all_warnings = [] + + fd, tmp_path = tempfile.mkstemp(suffix=".jsonl", prefix="worker_") + os.close(fd) + + with open(tmp_path, "w", encoding="utf-8") as f: + for path in paths: + ext = path.suffix.lower() + if ext in cfg.extensions: + for item in iter_code_chunks(path, cfg, overlap_buf): + if isinstance(item, tuple) and item[0] is None: + all_warnings.extend(item[1]) + continue + chunk = item + f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") + stats[chunk.doc_type] = stats.get(chunk.doc_type, 0) + 1 + stats["total"] += 1 + elif ext == ".md": + for chunk in iter_markdown_chunks(path, cfg, max_tokens): + f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") + stats[chunk.doc_type] = stats.get(chunk.doc_type, 0) + 1 + stats["total"] += 1 + else: + content = path.read_text(encoding="utf-8") + chunk = make_chunk(path, "unknown", "raw", "", 1, + content.count("\n") + 1, content, cfg) + f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") + stats["unknown"] += 1; stats["total"] += 1 + + 
return tmp_path, stats, all_warnings

def fetch_documents(docs_path, cfg, extra_extensions):
    """Recursively collect files under *docs_path* whose suffix is in the
    language config's extensions or in *extra_extensions* (e.g. [".md"]),
    sorted for deterministic partitioning.

    Raises:
        FileNotFoundError: if *docs_path* does not exist.
    """
    root = Path(docs_path)
    if not root.exists():
        raise FileNotFoundError(f"PATH not found: {root}")
    all_exts = cfg.extensions | set(extra_extensions)
    return sorted(p for p in root.rglob("*")
                  if p.is_file() and p.suffix.lower() in all_exts)


def _partition(paths, n):
    """Split *paths* into roughly n equal slices.

    NOTE(review): when len(paths) is not divisible by n this can produce
    n+1 slices; ProcessPoolExecutor still caps real parallelism at
    max_workers, so the extra slice simply queues.
    """
    k = max(1, len(paths) // n)
    return [paths[i:i+k] for i in range(0, len(paths), k)]


def run_pipeline(paths,
                 config_path,
                 writer,
                 deduplicator,
                 overlap_lines,
                 max_tokens,
                 workers):
    """Fan files out to worker processes, then merge and deduplicate.

    Each worker writes its chunks to a private temp JSONL file; this
    function merges them through *writer*, filtering near-duplicates via
    *deduplicator* (may be None). Returns (total_stats, all_warnings).
    """
    total_stats = {t: 0 for t in ["code", "function_signature", "spec", "bnf", "code_example", "unknown", "total", "dedup_removed"]}
    all_warnings = []
    tmp_files = []

    partitions = _partition(paths, workers)
    worker_args = [(part, config_path, overlap_lines, max_tokens) for part in partitions]

    print(f"{len(paths)} Files in {len(partitions)} workers...\n")

    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(_worker, arg): i
                   for i, arg in enumerate(worker_args)}
        for future in tqdm(as_completed(futures), total=len(futures),
                           desc=" Workers", unit="worker"):
            # future.result() re-raises any worker exception here, aborting
            # the pipeline — intentional fail-fast.
            tmp_path, stats, warns = future.result()
            tmp_files.append(tmp_path)
            all_warnings.extend(warns)
            for k, v in stats.items():
                total_stats[k] = total_stats.get(k, 0) + v

    print(f"\n Mergin {len(tmp_files)} partial files...")
    # Sequential merge pass: dedup must see chunks in one process, and it
    # needs a Chunk object because MinHash shingles come from chunk.content.
    for tmp_path in tqdm(tmp_files, desc=" Merge + dedup", unit="file"):
        with open(tmp_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    cd = json.loads(line)
                    if deduplicator:
                        c = Chunk(
                            chunk_id=cd["chunk_id"], source_file=cd["source_file"],
                            doc_type=cd["doc_type"], block_type=cd["block_type"],
                            section=cd["section"], start_line=cd["start_line"],
                            end_line=cd["end_line"], content=cd["content"],
                            metadata=cd.get("metadata", {}),
                        )
                        if deduplicator.is_duplicate(c):
                            total_stats["dedup_removed"] = \
total_stats.get("dedup_removed", 0) + 1 + continue + writer._handle.write(line + "\n") + writer.written += 1 + except json.JSONDecodeError as e: + print(e) + pass + Path(tmp_path).unlink(missing_ok=True) + + return total_stats, all_warnings + + +def print_report(stats, warnings, output_path, token_backend, workers, language): + + print(f" RESULT — [{language}]") + + print(f" Tokenizer : {token_backend}") + dedup_be = "MinHash LSH (RAM)" if MINHASH_AVAILABLE else "desactivada" + print(f" Dedup backend : {dedup_be}") + print(f" Workers : {workers}") + print() + for t in ["code", "function_signature", "spec", "bnf", "code_example", "unknown"]: + n = stats.get(t, 0) + if n: + print(f" {t:<25}: {n:>6} chunks") + print(f"\n Total written : {stats.get('total', 0)}") + print(f" Erased (dedup) : {stats.get('dedup_removed', 0)}") + if warnings: + print(f"\n Warnings ({len(warnings)}):") + for w in warnings[:20]: + print(w) + if len(warnings) > 20: + print(f" ... and {len(warnings) - 20} more") + else: + print("\n Ok") + print(f"\n OUTPUT File {output_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="GEneric chunker" + ) + parser.add_argument("--lang-config", required=True, + help="(ej: avap_config.json)") + parser.add_argument("--docs-path", default="docs/samples") + parser.add_argument("--output", default="ingestion/chunks.jsonl") + parser.add_argument("--overlap", type=int, default=OVERLAP_LINES) + parser.add_argument("--max-tokens", type=int, default=MAX_NARRATIVE_TOKENS) + parser.add_argument("--dedup-threshold", type=float, default=DEDUP_THRESHOLD) + parser.add_argument("--no-dedup", action="store_true") + parser.add_argument("--no-overlap", action="store_true") + parser.add_argument("--workers", type=int, default=DEFAULT_WORKERS) + + args = parser.parse_args() + + cfg = LanguageConfig(args.lang_config) + overlap = 0 if args.no_overlap else args.overlap + + print(f" Lenguaje : {cfg.language} v{cfg.version}") + print(f" Config : 
{args.lang_config}") + print(f" Extensiones : {cfg.extensions | {'.md'}}") + print(f" Docs path : {args.docs_path}") + print(f" Output : {args.output}") + print(f" Workers : {args.workers}") + print(f" Tokenizador : {TOKEN_BACKEND}") + print(f" Overlap : {overlap} líneas (semántico)") + print(f" Max tokens : {args.max_tokens}") + dedup_info = "deactive" if args.no_dedup else \ + f"MinHash LSH threshold={args.dedup_threshold}" + \ + (f" RAM") + print(f" Dedup : {dedup_info}") + print() + + paths = fetch_documents(args.docs_path, cfg, [".md"]) + if not paths: + print("No files found.") + return + print(f"{len(paths)} files found\n") + + deduplicator = None + if not args.no_dedup and MINHASH_AVAILABLE: + deduplicator = StreamingDeduplicator( + threshold=args.dedup_threshold, + ) + + writer = JsonlWriter(args.output) + try: + stats, warnings = run_pipeline( + paths, args.lang_config, writer, deduplicator, + overlap, args.max_tokens, args.workers + ) + finally: + writer.close() + + print_report(stats, warnings, str(writer.path), + TOKEN_BACKEND, args.workers, cfg.language) + + +if __name__ == "__main__": + main() diff --git a/scripts/pipelines/ingestion/avap_config.json b/scripts/pipelines/ingestion/avap_config.json new file mode 100644 index 0000000..be39077 --- /dev/null +++ b/scripts/pipelines/ingestion/avap_config.json @@ -0,0 +1,74 @@ +{ + "_comment": "Configuración del lenguaje AVAP para el chunker genérico. 
Basada en el LRM (Language Reference Manual) de AVAP.", + + "language": "avap", + "version": "1.0", + "file_extensions": [".avap"], + + "lexer": { + "string_delimiters": ["\"", "'"], + "escape_char": "\\", + "comment_line": ["///", "//"], + "comment_block": { "open": "/*", "close": "*/" }, + "line_oriented": true + }, + + "blocks": [ + { + "name": "function", + "doc_type": "code", + "opener_pattern": "^\\s*function\\s+(\\w+)\\s*\\(([^)]*)", + "closer_pattern": "^\\s*\\}\\s*$", + "extract_signature": true, + "signature_template": "function {group1}({group2})" + }, + { + "name": "if", + "doc_type": "code", + "opener_pattern": "^\\s*if\\s*\\(", + "closer_pattern": "^\\s*end\\s*\\(\\s*\\)" + }, + { + "name": "startLoop", + "doc_type": "code", + "opener_pattern": "^\\s*startLoop\\s*\\(", + "closer_pattern": "^\\s*endLoop\\s*\\(\\s*\\)" + }, + { + "name": "try", + "doc_type": "code", + "opener_pattern": "^\\s*try\\s*\\(\\s*\\)", + "closer_pattern": "^\\s*end\\s*\\(\\s*\\)" + } + ], + + "statements": [ + { "name": "registerEndpoint", "pattern": "^\\s*registerEndpoint\\s*\\(" }, + { "name": "addVar", "pattern": "^\\s*addVar\\s*\\(" }, + { "name": "io_command", "pattern": "^\\s*(addParam|getListLen|addResult|getQueryParamList)\\s*\\(" }, + { "name": "http_command", "pattern": "^\\s*(RequestPost|RequestGet)\\s*\\(" }, + { "name": "orm_command", "pattern": "^\\s*(ormDirect|ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" }, + { "name": "util_command", "pattern": "^\\s*(variableToList|itemFromList|variableFromJSON|AddVariableToJSON|encodeSHA256|encodeMD5|getRegex|getDateTime|stampToDatetime|getTimeStamp|randomString|replace)\\s*\\(" }, + { "name": "async_command", "pattern": "^\\s*(\\w+\\s*=\\s*go\\s+|gather\\s*\\()" }, + { "name": "connector", "pattern": "^\\s*\\w+\\s*=\\s*avapConnector\\s*\\(" }, + { "name": "modularity", "pattern": "^\\s*(import|include)\\s+" }, + { "name": "assignment", "pattern": "^\\s*\\w+\\s*=\\s*" } + ], + + 
"semantic_tags": [ + { "tag": "uses_orm", "pattern": "\\b(ormDirect|ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" }, + { "tag": "uses_http", "pattern": "\\b(RequestPost|RequestGet)\\s*\\(" }, + { "tag": "uses_connector", "pattern": "\\bavapConnector\\s*\\(" }, + { "tag": "uses_async", "pattern": "\\bgo\\s+\\w+\\s*\\(|\\bgather\\s*\\(" }, + { "tag": "uses_crypto", "pattern": "\\b(encodeSHA256|encodeMD5)\\s*\\(" }, + { "tag": "uses_auth", "pattern": "\\b(addParam|_status)\\b" }, + { "tag": "uses_error_handling", "pattern": "\\btry\\s*\\(\\s*\\)" }, + { "tag": "uses_loop", "pattern": "\\bstartLoop\\s*\\(" }, + { "tag": "uses_json", "pattern": "\\b(variableFromJSON|AddVariableToJSON)\\s*\\(" }, + { "tag": "uses_list", "pattern": "\\b(variableToList|itemFromList|getListLen)\\s*\\(" }, + { "tag": "uses_regex", "pattern": "\\bgetRegex\\s*\\(" }, + { "tag": "uses_datetime", "pattern": "\\b(getDateTime|getTimeStamp|stampToDatetime)\\s*\\(" }, + { "tag": "returns_result", "pattern": "\\baddResult\\s*\\(" }, + { "tag": "registers_endpoint", "pattern": "\\bregisterEndpoint\\s*\\(" } + ] +} diff --git a/scripts/pipelines/ingestion/avap_ingestor.py b/scripts/pipelines/ingestion/avap_ingestor.py new file mode 100644 index 0000000..db9e983 --- /dev/null +++ b/scripts/pipelines/ingestion/avap_ingestor.py @@ -0,0 +1,452 @@ +""" +avap_ingest.py v2.0 + +Uso: + + # Ingestar + python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1 + + # Borrar indice y re-ingestar desde cero + python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1 --delete + + # Reprocesar solo los fallidos (DLQ) + python avap_ingest.py --chunks ingestion/failed_chunks.jsonl --index avap-knowledge-v1 + +""" + +import os +import json +import time +import asyncio +import argparse +import traceback +from pathlib import Path +from datetime import datetime +from typing import AsyncGenerator +from elasticsearch import AsyncElasticsearch 
+import httpx +from tqdm import tqdm +from elasticsearch import helpers as es_helpers + + +DEFAULT_CHUNKS_PATH = "ingestion/chunks.jsonl" +DEFAULT_INDEX = "avap-knowledge-v1" +DEFAULT_OLLAMA_URL= "http://localhost:11434" +DEFAULT_OLLAMA_MODEL= "qwen3-0.6B-emb:latest" +DEFAULT_EMBEDDING_DIM= 1024 +BATCH_SIZE_EMBED= 8 +BATCH_SIZE_ES= 50 +QUEUE_MAXSIZE= 5 +MAX_RETRIES= 3 +RETRY_DELAY= 2.0 +OLLAMA_TIMEOUT= 120 + + +def iter_chunks_jsonl(path, batch_size): + + batch = [] + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + chunk = json.loads(line) + batch.append(chunk) + if len(batch) >= batch_size: + yield batch + batch = [] + except json.JSONDecodeError as e: + print(e) + if batch: + yield batch + + +def count_lines(path): + n = 0 + with open(path, encoding="utf-8") as f: + for line in f: + if line.strip(): + n += 1 + return n + + +def build_index_mapping(embedding_dim): + + return { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": { + "analyzer": { + "avap_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "stop"] + } + } + } + }, + "mappings": { + "properties": { + "chunk_id": {"type": "keyword"}, + "content": { + "type": "text", + "analyzer": "avap_analyzer" + }, + + "embedding": { + "type": "dense_vector", + "dims": embedding_dim, + "index": True, + "similarity": "cosine", + "index_options": { + "type": "int8_hnsw", + "m": 16, + "ef_construction": 100 + } + }, + "doc_type": {"type": "keyword"}, + "block_type": {"type": "keyword"}, + "section": { + "type": "text", + "fields": {"keyword": {"type": "keyword"}} + }, + "source_file": {"type": "keyword"}, + "start_line": {"type": "integer"}, + "end_line": {"type": "integer"}, + "token_estimate": {"type": "integer"}, + "metadata": { + "properties": { + "uses_orm": {"type": "boolean"}, + "uses_http": {"type": "boolean"}, + "uses_connector": {"type": "boolean"}, + "uses_async": {"type": 
"boolean"}, + "uses_crypto": {"type": "boolean"}, + "uses_auth": {"type": "boolean"}, + "uses_error_handling": {"type": "boolean"}, + "uses_loop": {"type": "boolean"}, + "uses_json": {"type": "boolean"}, + "uses_list": {"type": "boolean"}, + "uses_regex": {"type": "boolean"}, + "uses_datetime": {"type": "boolean"}, + "returns_result": {"type": "boolean"}, + "registers_endpoint": {"type": "boolean"}, + "has_overlap": {"type": "boolean"}, + "complexity": {"type": "integer"}, + "full_block_start": {"type": "integer"}, + "full_block_end": {"type": "integer"}, + } + } + } + } + } + + +class DeadLetterQueue: + + def __init__(self, base_path = "ingestion"): + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + self.path = Path(base_path) / f"failed_chunks_{ts}.jsonl" + self._handle = None + self.count = 0 + + def _open(self): + if self._handle is None: + self.path.parent.mkdir(parents=True, exist_ok=True) + self._handle = open(self.path, "w", encoding="utf-8") + + def write(self, chunk, reason) -> None: + self._open() + record = {"reason": reason, "chunk": chunk} + self._handle.write(json.dumps(record, ensure_ascii=False) + "\n") + self._handle.flush() + self.count += 1 + + def close(self): + if self._handle: + self._handle.close() + self._handle = None + + def report(self): + if self.count: + print(f"{self.count} Failed: {self.path}") + else: + print(" No failed chunks") + +class OllamaAsyncEmbedder: + + def __init__(self, base_url, model, timeout = OLLAMA_TIMEOUT): + self.base_url = base_url.rstrip("/") + self.model = model + self._client = httpx.AsyncClient(timeout=timeout) + + async def probe_dimension(self): + vecs = await self._embed(["dimension probe"]) + return len(vecs[0]) + + async def _embed(self, texts): + payload = {"model": self.model, "input": texts} + for attempt in range(1, MAX_RETRIES + 1): + try: + resp = await self._client.post( + f"{self.base_url}/api/embed", + json=payload + ) + resp.raise_for_status() + return resp.json()["embeddings"] + except 
Exception as exc: + if attempt >= MAX_RETRIES: + raise RuntimeError(f"Embeddings fail {MAX_RETRIES}: {exc}") from exc + await asyncio.sleep(RETRY_DELAY * attempt) + return [] + + async def embed_batch(self, chunks, dlq): + texts = [c["content"] for c in chunks] + try: + vectors = await self._embed(texts) + return list(zip(chunks, vectors)) + except Exception as exc: + print(exc) + results = [] + for chunk in chunks: + try: + vecs = await self._embed([chunk["content"]]) + results.append((chunk, vecs[0])) + except Exception as single_exc: + dlq.write(chunk, f"Ollama embed failed: {single_exc}") + return results + + async def close(self): + await self._client.aclose() + + +async def producer(chunks_path, embedder, queue, dlq, batch_size, pbar): + + for batch in iter_chunks_jsonl(chunks_path, batch_size): + embedded = await embedder.embed_batch(batch, dlq) + if embedded: + await queue.put(embedded) + pbar.update(len(batch)) + + await queue.put(None) + + +async def consumer( queue, es_client, index, dlq, batch_size_es, stats): + + buffer: list[tuple[dict, list[float]]] = [] + + async def flush_buffer(): + if not buffer: + return + actions = [ + { + "_index": index, + "_id": chunk["chunk_id"], + "_source": { + "chunk_id": chunk["chunk_id"], + "content": chunk["content"], + "embedding": vector, + "doc_type": chunk.get("doc_type", "unknown"), + "block_type": chunk.get("block_type", ""), + "section": chunk.get("section", ""), + "source_file": chunk.get("source_file", ""), + "start_line": chunk.get("start_line", 0), + "end_line": chunk.get("end_line", 0), + "token_estimate": chunk.get("token_estimate", 0), + "metadata": chunk.get("metadata", {}), + } + } + for chunk, vector in buffer + ] + try: + ok, errors = await es_helpers.async_bulk( + es_client, actions, + raise_on_error=False, + stats_only=False + ) + stats["ok"] += ok + stats["errors"] += len(errors) + for err in errors: + failed_id = err.get("index", {}).get("_id", "unknown") + reason = str(err.get("index", 
{}).get("error", "unknown ES error")) + for chunk, _ in buffer: + if chunk["chunk_id"] == failed_id: + dlq.write(chunk, f"ES bulk error: {reason}") + break + except Exception as exc: + for chunk, _ in buffer: + dlq.write(chunk, f"ES bulk exception: {exc}") + stats["errors"] += len(buffer) + + buffer.clear() + + while True: + item = await queue.get() + if item is None: + await flush_buffer() + break + buffer.extend(item) + if len(buffer) >= batch_size_es: + await flush_buffer() + + +async def build_es_client(): + url = "http://127.0.0.1:9200" + + client = AsyncElasticsearch( + url, + verify_certs=False, + request_timeout=60 + ) + + try: + info = await client.info() + print(f" Elasticsearch {info['version']['number']} en {url}") + except Exception as e: + raise ConnectionError(f"Cant connet {url}. Error: {e}") + + return client + + +async def create_index(client: AsyncElasticsearch, index: str, + embedding_dim: int, + delete_if_exists: bool = False) -> None: + exists = await client.indices.exists(index=index) + if exists and delete_if_exists: + await client.indices.delete(index=index) + exists = False + if not exists: + await client.indices.create(index=index, body=build_index_mapping(embedding_dim)) + print(f" · Index '{index}' created (dim={embedding_dim}, int8_hnsw, cosine).") + else: + print(f" · Inex '{index}' reused.") + + +""" +async def build_es_client(): + url = "http://127.0.0.1:9200" + + client = AsyncElasticsearch( + url, + verify_certs=False, + request_timeout=60, + headers={ + "Accept": "application/vnd.elasticsearch+json; compatible-with=8", + "Content-Type": "application/json" + } + ) + client.options(headers={"Accept": "application/vnd.elasticsearch+json; compatible-with=8"}) + + try: + await client.info() + except Exception as e: + raise ConnectionError(f"Error de versión/compatibilidad: {e}") + return client +""" + +async def run(args): + + ollama_url = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL) + ollama_model = os.environ.get("OLLAMA_MODEL", 
DEFAULT_OLLAMA_MODEL) + embed_dim = int(os.environ.get("OLLAMA_EMBEDDING_DIM", DEFAULT_EMBEDDING_DIM)) + + embedder = OllamaAsyncEmbedder(ollama_url, ollama_model) + + if args.probe_dim: + dim = await embedder.probe_dimension() + print(f" Model dimensions: {dim}") + await embedder.close() + return + + if not Path(args.chunks).exists(): + print(f"File Not Found: {args.chunks}") + await embedder.close() + return + + total = count_lines(args.chunks) + print(f" Total Chunks: {total}") + + print("\nConnecting to VectorDB...") + es_client = await build_es_client() + + print(f"\nGenerating index '{args.index}'...") + await create_index(es_client, args.index, embed_dim, + delete_if_exists=args.delete) + + print("\n Checking Model dimensions...") + actual_dim = await embedder.probe_dimension() + if actual_dim != embed_dim: + print(f" Real dimension ({actual_dim}) != OLLAMA_EMBEDDING_DIM ({embed_dim})") + await embedder.close() + await es_client.close() + return + print(f" Dimension: {actual_dim}") + + dlq = DeadLetterQueue(base_path=str(Path(args.chunks).parent)) + stats = {"ok": 0, "errors": 0} + queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE) + + print(f"\nAsync pipeline (Ollama <-> Elasticsearch)...\n") + t0 = time.time() + pbar = tqdm(total=total, desc=" Processing", unit="chunks") + + await asyncio.gather( + producer(args.chunks, embedder, queue, dlq, + args.batch_embed, pbar), + consumer(queue, es_client, args.index, dlq, + args.batch_es, stats), + ) + + pbar.close() + elapsed = time.time() - t0 + + await embedder.close() + await es_client.close() + dlq.close() + + print("RESULT") + print("----------------") + print(f"Chunks : {total}") + print(f" -OK : {stats['ok']}") + print(f" -Errors : {stats['errors']}") + print(f" -Index Name: {args.index}") + print() + dlq.report() + print("----------------") + +def main(): + + parser = argparse.ArgumentParser( + description="AVAP Ingestor" + ) + parser.add_argument("--chunks", default=DEFAULT_CHUNKS_PATH, + help=f"JSONL Chunk 
File (default: {DEFAULT_CHUNKS_PATH})") + parser.add_argument("--index", default=DEFAULT_INDEX, + help=f"Index Name (default: {DEFAULT_INDEX})") + parser.add_argument("--delete", action="store_true", + help="Delete index before send") + parser.add_argument("--probe-dim", action="store_true", + help="Check Model dimmension") + parser.add_argument("--batch-embed", type=int, default=BATCH_SIZE_EMBED, + help=f"Chunks by Ollama call(default: {BATCH_SIZE_EMBED})") + parser.add_argument("--batch-es", type=int, default=BATCH_SIZE_ES, + help=f"Docs by bulk ES (default: {BATCH_SIZE_ES})") + args = parser.parse_args() + + print("----------------") + print("AVAP INGESTOR") + print("----------------") + if not args.probe_dim: + print(f" Chunks : {args.chunks}") + print(f" INDEX ES : {args.index}") + print(f" Ollama URL : {os.environ.get('OLLAMA_URL', DEFAULT_OLLAMA_URL)}") + print(f" MODEL : {os.environ.get('OLLAMA_MODEL', DEFAULT_OLLAMA_MODEL)}") + print(f" MODEL DIM : {os.environ.get('OLLAMA_EMBEDDING_DIM', DEFAULT_EMBEDDING_DIM)}") + print() + + asyncio.run(run(args)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/pipelines/ingestion/ingestion/chunks.jsonl b/scripts/pipelines/ingestion/ingestion/chunks.jsonl new file mode 100644 index 0000000..3c637d2 --- /dev/null +++ b/scripts/pipelines/ingestion/ingestion/chunks.jsonl @@ -0,0 +1,105 @@ +{"chunk_id": "f2b9f3531de0a901", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Prefacio Arquitectónico", "start_line": 2, "end_line": 4, "content": "**AVAP (Advanced Virtual API Programming) es un DSL (Domain-Specific Language) Turing Completo, diseñado arquitectónicamente para la orquestación segura, concurrente y determinista de microservicios e I/O.** No es un lenguaje de propósito general; su motor híbrido y su gramática estricta están optimizados para el procesamiento rápido de transacciones HTTP, la manipulación de datos en memoria y 
la persistencia, minimizando los efectos secundarios no deseados.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 115} +{"chunk_id": "5fd5f1e92023b13d", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM)", "start_line": 8, "end_line": 10, "content": "Este documento unifica la arquitectura de memoria, estructuras de control, modularidad, concurrencia asíncrona y la gramática formal (BNF) del lenguaje AVAP. Actúa como la única fuente de verdad (Single Source of Truth) para la implementación del parser, el motor de ejecución y la indexación del sistema RAG.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 80} +{"chunk_id": "77a75c28d0b778bc", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales", "start_line": 14, "end_line": 14, "content": "Esta sección sienta las bases de cómo AVAP gestiona la lógica de los servicios y la manipulación de datos en memoria. A diferencia de los lenguajes interpretados convencionales, AVAP utiliza un motor de evaluación híbrida que permite combinar comandos declarativos con expresiones dinámicas.", "metadata": {"complexity": 0}, "token_estimate": 72} +{"chunk_id": "5fd81b806e4f5711", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales > 1.1 Estructura de Archivo y Terminación de Sentencias", "start_line": 18, "end_line": 21, "content": "AVAP es un lenguaje **estrictamente orientado a líneas**. 
Esta decisión de diseño garantiza que el analizador sintáctico (parser) sea extremadamente rápido y determinista, evitando la ambigüedad que sufren lenguajes que permiten declaraciones en múltiples líneas.\n* Cada instrucción lógica (`statement`) debe completarse en una única línea física de texto.\n* El motor reconoce el salto de línea o retorno de carro (``) como el terminador absoluto de la instrucción.\n* No se admite la partición de una instrucción, obligando al programador a escribir un código secuencial, limpio y fácil de depurar.", "metadata": {"complexity": 0}, "token_estimate": 159} +{"chunk_id": "de6ad8755bd4893c", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales > 1.2 Registro de Endpoints (registerEndpoint)", "start_line": 24, "end_line": 27, "content": "El comando `registerEndpoint` es la unidad atómica de configuración en AVAP. Actúa como el puente crítico entre la red externa (HTTP) y el código interno.\n* **Mecánica:** Define la ruta URL, el método HTTP permitido (ej. 
`GET`, `POST`), y la función de entrada principal (Handler).\n* **Seguridad:** El servidor AVAP rechazará automáticamente (con un Error 405) cualquier petición que no coincida con el método especificado.\n* **Middlewares:** Permite inyectar una lista de funciones previas para validar tokens antes de ejecutar el bloque principal.", "metadata": {"complexity": 0}, "token_estimate": 139} +{"chunk_id": "926d88e1a5ac0868", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales > 1.3 Asignación Dinámica y Referencias (addVar)", "start_line": 30, "end_line": 33, "content": "AVAP permite una sintaxis de asignación directa mediante el símbolo `=`, otorgando flexibilidad bajo un estricto control de contexto.\n* **Evaluación en tiempo real:** Cuando el intérprete lee `variable = expresión`, resuelve cualquier operación matemática o lógica utilizando el motor de evaluación subyacente.\n* **El operador de desreferenciación (`$`):** Cuando se utiliza el comando nativo `addVar(copia, $original)`, el prefijo `$` indica al motor que debe buscar en la tabla de símbolos la variable llamada \"original\" y extraer su valor.\n* **Semántica de addVar:** El comando acepta `addVar(valor, variable)` o `addVar(variable, valor)`. Si ambos argumentos son identificadores, el valor del segundo se asigna al primero. 
No está permitido usar dos literales como argumentos.", "metadata": {"complexity": 0}, "token_estimate": 201} +{"chunk_id": "5c30935931a47a71", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales > Especificación BNF (Sección I)", "start_line": 37, "end_line": 80, "content": "```bnf\n ::= ( | )*\n ::= [ ] [ | ] \n | ( | ) \n ::= /* Retorno de carro / Salto de línea (\\n o \\r\\n) */\n\n ::= \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n | \n\n ::= \"=\" \n\n/* Llamada a función global (sin receptor de objeto) */\n ::= \"(\" [] \")\"\n\n/* Llamada a método sobre un objeto conector (con receptor) */\n ::= \"=\" \".\" \"(\" [] \")\"\n\n ::= | \n ::= \"registerEndpoint(\" \",\" \",\" \",\" \",\" \",\" \")\"\n/* addVar asigna un valor a una variable. Acepta (valor, variable) o (variable, valor).\n Si ambos argumentos son identificadores, el valor del segundo se asigna al primero.\n No está permitido pasar dos literales como argumentos. */\n ::= \"addVar(\" \",\" \")\"\n ::= | | \"$\" \n/* Restricción semántica: al menos uno de los dos debe ser */\n\n ::= [a-zA-Z_] [a-zA-Z0-9_]*\n\n/* Variables de sistema reservadas — accesibles y asignables desde cualquier scope:\n _status — código HTTP de respuesta (ej. 
addVar(_status, 401) o _status = 404) */\n ::= \"_status\"\n```", "metadata": {"uses_auth": true, "registers_endpoint": true, "complexity": 2}, "token_estimate": 511} +{"chunk_id": "d4d70d35c8ec7325", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN I: Arquitectura, Memoria y Fundamentos Estructurales > Especificación BNF (Sección I)", "start_line": 81, "end_line": 81, "content": "---", "metadata": {"complexity": 0}, "token_estimate": 1} +{"chunk_id": "10944d208c9da6f4", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN II: Gestión de Entrada y Salida (I/O)", "start_line": 85, "end_line": 85, "content": "Esta sección describe los mecanismos que AVAP utiliza para la ingesta de datos externos, la validación de la integridad de los parámetros y la construcción del paquete de respuesta HTTP. AVAP no posee comandos de impresión interna (como `print`); toda salida de datos se realiza a través de la interfaz HTTP.", "metadata": {"complexity": 0}, "token_estimate": 79} +{"chunk_id": "c384770f48495e01", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN II: Gestión de Entrada y Salida (I/O) > 2.1 Captura Inteligente de Parámetros (addParam)", "start_line": 89, "end_line": 89, "content": "El comando `addParam(parametro, destino)` inspecciona la petición HTTP en un orden jerárquico estricto: primero en la URL (Query arguments), luego en el JSON Body, y finalmente en el Form Data. 
Si el parámetro solicitado no existe, la variable de destino se inicializa como `None`.", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 72} +{"chunk_id": "89e708750d7aad10", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN II: Gestión de Entrada y Salida (I/O) > 2.2 Validación y Colecciones (getListLen / getQueryParamList)", "start_line": 92, "end_line": 93, "content": "* **`getListLen(fuente, destino)`**: Actúa como un inspector de volumen. Cuenta cuántos elementos hay en una lista o cadena.\n* **`getQueryParamList(parametro, lista_destino)`**: Empaqueta automáticamente múltiples ocurrencias de un parámetro de URL (ej. `?filtro=A&filtro=B`) en una única estructura de lista.", "metadata": {"uses_list": true, "complexity": 1}, "token_estimate": 84} +{"chunk_id": "6596e38ff8166537", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN II: Gestión de Entrada y Salida (I/O) > 2.3 Construcción de Respuesta (addResult y _status)", "start_line": 96, "end_line": 96, "content": "El comando `addResult(variable)` es el encargado de registrar qué variables formarán parte del cuerpo JSON de la respuesta final. 
La variable de sistema `_status` permite definir explícitamente el código HTTP de salida tanto mediante asignación directa (`_status = 404`) como mediante `addVar(_status, 401)`.", "metadata": {"uses_auth": true, "returns_result": true, "complexity": 2}, "token_estimate": 72} +{"chunk_id": "0c688f58b62acad3", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN II: Gestión de Entrada y Salida (I/O) > Especificación BNF (Sección II)", "start_line": 100, "end_line": 106, "content": "```bnf\n ::= | | | \n ::= \"addParam(\" \",\" \")\"\n ::= \"getListLen(\" \",\" \")\"\n ::= \"getQueryParamList(\" \",\" \")\"\n ::= \"addResult(\" \")\"\n```", "metadata": {"uses_auth": true, "uses_list": true, "returns_result": true, "complexity": 3}, "token_estimate": 112} +{"chunk_id": "08e0210bd8a6e9f5", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión", "start_line": 111, "end_line": 111, "content": "AVAP utiliza una gramática estructural mixta. Combina la fluidez de las palabras clave para abrir bloques funcionales con la seguridad matemática de cierres estrictos.", "metadata": {"complexity": 0}, "token_estimate": 41} +{"chunk_id": "fd4ab34e1a2e1505", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > 3.1 El Bloque Condicional (if() / else() / end())", "start_line": 115, "end_line": 118, "content": "La estructura `if()` evalúa una expresión lógica o de comparación. 
Todo bloque condicional requiere un cierre explícito utilizando el comando `end()`.\n\nEl comando `if()` soporta dos modos de invocación:\n* **Modo 1 (comparación estructurada):** `if(variable, valor, comparador)` — evalúa la comparación entre variable y valor usando el operador indicado como string (ej. `\"==\"`, `\">\"`, `\"!=\"`). Los dos primeros argumentos deben ser identificadores simples o literales, nunca expresiones de acceso como `dict['clave']`. Si se necesita comparar un valor extraído de una estructura, debe asignarse primero a una variable.* **Modo 2 (expresión libre):** `if(None, None, expresion_compleja)` — evalúa directamente una expresión booleana compleja proporcionada como string encapsulado entre `.", "metadata": {"complexity": 0}, "token_estimate": 204} +{"chunk_id": "6a4ee7e79b875a95", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > 3.1 El Bloque Condicional (if() / else() / end())", "start_line": 125, "end_line": 133, "content": "El comando `if()` gestiona la lógica condicional mediante dos modos de invocación estrictamente diferenciados. Es imperativo respetar los delimitadores y la posición de los argumentos.\n\n#### Modo 1: Comparación Estructurada (Atómica)\nSe utiliza para comparaciones directas entre dos valores simples.\n* **Sintaxis:** `if(átomo_1, átomo_2, \"operador\")`\n* **Argumentos 1 y 2:** Deben ser identificadores simples (variables) o literales (strings/números). **No se permite el uso de `None` en este modo.**\n* **Argumento 3:** El operador de comparación debe ir obligatoriamente entre **comillas dobles** (`\"==\"`, `\"!=\"`, `\">\"`, `\"<\"`, `\">=\"`, `\"<=\"`).\n* **Restricción:** No se permiten expresiones de acceso (ej. `data.user` o `list[0]`). 
Estos valores deben asignarse previamente a una variable.\n* **Ejemplo correcto:** `if(reintentos, 5, \"<\")`", "metadata": {"complexity": 0}, "token_estimate": 253} +{"chunk_id": "3986628fd0bade4b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > 3.1 El Bloque Condicional (if() / else() / end())", "start_line": 125, "end_line": 132, "content": "#### Modo 2: Expresión Libre (Evaluación Compleja)\nSe utiliza para evaluar expresiones lógicas que no encajan en la estructura atómica.\n* **Sintaxis:** `if(None, None, `expresión_compleja`)`\n* **Argumentos 1 y 2:** Deben ser literalmente la palabra `None` (sin comillas).\n* **Argumento 3:** La expresión completa **debe** estar encapsulada entre **acentos graves (backticks)**. Esto permite incluir lógica interna, operadores `and/or` y accesos a estructuras de datos.\n* **Ejemplo correcto:** `if(None, None, `user.id > 10 and email.contains(\"@\")`)`\n\n---", "metadata": {"complexity": 0}, "token_estimate": 168} +{"chunk_id": "fccb6c308f9b078a", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "table", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > Tabla de Validación para el Modelo", "start_line": 146, "end_line": 152, "content": "| Entrada | Estado | Razón |\n| :--- | :--- | :--- |\n| `if(count, 10, \"==\")` | ✅ VÁLIDO | Modo 1: Átomos válidos y operador entre comillas. |\n| `if(None, None, `val > 0`)` | ✅ VÁLIDO | Modo 2: Uso correcto de `None` y backticks. |\n| `if(username, None, \"==\")` | ❌ ERROR | El Modo 1 prohíbe el uso de `None`. Debe usarse el Modo 2. |\n| `if(None, None, \"val > 0\")` | ❌ ERROR | El Modo 2 requiere backticks (`` ` ``), no comillas. 
|\n| `if(user.id, 10, \"==\")` | ❌ ERROR | El Modo 1 no permite expresiones de acceso (`.`). |", "metadata": {"complexity": 0}, "token_estimate": 207} +{"chunk_id": "e55b937828573e96", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > 3.2 Iteraciones Estrictas y Deterministas (startLoop / endLoop)", "start_line": 155, "end_line": 158, "content": "Para garantizar el determinismo y evitar el colapso de memoria:\n* Los bucles se definen mediante `startLoop(contador, inicio, fin)`. Solo iteran basándose en índices numéricos finitos.\n* El bloque debe cerrarse obligatoriamente con `endLoop()`.\n* La forma de salir anticipadamente es invocando el comando global `return()`.", "metadata": {"uses_loop": true, "complexity": 1}, "token_estimate": 83} +{"chunk_id": "44619b4a5d7c2a6a", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > 3.3 Gestión de Errores en Tiempo de Ejecución (try() / exception() / end())", "start_line": 161, "end_line": 162, "content": "Diseñada para proteger la estabilidad del servidor ante fallos de I/O.\n* Si ocurre un fallo del sistema dentro del bloque `try`, el flujo salta al bloque `exception(variable_error)`, poblando la variable con la traza para facilitar la recuperación del script.", "metadata": {"complexity": 0}, "token_estimate": 64} +{"chunk_id": "a66e1f02c3b2af0e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN III: Lógica de Control y Estructuras de Decisión > Especificación BNF (Sección III)", "start_line": 166, "end_line": 197, "content": "```bnf\n ::= | | \n\n ::= \"if(\" \")\" \n 
\n [ \"else()\" ]\n \"end()\" \n\n ::= | \n\n ::= \"if\" \"(\" \",\" \",\" \")\"\n ::= \"if\" \"(\" \"None\" \",\" \"None\" \",\" \")\"\n\n ::= | \n ::= \"`\" \"`\"\n\n ::= [a-zA-Z_][a-zA-Z0-9_]*\n::= | \n/* Nota: NO incluye la palabra \"None\" */\n\n ::= \"startLoop(\" \",\" \",\" \")\" \n \n \"endLoop()\" \n\n ::= \"try()\" \n \n \"exception(\" \")\" \n \n \"end()\" \n\n ::= *\n```", "metadata": {"uses_error_handling": true, "uses_loop": true, "complexity": 2}, "token_estimate": 309} +{"chunk_id": "11333d2fac62c3e9", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN IV: Concurrencia y Asincronía", "start_line": 202, "end_line": 202, "content": "Implementa un sistema avanzado basado en hilos ligeros (gorutinas), permitiendo que el servidor procese operaciones de E/S largas sin bloquear el hilo principal.", "metadata": {"complexity": 0}, "token_estimate": 40} +{"chunk_id": "73dc6006bf8b2750", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN IV: Concurrencia y Asincronía > 4.1 Comando Lanzador (go)", "start_line": 206, "end_line": 207, "content": "* **Sintaxis:** `identificador = go nombre_funcion(parametros)`.\n* **Mecánica:** Crea un nuevo contexto de ejecución aislado. 
Devuelve un identificador único que debe guardarse para interactuar con el hilo posteriormente.", "metadata": {"uses_async": true, "complexity": 1}, "token_estimate": 57} +{"chunk_id": "b084cc827e7ad592", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN IV: Concurrencia y Asincronía > 4.2 Comando Sincronizador (gather)", "start_line": 210, "end_line": 211, "content": "* **Sintaxis:** `resultado = gather(identificador, timeout)`.\n* **Mecánica:** Pausa el hilo principal esperando el resultado. Si se supera el `timeout` especificado, cancela la espera y devuelve `None`.", "metadata": {"uses_async": true, "complexity": 1}, "token_estimate": 55} +{"chunk_id": "2fc32e3d5bbee77a", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN IV: Concurrencia y Asincronía > Especificación BNF (Sección IV)", "start_line": 215, "end_line": 219, "content": "```bnf\n ::= | \n ::= \"=\" \"go\" \"(\" [] \")\"\n ::= \"=\" \"gather(\" [\",\" ] \")\"\n```", "metadata": {"uses_async": true, "complexity": 1}, "token_estimate": 64} +{"chunk_id": "c548654ccca08295", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo", "start_line": 224, "end_line": 224, "content": "Agrupa todas las capacidades de interconexión hacia el exterior, permitiendo consumir integraciones de terceros, APIs externas y administrar bases de datos relacionales sin drivers adicionales.", "metadata": {"complexity": 0}, "token_estimate": 42} +{"chunk_id": "89758e089ae5eeae", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del 
Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > 5.1 Conectores de Terceros (avapConnector)", "start_line": 228, "end_line": 230, "content": "`avapConnector` es el mecanismo de integración con servicios de terceros configurados en la plataforma AVAP. Un conector se registra previamente mediante un UUID único. Al instanciarlo, la variable se convierte en un **objeto proxy** que encapsula credenciales y contexto, exponiendo métodos dinámicos mediante notación de punto.\n\n**Patrón de uso:**", "metadata": {"complexity": 0}, "token_estimate": 88} +{"chunk_id": "3def8aeea87256a1", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > 5.1 Conectores de Terceros (avapConnector)", "start_line": 232, "end_line": 242, "content": "```avap\n// 1. Instanciar el conector usando su UUID\nbelvo_connector = avapConnector(\"20908e93260147acb2636967021fbf5d\")\n\n// 2. Invocar métodos dinámicos (resueltos en runtime)\ninstitutions = belvo_connector.list_institutions()\nbalances = belvo_connector.get_balances(link, account_id)\n\n// 3. Resultado tratable como variable estándar\naddResult(balances)\n```", "metadata": {"uses_connector": true, "returns_result": true, "complexity": 2}, "token_estimate": 106} +{"chunk_id": "44548bb3d2f94cc2", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > 5.2 Cliente HTTP Externo (RequestPost / RequestGet)", "start_line": 245, "end_line": 248, "content": "Para evitar hilos bloqueados por latencia de red, AVAP exige un parámetro de **timeout** (en milisegundos). 
Si se supera, la variable destino recibe `None`.\n\n* **`RequestPost(url, querystring, headers, body, destino, timeout)`**: Ejecuta un POST almacenando la respuesta en `destino`.\n* **`RequestGet(url, querystring, headers, destino, timeout)`**: Ejecuta un GET omitiendo el cuerpo.", "metadata": {"uses_http": true, "complexity": 1}, "token_estimate": 104} +{"chunk_id": "90c517b760b54c04", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > 5.3 Conector de Bases de Datos y ORM", "start_line": 252, "end_line": 261, "content": "AVAP utiliza `avapConnector(\"TOKEN\")` para la hidratación segura de credenciales. Las operaciones se ejecutan sobre una tabla específica definida por el parámetro `tableName`.\n\n* **`ormCheckTable(tableName, varTarget)`**: Verifica la existencia de una tabla en la base de datos conectada.\n* **`ormCreateTable(fields, fieldsType, tableName, varTarget)`**: Comando DDL para creación de tablas.\n* **`ormAccessSelect(fields, tableName, selector, varTarget)`**: Recupera registros. `fields` acepta `*` o lista de campos. El `selector` es la cláusula WHERE (puede estar vacío). Devuelve una lista de diccionarios.\n* **`ormAccessInsert(fieldsValues, tableName, varTarget)`**: Inserción parametrizada de registros en la tabla `tableName`.\n* **`ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget)`**: Modifica registros existentes. 
El `selector` es obligatorio para delimitar el alcance del cambio en la tabla `tableName`.\n* **`ormDirect(sentencia, destino)`**: Ejecución de SQL crudo para consultas analíticas complejas.\n\n---", "metadata": {"uses_orm": true, "uses_connector": true, "complexity": 2}, "token_estimate": 266} +{"chunk_id": "8bf39cab443ec928", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > Especificación BNF (Sección V)", "start_line": 268, "end_line": 294, "content": "```bnf\n/* Instanciación de conector de terceros y llamada a sus métodos dinámicos */\n ::= | \n ::= \"=\" \"avapConnector(\" \")\"\n ::= [ \"=\" ] \".\" \"(\" [] \")\"\n\n/* Cliente HTTP con Timeout Obligatorio */\n ::= | \n ::= \"RequestPost(\" \",\" \",\" \",\" \",\" \",\" \")\"\n ::= \"RequestGet(\" \",\" \",\" \",\" \",\" \")\"\n\n/* ORM y Persistencia (Estandarizado con tableName) */\n ::= | | | | | \n ::= \"ormDirect(\" \",\" \")\"\n ::= \"ormCheckTable(\" \",\" \")\"\n ::= \"ormCreateTable(\" \",\" \",\" \",\" \")\"\n\n/* ormAccessSelect(fields, tableName, selector, varTarget) */\n ::= \"ormAccessSelect(\" \",\" \",\" [] \",\" \")\"\n ::= \"*\" | \n\n/* ormAccessInsert(fieldsValues, tableName, varTarget) */\n ::= \"ormAccessInsert(\" \",\" \",\" \")\"\n\n/* ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget) */\n ::= \"ormAccessUpdate(\" \",\" \",\" \",\" \",\" \")\"\n```", "metadata": {"uses_orm": true, "uses_http": true, "uses_connector": true, "complexity": 3}, "token_estimate": 438} +{"chunk_id": "1ee459cf710ed983", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "Especificación Técnica Consolidada del Lenguaje AVAP (LRM) > SECCIÓN V: Conectores de Terceros, Peticiones HTTP y ORM Nativo > Especificación BNF (Sección V)", "start_line": 295, "end_line": 297, 
"content": "> **Nota de implementación:** `` se distingue de `` (ORM) únicamente por contexto semántico: el UUID pasado como argumento determina si el adaptador resuelto es un ORM de base de datos o un proxy de terceros. La gramática los trata de forma idéntica; el motor de ejecución selecciona el adaptador apropiado en runtime.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 93} +{"chunk_id": "9c36aa500a211a01", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos", "start_line": 301, "end_line": 303, "content": "AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 38} +{"chunk_id": "21d7c45e60d98b82", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.1 Manipulación Nativa de Listas y Objetos JSON", "start_line": 307, "end_line": 319, "content": "Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos. En AVAP, las listas **no se instancian con literales de array**, sino que se construyen y recorren a través de un conjunto cerrado de comandos especializados:\n\n* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista de un único elemento. Es el punto de entrada canónico para construir una lista desde cero a partir de un valor existente.\n\n* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` (base 0) de una lista. Equivale a un acceso por índice controlado.\n\n* **`getListLen(lista, destino)`**: Calcula el número total de elementos contenidos en `lista` y almacena el resultado entero en `destino`. 
Imprescindible para construir bucles de recorrido seguro y para validar listas antes de acceder a sus índices. Se recomienda llamar siempre a `getListLen` antes de `itemFromList` para evitar accesos fuera de rango.\n\n* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`, almacenándolo en `destino`. El acceso es directo por nombre de propiedad.\n\n* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente. Si la clave ya existe, su valor es sobreescrito.\n\n**Patrón de recorrido típico en AVAP:**", "metadata": {"uses_json": true, "uses_list": true, "complexity": 2}, "token_estimate": 384} +{"chunk_id": "dc9304e8db408667", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.1 Manipulación Nativa de Listas y Objetos JSON", "start_line": 322, "end_line": 333, "content": "```avap\n// 1. Obtener longitud de la lista\ngetListLen(myList, len)\n\n// 2. Iterar con índice controlado\ni = 0\nwhile (i < len) {\n itemFromList(myList, i, currentItem)\n // ... procesar currentItem ...\n i = i + 1\n}\n```", "metadata": {"uses_list": true, "complexity": 1}, "token_estimate": 75} +{"chunk_id": "356328fd14d2cb9c", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.2 Criptografía y Expresiones Regulares", "start_line": 338, "end_line": 342, "content": "* **`encodeSHA256(origen, destino)`** y **`encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas y la verificación de integridad de datos. 
SHA-256 produce un digest de 64 caracteres hexadecimales y ofrece mayor resistencia criptográfica que MD5 (32 caracteres); se recomienda SHA-256 para nuevos desarrollos.\n\n* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo la primera coincidencia exacta encontrada. El patrón sigue la sintaxis estándar compatible con Python `re`.\n\n---", "metadata": {"uses_crypto": true, "uses_regex": true, "complexity": 2}, "token_estimate": 166} +{"chunk_id": "e30e00ffbad9299e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.3 Transformación de Tiempo y Cadenas > Fechas y Timestamps", "start_line": 348, "end_line": 348, "content": "AVAP provee tres comandos complementarios para cubrir todas las conversiones posibles entre representaciones de tiempo. Los tres soportan formatos de calendario en notación `strftime` de Python y cálculos con `TimeDelta` expresados en segundos (positivo para sumar, negativo para restar):", "metadata": {"complexity": 0}, "token_estimate": 69} +{"chunk_id": "418ad7a6e4e5f85d", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "table", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.3 Transformación de Tiempo y Cadenas > Fechas y Timestamps", "start_line": 351, "end_line": 355, "content": "| Comando | Entrada | Salida |\n|---|---|---|\n| `getTimeStamp(fecha_string, formato, timedelta, destino)` | String de fecha | Epoch (entero) |\n| `stampToDatetime(epoch, formato, timedelta, destino)` | Epoch (entero) | String de fecha |\n| `getDateTime(formato, timedelta, zona_horaria, destino)` | — (ahora mismo) | String de fecha |", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 93} +{"chunk_id": "069dfe5b704bb29f", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": 
"narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.3 Transformación de Tiempo y Cadenas > Fechas y Timestamps", "start_line": 356, "end_line": 360, "content": "* **`getTimeStamp(fecha_string, formato, timedelta, destino)`**: Convierte un string de fecha legible a su valor Epoch (entero Unix). Útil para almacenar fechas y realizar cálculos aritméticos sobre ellas.\n\n* **`stampToDatetime(epoch, formato, timedelta, destino)`**: Convierte un valor Epoch a un string de fecha con el formato especificado. Útil para presentar timestamps almacenados de forma legible.\n\n* **`getDateTime(formato, timedelta, zona_horaria, destino)`**: Captura la fecha y hora actuales del sistema, aplica el ajuste `timedelta` y las convierte a la `zona_horaria` indicada antes de almacenar el resultado. Acepta cualquier zona horaria reconocida por la librería `pytz` de Python.", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 179} +{"chunk_id": "70aeed6f69fdb183", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > 6.3 Transformación de Tiempo y Cadenas > Cadenas de Texto", "start_line": 364, "end_line": 368, "content": "* **`randomString(patron, longitud, destino)`**: Genera una cadena aleatoria de `longitud` caracteres cuyos símbolos están restringidos al conjunto definido por `patron` (expresión regular de caracteres). Útil para generar tokens de sesión, contraseñas temporales o identificadores únicos.\n\n* **`replace(origen, patron_busqueda, reemplazo, destino)`**: Localiza todas las ocurrencias de `patron_busqueda` dentro de `origen` y las sustituye por `reemplazo`, almacenando el resultado en `destino`. 
Facilita el saneamiento y normalización de datos de entrada antes de su procesamiento o almacenamiento.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 155} +{"chunk_id": "8f8da55cbe6dd59d", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > BNF — Gramática Formal de los Comandos de Utilidad", "start_line": 373, "end_line": 407, "content": "```bnf\n ::= | | \n | | | | \n\n/* Manipulación de listas y JSON */\n ::= \"variableToList(\" \",\" \")\"\n | \"itemFromList(\" \",\" \",\" \")\"\n | \"getListLen(\" \",\" \")\"\n | \"variableFromJSON(\" \",\" \",\" \")\"\n | \"AddVariableToJSON(\" \",\" \",\" \")\"\n\n/* Criptografía */\n ::= \"encodeSHA256(\" \",\" \")\"\n | \"encodeMD5(\" \",\" \")\"\n\n/* Expresiones regulares */\n ::= \"getRegex(\" \",\" \",\" \")\"\n\n/* Fecha/hora actual -> string */\n ::= \"getDateTime(\" \",\" \",\" \",\" \")\"\n/* Argumentos: formato_salida, timedelta, zona_horaria, destino */\n\n/* Conversiones epoch ↔ string */\n ::= \"stampToDatetime(\" \",\" \",\" \",\" \")\"\n/* Argumentos: epoch_origen, formato, timedelta, destino */\n | \"getTimeStamp(\" \",\" \",\" \",\" \")\"\n/* Argumentos: fecha_string, formato_entrada, timedelta, destino */\n\n/* Cadenas */\n ::= \"randomString(\" \",\" \",\" \")\"\n/* Argumentos: patron, longitud, destino */\n\n ::= \"replace(\" \",\" \",\" \",\" \")\"\n/* Argumentos: origen, patron_busqueda, reemplazo, destino */\n```", "metadata": {"uses_crypto": true, "uses_json": true, "uses_list": true, "uses_regex": true, "uses_datetime": true, "complexity": 5}, "token_estimate": 443} +{"chunk_id": "c6d44dfa4f20d4ba", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN VII: Arquitectura de Funciones y Ámbitos (Scopes)", "start_line": 413, "end_line": 414, "content": "Las funciones 
son recintos herméticos de memoria. Al entrar en una función, AVAP crea un nuevo diccionario de variables locales aislado del contexto global.\nEl comando `return()` actúa como interruptor de flujo: inyecta el valor calculado al llamador, libera la memoria local, y si se usa dentro de un `startLoop`, rompe la iteración anticipadamente.", "metadata": {"complexity": 0}, "token_estimate": 87} +{"chunk_id": "3eeaf5913a0be091", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN VII: Arquitectura de Funciones y Ámbitos (Scopes) > Especificación BNF (Sección VII)", "start_line": 419, "end_line": 429, "content": "```bnf\n/* Nota: las funciones utilizan llaves {} como delimitadores de bloque por decisión\n arquitectónica explícita, diferenciándose de las estructuras de control (if, loop, try)\n que usan palabras clave de cierre (end(), endLoop()). Ambos patrones coexisten\n en la gramática y el parser los distingue por el token de apertura. */\n ::= \"function\" \"(\" [] \")\" \"{\" \n \n \"}\" \n ::= (\",\" )*\n ::= \"return(\" [] \")\"\n```", "metadata": {"complexity": 0}, "token_estimate": 158} +{"chunk_id": "a21981b11b385a44", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN VIII: Modularidad e Inclusiones", "start_line": 434, "end_line": 435, "content": "* **Inclusión Estática (`include`)**: Directiva de preprocesador que pega el contenido de un fichero físico en la línea actual.\n* **Librerías (`import`)**: Carga colecciones de funciones. 
Corchetes angulares (`import `) para nativas, comillas (`import \"mis_utils\"`) para locales.", "metadata": {"complexity": 0}, "token_estimate": 79} +{"chunk_id": "1cb62ad40dde6e03", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN VIII: Modularidad e Inclusiones > Especificación BNF (Sección VIII)", "start_line": 440, "end_line": 444, "content": "```bnf\n ::= | \n ::= \"include\" \" \" \n ::= \"import\" \" \" ( \"<\" \">\" | )\n```", "metadata": {"complexity": 0}, "token_estimate": 60} +{"chunk_id": "8353b49d77752023", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN IX: Expresiones y Gramática Léxica Estricta", "start_line": 449, "end_line": 449, "content": "Esta sección es el corazón matemático evaluador de AVAP. Define la jerarquía exacta (Precedencia) y provee soporte nativo para características avanzadas similares a Python.", "metadata": {"complexity": 0}, "token_estimate": 45} +{"chunk_id": "3ef2bf52da198594", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN IX: Expresiones y Gramática Léxica Estricta > 9.1 Cast de Tipos Explícito", "start_line": 453, "end_line": 453, "content": "AVAP permite conversiones de tipos (Type Casting) en cualquier evaluación utilizando funciones constructoras estándar. 
Puedes transformar variables dinámicamente usando `int(var)`, `float(var)` o `str(var)`.", "metadata": {"complexity": 0}, "token_estimate": 49} +{"chunk_id": "d64846e65a09ba05", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN IX: Expresiones y Gramática Léxica Estricta > 9.2 Slicing y Comprensiones (Comprehensions)", "start_line": 456, "end_line": 457, "content": "* **Slicing (Cortes):** Puedes extraer fragmentos de listas o strings utilizando la notación de dos puntos. Ejemplo: `mi_lista[1:4]` (extrae desde el índice 1 hasta el 3).\n* **Comprehensions:** AVAP soporta la construcción rápida de listas mediante iteradores en una sola línea, permitiendo filtrar y mapear colecciones enteras (ej. `[x * 2 for x in valores if x > 0]`).", "metadata": {"complexity": 0}, "token_estimate": 115} +{"chunk_id": "ef984b8dd1da3bf5", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN IX: Expresiones y Gramática Léxica Estricta > 9.3 Análisis Léxico (Lexer) y Documentación", "start_line": 460, "end_line": 463, "content": "AVAP cuenta con tres niveles de descarte de texto para anotaciones humanas:\n1. **Comentarios de Línea (`//`):** Ignora el texto hasta el salto de línea.\n2. **Comentarios de Bloque (`/* ... */`):** Para aislar bloques enteros multilínea.\n3. 
**Comentarios de Documentación (`///`):** Utilizados por analizadores de código o IDEs para generar documentación técnica automática (Docstrings) a partir del código fuente.", "metadata": {"complexity": 0}, "token_estimate": 114} +{"chunk_id": "d9dcbc4914a55b0e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "bnf", "block_type": "bnf", "section": "SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos > SECCIÓN IX: Expresiones y Gramática Léxica Estricta > Especificación BNF (Sección IX)", "start_line": 467, "end_line": 530, "content": "```bnf\n/* Jerarquía de Expresiones (Precedencia de menor a mayor) */\n ::= \n ::= ( \"or\" )*\n ::= ( \"and\" )*\n ::= \"not\" | \n\n ::= ( )*\n ::= \"==\" | \"!=\" | \"<\" | \">\" | \"<=\" | \">=\" | \"in\" | \"is\"\n\n ::= ( ( \"+\" | \"-\" ) )*\n ::= ( ( \"*\" | \"/\" | \"%\" ) )*\n ::= ( \"+\" | \"-\" ) | \n ::= [ \"**\" ]\n\n/* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones)\n La regla cubre también el acceso a métodos de objetos conector\n (conector.metodo(...)) y el acceso por clave a sus resultados (resultado[\"key\"]) */\n ::= \n | \".\" \n | \"[\" \"]\"\n | \"[\" [] \":\" [] [\":\" []] \"]\"\n | \"(\" [] \")\"\n\n ::= \n | \"$\" \n | \n | \"(\" \")\"\n | \n | \n\n/* Estructuras de Datos, Comprensiones y Argumentos */\n ::= \"[\" [] \"]\"\n | \"[\" \"for\" \"in\" [] \"]\"\n ::= \"if\" \n ::= \"{\" [] \"}\"\n ::= ( \",\" )*\n ::= \":\" \n ::= ( \",\" )*\n\n/* Tipo numérico unificado */\n ::= | \n\n/* Literales (Tipos de Datos Primitivos Soportados) */\n ::= | | | \"None\"\n ::= \"True\" | \"False\"\n ::= [0-9]+\n ::= [0-9]+ \".\" [0-9]* | \".\" [0-9]+\n\n/* Cadenas de Texto con soporte de secuencias de escape */\n ::= \"\\\"\" \"\\\"\" | \"'\" \"'\"\n ::= \"\\\\\" ( \"\\\"\" | \"'\" | \"\\\\\" | \"n\" | \"t\" | \"r\" | \"0\" )\n ::= ( [^\"\\\\] | )*\n ::= ( [^'\\\\] | )*\n ::= | \n\n/* Reglas de Comentarios para el Lexer\n El lexer aplica longest-match: /// debe evaluarse ANTES que // 
*/\n ::= \"///\" \n ::= \"//\" \n ::= \"/*\" \"*/\"\n ::= [^\\r\\n]*\n ::= /* Cualquier secuencia de caracteres que no contenga la subcadena \"*/\" */\n```", "metadata": {"complexity": 0}, "token_estimate": 833} +{"chunk_id": "1a283ddb2d395d2e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "APÉNDICE X: Especificación Léxica de AVAP", "start_line": 533, "end_line": 538, "content": "Este apéndice define las reglas del **analizador léxico (lexer)** del lenguaje AVAP. \nEl lexer transforma el código fuente en una secuencia de **tokens**, que posteriormente son consumidos por el parser descrito en la gramática BNF.\n\nEl análisis léxico sigue el principio de **máxima coincidencia (longest match)**: cuando múltiples reglas pueden coincidir con el mismo texto, se selecciona la coincidencia más larga.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 109} +{"chunk_id": "0433456477979413", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.1 Espacios en blanco y separadores", "start_line": 542, "end_line": 542, "content": "Los siguientes caracteres se ignoran excepto cuando forman parte de literales o comentarios.", "metadata": {"complexity": 0}, "token_estimate": 18} +{"chunk_id": "355934ebd06425c5", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.1 Espacios en blanco y separadores", "start_line": 545, "end_line": 548, "content": "```regex\nWHITESPACE ::= [ \\t]+\nEOL ::= \\r\\n | \\n | \\r\n```", "metadata": {"complexity": 0}, "token_estimate": 28} +{"chunk_id": "bad9116e87d54385", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.1 Espacios en blanco y separadores", "start_line": 549, "end_line": 555, "content": "Reglas:\n\n- `WHITESPACE` se ignora\n- `EOL` genera el token **EOL**, que actúa como terminador de sentencia\n- AVAP es un 
lenguaje **orientado a líneas**, por lo que las sentencias no pueden dividirse en múltiples líneas.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 68} +{"chunk_id": "070a7e4aa025eee8", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.2 Comentarios", "start_line": 559, "end_line": 559, "content": "AVAP soporta tres tipos de comentarios. El lexer aplica longest-match, por lo que `///` debe reconocerse **antes** que `//`.", "metadata": {"complexity": 0}, "token_estimate": 34} +{"chunk_id": "3c0f88dc459e1aa4", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.2 Comentarios > Comentario de documentación (mayor prioridad)", "start_line": 564, "end_line": 566, "content": "```regex\nDOC_COMMENT ::= \"///\"[^\\r\\n]*\n```", "metadata": {"complexity": 0}, "token_estimate": 15} +{"chunk_id": "a4cd287486836c9e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.2 Comentarios > Comentario de documentación (mayor prioridad)", "start_line": 567, "end_line": 569, "content": "Se utiliza para generar documentación automática o anotaciones de herramientas.\n\nEjemplo:", "metadata": {"complexity": 0}, "token_estimate": 20} +{"chunk_id": "80c6a5349e60c83e", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "X.2 Comentarios > Comentario de documentación (mayor prioridad)", "start_line": 572, "end_line": 574, "content": "```avap\n/// obtiene el balance del usuario\n```", "metadata": {"complexity": 0}, "token_estimate": 13} +{"chunk_id": "c8c98ea294c04cd2", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.2 Comentarios > Comentario de línea", "start_line": 580, "end_line": 582, "content": "```regex\nLINE_COMMENT ::= \"//\"[^\\r\\n]*\n```", "metadata": {"complexity": 0}, "token_estimate": 14} 
+{"chunk_id": "0d33a6ce642cacc9", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.2 Comentarios > Comentario de línea", "start_line": 583, "end_line": 583, "content": "Ejemplo:", "metadata": {"complexity": 0}, "token_estimate": 4} +{"chunk_id": "a758685e1a878d59", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "X.2 Comentarios > Comentario de línea", "start_line": 586, "end_line": 588, "content": "```avap\n// comentario\n```", "metadata": {"complexity": 0}, "token_estimate": 8} +{"chunk_id": "3b04f22536af08a3", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.2 Comentarios > Comentario de línea", "start_line": 589, "end_line": 591, "content": "El texto se ignora hasta el final de la línea.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 13} +{"chunk_id": "06917fa36b322f7b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.2 Comentarios > Comentario de bloque", "start_line": 596, "end_line": 598, "content": "```regex\nBLOCK_COMMENT ::= \"/*\" .*? 
\"*/\"\n```", "metadata": {"complexity": 0}, "token_estimate": 15} +{"chunk_id": "7e664fbf890d19ec", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.2 Comentarios > Comentario de bloque", "start_line": 599, "end_line": 601, "content": "Puede abarcar múltiples líneas.\n\nEjemplo:", "metadata": {"complexity": 0}, "token_estimate": 15} +{"chunk_id": "97f6d11951708550", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "X.2 Comentarios > Comentario de bloque", "start_line": 604, "end_line": 607, "content": "```avap\n/* comentario\n multilinea */\n```", "metadata": {"complexity": 0}, "token_estimate": 12} +{"chunk_id": "96797fa8cdcb0bb8", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.3 Identificadores", "start_line": 612, "end_line": 612, "content": "Los identificadores representan nombres de variables, funciones o parámetros.", "metadata": {"complexity": 0}, "token_estimate": 16} +{"chunk_id": "f08450f4b076af96", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.3 Identificadores", "start_line": 615, "end_line": 617, "content": "```regex\nIDENTIFIER ::= [a-zA-Z_][a-zA-Z0-9_]*\n```", "metadata": {"complexity": 0}, "token_estimate": 21} +{"chunk_id": "137d14d78249e4bc", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.3 Identificadores", "start_line": 618, "end_line": 627, "content": "Ejemplos válidos:\n\n```\nx\nuser_id\nbalanceTotal\n_connector\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 22} +{"chunk_id": "ce83cd4dd8d3f82c", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas", "start_line": 631, "end_line": 631, "content": "Las siguientes palabras están reservadas y **no pueden utilizarse como 
identificadores**.", "metadata": {"complexity": 0}, "token_estimate": 18} +{"chunk_id": "f638422514566af0", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Control de flujo", "start_line": 635, "end_line": 644, "content": "```\nif\nelse\nend\nstartLoop\nendLoop\ntry\nexception\nreturn\n```", "metadata": {"complexity": 0}, "token_estimate": 21} +{"chunk_id": "0f324b0730ebd2e2", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Declaración de funciones", "start_line": 648, "end_line": 650, "content": "```\nfunction\n```", "metadata": {"complexity": 0}, "token_estimate": 5} +{"chunk_id": "38b7f5fa2fed4953", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Concurrencia", "start_line": 654, "end_line": 657, "content": "```\ngo\ngather\n```", "metadata": {"complexity": 0}, "token_estimate": 7} +{"chunk_id": "007c79f8edd33043", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Modularidad", "start_line": 661, "end_line": 664, "content": "```\ninclude\nimport\n```", "metadata": {"complexity": 0}, "token_estimate": 7} +{"chunk_id": "6098165b41db2735", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Operadores lógicos", "start_line": 668, "end_line": 674, "content": "```\nand\nor\nnot\nin\nis\n```", "metadata": {"complexity": 0}, "token_estimate": 13} +{"chunk_id": "404ac961095eb856", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.4 Palabras reservadas > Literales", "start_line": 678, "end_line": 684, "content": "```\nTrue\nFalse\nNone\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 11} +{"chunk_id": 
"2b042127c6d731cc", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.5 Operadores > Asignación", "start_line": 690, "end_line": 700, "content": "```\n=\n```\n\nToken:\n\n```\nASSIGN\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 15} +{"chunk_id": "f204bc217eef6166", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.5 Operadores > Operadores aritméticos", "start_line": 704, "end_line": 728, "content": "```\n+\n-\n*\n/\n%\n**\n```\n\nTokens:\n\n```\nPLUS\nMINUS\nMULT\nDIV\nMOD\nPOWER\n```\n\nRegla importante:\n\n`**` debe evaluarse antes que `*` por la regla de **máxima coincidencia**.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 59} +{"chunk_id": "8e87cc07f2c5bb83", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.5 Operadores > Operadores de comparación", "start_line": 732, "end_line": 752, "content": "```\n==\n!=\n<\n>\n<=\n>=\n```\n\nTokens:\n\n```\nEQ\nNEQ\nLT\nGT\nLTE\nGTE\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 34} +{"chunk_id": "0ad578582af128e3", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.5 Operadores > Operadores lógicos", "start_line": 756, "end_line": 770, "content": "```\nand\nor\nnot\n```\n\nTokens:\n\n```\nAND\nOR\nNOT\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 23} +{"chunk_id": "02aad017b7cc9694", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.6 Delimitadores", "start_line": 774, "end_line": 802, "content": "Los siguientes símbolos delimitan estructuras sintácticas.\n\n```\n(\n)\n[\n]\n{\n}\n,\n.\n:\n```\n\nTokens:\n\n```\nLPAREN\nRPAREN\nLBRACKET\nRBRACKET\nLBRACE\nRBRACE\nCOMMA\nDOT\nCOLON\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 66} +{"chunk_id": "03adc6e15a7f8f56", 
"source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.7 Literales > Enteros", "start_line": 809, "end_line": 811, "content": "```regex\nINTEGER ::= [0-9]+\n```", "metadata": {"complexity": 0}, "token_estimate": 12} +{"chunk_id": "858115e1f34bbc73", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.7 Literales > Enteros", "start_line": 812, "end_line": 820, "content": "Ejemplos:\n\n```\n0\n10\n999\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 16} +{"chunk_id": "e210af93c0bf4759", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.7 Literales > Números flotantes", "start_line": 825, "end_line": 827, "content": "```regex\nFLOAT ::= [0-9]+\\.[0-9]* | \\.[0-9]+\n```", "metadata": {"complexity": 0}, "token_estimate": 24} +{"chunk_id": "d1d45c84ab2c0471", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.7 Literales > Números flotantes", "start_line": 828, "end_line": 836, "content": "Ejemplos:\n\n```\n1.0\n3.14\n.5\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 21} +{"chunk_id": "9dff5c20b868241b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.7 Literales > Strings", "start_line": 840, "end_line": 840, "content": "AVAP soporta cadenas con comillas simples y dobles, con soporte de secuencias de escape.", "metadata": {"complexity": 0}, "token_estimate": 25} +{"chunk_id": "15a41be430604349", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "regex", "section": "X.7 Literales > Strings", "start_line": 843, "end_line": 847, "content": "```regex\nSTRING_DOUBLE ::= \"\\\"\" ( [^\"\\\\] | ESCAPE_SEQ )* \"\\\"\"\nSTRING_SINGLE ::= \"'\" ( [^'\\\\] | ESCAPE_SEQ )* \"'\"\nESCAPE_SEQ ::= \"\\\\\" ( '\"' | \"'\" | \"\\\\\" | \"n\" | \"t\" 
| \"r\" | \"0\" )\n```", "metadata": {"complexity": 0}, "token_estimate": 67} +{"chunk_id": "d61fe050c059539b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.7 Literales > Strings", "start_line": 848, "end_line": 856, "content": "Ejemplos:\n\n```\n\"hola\"\n'texto'\n\"https://api.com\"\n```\n\nSecuencias de escape soportadas:", "metadata": {"complexity": 0}, "token_estimate": 30} +{"chunk_id": "4322d659ed25b08a", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "table", "section": "X.7 Literales > Strings", "start_line": 859, "end_line": 867, "content": "| Secuencia | Significado |\n|-----------|-------------------|\n| `\\\"` | Comilla doble |\n| `\\'` | Comilla simple |\n| `\\\\` | Barra invertida |\n| `\\n` | Salto de línea |\n| `\\t` | Tabulación |\n| `\\r` | Retorno de carro |\n| `\\0` | Carácter nulo |", "metadata": {"complexity": 0}, "token_estimate": 97} +{"chunk_id": "09cd03196dee4905", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.7 Literales > Strings", "start_line": 868, "end_line": 870, "content": "> **Nota:** `\\n` dentro de un string es un carácter de datos, no un terminador de sentencia. 
El EOL físico sigue siendo el único terminador.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 40} +{"chunk_id": "89862fa1c7d6fa31", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.8 Literales booleanos", "start_line": 874, "end_line": 881, "content": "Tokens:\n\n```\nTrue\nFalse\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 11} +{"chunk_id": "a1d6aee149860bef", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.9 Literal nulo", "start_line": 885, "end_line": 891, "content": "Token:\n\n```\nNone\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 9} +{"chunk_id": "7ecd779d33d47d65", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.10 Operador de desreferenciación", "start_line": 895, "end_line": 897, "content": "AVAP permite acceder al valor de una variable utilizando el prefijo `$`.\n\nEjemplo:", "metadata": {"complexity": 0}, "token_estimate": 20} +{"chunk_id": "baa9aa4e3a708822", "source_file": "../../../docs/LRM/avap.md", "doc_type": "code_example", "block_type": "avap", "section": "X.10 Operador de desreferenciación", "start_line": 900, "end_line": 902, "content": "```avap\naddVar(copia, $original)\n```", "metadata": {"complexity": 0}, "token_estimate": 13} +{"chunk_id": "ef3ab4b2960421a7", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.10 Operador de desreferenciación", "start_line": 903, "end_line": 909, "content": "Token:\n\n```\nDEREF ::= $\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 11} +{"chunk_id": "abae8d52cca4f34b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.11 Orden de precedencia léxica", "start_line": 913, "end_line": 928, "content": "Para evitar ambigüedades, el lexer debe aplicar el principio **longest match 
first**.\n\nOrden obligatorio:\n\n1. comentarios (`///` antes que `//`, luego `/* */`)\n2. whitespace\n3. palabras reservadas\n4. identificadores\n5. números flotantes\n6. enteros\n7. strings\n8. operadores compuestos (`**`, `==`, `<=`, `>=`, `!=`)\n9. operadores simples\n10. delimitadores\n\n---", "metadata": {"complexity": 0}, "token_estimate": 108} +{"chunk_id": "3b94b2289cc2dfaf", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.12 Separación formal: nivel léxico vs nivel sintáctico", "start_line": 932, "end_line": 942, "content": "```\nNIVEL LÉXICO — produce tokens: IDENTIFIER, INTEGER, FLOAT, STRING,\n operadores, delimitadores, EOL, palabras reservadas.\n\nNIVEL SINTÁCTICO — consume tokens: construye el AST según las reglas BNF\n de las Secciones I–IX.\n```\n\nEl Apéndice X cubre el nivel léxico. Las Secciones I–IX cubren el nivel sintáctico.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 101} +{"chunk_id": "b17c5e47bf3ab720", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.13 Tokens producidos por el lexer", "start_line": 946, "end_line": 994, "content": "El lexer produce los siguientes tokens:\n\n```\nIDENTIFIER\nINTEGER\nFLOAT\nSTRING\n\nASSIGN\nPLUS\nMINUS\nMULT\nDIV\nMOD\nPOWER\n\nEQ\nNEQ\nLT\nGT\nLTE\nGTE\n\nAND\nOR\nNOT\nIN\nIS\n\nLPAREN\nRPAREN\nLBRACKET\nRBRACKET\nLBRACE\nRBRACE\nCOMMA\nDOT\nCOLON\n\nDEREF\n\nTrue\nFalse\nNone\n\nEOL\n```\n\n---", "metadata": {"complexity": 0}, "token_estimate": 103} +{"chunk_id": "8514bf5ba41b03cd", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "X.14 Elementos ignorados por el lexer", "start_line": 998, "end_line": 1007, "content": "Los siguientes elementos se descartan durante el análisis léxico:\n\n```\nWHITESPACE\nLINE_COMMENT\nDOC_COMMENT\nBLOCK_COMMENT\n```\n\nEstos tokens no son enviados al parser.", "metadata": {"complexity": 0}, 
"token_estimate": 41} +{"chunk_id": "ea8e42e2603b690b", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1 Modelo de Memoria y Resolución de Variables", "start_line": 1012, "end_line": 1022, "content": "AVAP utiliza un modelo de memoria basado en **tres tipos de ámbitos (scopes)**:\n\n```\nGlobal Scope\nMain Local Scope\nFunction Scope\n```\n\nCada tipo de ámbito tiene reglas estrictas de visibilidad.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 53} +{"chunk_id": "70c803bfeda2191f", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1.1 Global Scope", "start_line": 1026, "end_line": 1037, "content": "El **Global Scope** contiene variables declaradas como globales y es accesible desde cualquier parte del programa.\n\nPropiedades:\n\n- existe durante toda la vida del proceso del intérprete\n- es visible desde el flujo principal\n- es visible desde todas las funciones\n- es visible desde goroutines\n\nLas variables globales actúan como **estado compartido del programa**.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 80} +{"chunk_id": "e9d01b03575f1839", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1.2 Main Local Scope", "start_line": 1041, "end_line": 1061, "content": "El **Main Local Scope** corresponde al flujo de ejecución principal del script, fuera de cualquier función.\n\nEjemplo:\n\n```\nx = 10\ny = 20\n```\n\nEstas variables son **locales del flujo principal**.\n\nReglas:\n\n- son accesibles dentro del flujo principal\n- **no son accesibles desde funciones**\n- **no son accesibles desde goroutines**\n- desaparecen cuando finaliza la ejecución del script\n\nEsto evita dependencias implícitas entre funciones y el flujo principal.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 117} +{"chunk_id": "bc7fa2e899950fd6", "source_file": "../../../docs/LRM/avap.md", 
"doc_type": "spec", "block_type": "narrative", "section": "XI.1.3 Function Scope", "start_line": 1065, "end_line": 1085, "content": "Cada vez que se invoca una función:\n\n```\nfunction nombre(parametros)\n```\n\nel motor crea un **Function Scope independiente**.\n\nEste ámbito contiene:\n\n- parámetros de la función\n- variables creadas dentro de la función\n- resultados intermedios\n\nPropiedades:\n\n- solo es visible dentro de esa función\n- no es visible desde el exterior\n- se destruye cuando la función termina\n\n---", "metadata": {"complexity": 0}, "token_estimate": 91} +{"chunk_id": "0ea3def2da82aee4", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1.4 Resolución de variables", "start_line": 1089, "end_line": 1100, "content": "La resolución de variables sigue el siguiente orden jerárquico:\n\n```\n1. Function Scope\n2. Global Scope\n```\n\nEl **Main Local Scope no es visible dentro de funciones**.\n\nSi una variable no existe en los scopes visibles, el motor produce un **error de ejecución**.\n\n---", "metadata": {"complexity": 0}, "token_estimate": 64} +{"chunk_id": "0e9ec8d414356b98", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1.5 Aislamiento entre funciones", "start_line": 1104, "end_line": 1121, "content": "Cada invocación de función crea un **scope independiente**.\n\nEjemplo:\n\n```\nfunction ejemplo()\n{\n x = 10\n}\n```\n\nLa variable `x`:\n\n- solo existe dentro de esa ejecución de la función\n- no es visible desde otras funciones\n- no es visible desde el flujo principal\n\n---", "metadata": {"complexity": 0}, "token_estimate": 71} +{"chunk_id": "c32225df6dfcde1d", "source_file": "../../../docs/LRM/avap.md", "doc_type": "spec", "block_type": "narrative", "section": "XI.1.6 Acceso desde goroutines", "start_line": 1125, "end_line": 1137, "content": "Las goroutines creadas mediante:\n\n```\ngo funcion()\n```\n\nsiguen las mismas 
reglas de scope que una función normal.\n\nPor lo tanto:\n\n- pueden acceder a **Global Scope**\n- pueden acceder a su propio **Function Scope**\n- **no pueden acceder al Main Local Scope**", "metadata": {"uses_async": true, "complexity": 1}, "token_estimate": 63} diff --git a/scripts/pipelines/ingestion/requirements.txt b/scripts/pipelines/ingestion/requirements.txt new file mode 100644 index 0000000..2ba6820 --- /dev/null +++ b/scripts/pipelines/ingestion/requirements.txt @@ -0,0 +1,7 @@ +datasketch +tqdm +tiktoken +redis +elasticsearch<9.0.0 +python-dotenv +httpx \ No newline at end of file diff --git a/scripts/pipelines/ingestion/test.py b/scripts/pipelines/ingestion/test.py new file mode 100644 index 0000000..a80f2ac --- /dev/null +++ b/scripts/pipelines/ingestion/test.py @@ -0,0 +1,7 @@ +import socket +try: + test_sock = socket.create_connection(("127.0.0.1", 9200), timeout=2) + print(" --> DEBUG: ¡El puerto 9200 está abierto para Python!") + test_sock.close() +except Exception as e: + print(f" --> DEBUG: Error de socket puro: {e}")