Compare commits
1 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
27fb889275 |
|
|
@ -1,55 +0,0 @@
|
||||||
# CODEOWNERS
|
|
||||||
#
|
|
||||||
# Ownership and review rules for the Brunix Assistance Engine repository.
|
|
||||||
#
|
|
||||||
# Teams:
|
|
||||||
# @BRUNIX-AI/engineering — Core engineering team. Owns the production
|
|
||||||
# codebase, infrastructure, gRPC contract, and all architectural decisions.
|
|
||||||
# Required reviewer on every pull request targeting `online`.
|
|
||||||
#
|
|
||||||
# @BRUNIX-AI/research — Scientific research team. Responsible for RAG
|
|
||||||
# evaluation, embedding model benchmarking, dataset generation, and
|
|
||||||
# experiment documentation. Write access to research/ and docs/product/.
|
|
||||||
# All changes to production code require review from engineering.
|
|
||||||
#
|
|
||||||
# This file is enforced by GitHub branch protection rules on `online`.
|
|
||||||
# See: Settings → Branches → online → Require review from Code Owners
|
|
||||||
|
|
||||||
# Default — every PR requires engineering approval
|
|
||||||
* @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# ── Production engine ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
# gRPC contract — any change requires explicit CTO sign-off
|
|
||||||
Docker/protos/brunix.proto @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# Core engine — graph, server, prompts, state, evaluation
|
|
||||||
Docker/src/ @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# ── Ingestion & knowledge base ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
# Ingestion pipelines
|
|
||||||
scripts/pipelines/ @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# Grammar config — any change requires a full index rebuild
|
|
||||||
scripts/pipelines/ingestion/avap_config.json @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# Golden dataset — any change requires a new EvaluateRAG baseline before merging
|
|
||||||
Docker/src/golden_dataset.json @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# ── Research ─────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
# Research folder — managed by the research team, no engineering approval needed
|
|
||||||
# for experiment documentation, benchmarks and datasets
|
|
||||||
research/ @BRUNIX-AI/research @BRUNIX-AI/engineering
|
|
||||||
|
|
||||||
# ── Governance & documentation ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
# ADRs and PRDs — all decisions require CTO approval
|
|
||||||
docs/ADR/ @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
docs/product/ @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
|
|
||||||
# Governance documents
|
|
||||||
CONTRIBUTING.md @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
SECURITY.md @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
.github/ @BRUNIX-AI/engineering @rafa-ruiz
|
|
||||||
129
CONTRIBUTING.md
129
CONTRIBUTING.md
|
|
@ -15,9 +15,7 @@
|
||||||
7. [Changelog Policy](#7-changelog-policy)
|
7. [Changelog Policy](#7-changelog-policy)
|
||||||
8. [Documentation Policy](#8-documentation-policy)
|
8. [Documentation Policy](#8-documentation-policy)
|
||||||
9. [Architecture Decision Records (ADRs)](#9-architecture-decision-records-adrs)
|
9. [Architecture Decision Records (ADRs)](#9-architecture-decision-records-adrs)
|
||||||
10. [Product Requirements Documents (PRDs)](#10-product-requirements-documents-prds)
|
10. [Incident & Blockage Reporting](#10-incident--blockage-reporting)
|
||||||
11. [Research & Experiments Policy](#11-research--experiments-policy)
|
|
||||||
12. [Incident & Blockage Reporting](#12-incident--blockage-reporting)
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -94,16 +92,14 @@ A PR is not ready for review unless **all applicable items** in the following ch
|
||||||
- [ ] No new environment variables were introduced
|
- [ ] No new environment variables were introduced
|
||||||
- [ ] New environment variables are documented in the `.env` reference table in `README.md`
|
- [ ] New environment variables are documented in the `.env` reference table in `README.md`
|
||||||
|
|
||||||
**Changelog** *(see [Section 7](#7-changelog-policy))*
|
**Changelog** *(see [Section 6](#6-changelog-policy))*
|
||||||
- [ ] No changelog entry required (internal refactor, comment/typo fix, zero behavioral change)
|
- [ ] No changelog entry required (internal refactor, comment/typo fix, zero behavioral change)
|
||||||
- [ ] Changelog updated with correct version bump and date
|
- [ ] Changelog updated with correct version bump and date
|
||||||
|
|
||||||
**Documentation** *(see [Section 8](#8-documentation-policy))*
|
**Documentation** *(see [Section 8](#8-documentation-policy))*
|
||||||
- [ ] No documentation update required (internal change, no impact on setup or API)
|
- [ ] No documentation update required (internal change, no impact on setup or API)
|
||||||
- [ ] `README.md` or relevant docs updated to reflect this change
|
- [ ] `README.md` or relevant docs updated to reflect this change
|
||||||
- [ ] If a significant architectural decision was made, an ADR was created in `docs/ADR/`
|
- [ ] If a significant architectural decision was made, an ADR was created in `docs/adr/`
|
||||||
- [ ] If a new user-facing feature was introduced, a PRD was created in `docs/product/`
|
|
||||||
- [ ] If an experiment was conducted, results were documented in `research/`
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -174,10 +170,10 @@ The `changelog` file tracks all notable changes and follows [Semantic Versioning
|
||||||
|
|
||||||
### Format
|
### Format
|
||||||
|
|
||||||
New entries go under `[Unreleased]` at the top of the file. When a PR merges, `[Unreleased]` is renamed to the new version with its date:
|
New entries go at the top of the file, above the previous version:
|
||||||
|
|
||||||
```
|
```
|
||||||
## [Unreleased]
|
## [X.Y.Z] - YYYY-MM-DD
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- LABEL: Description of the new feature or capability.
|
- LABEL: Description of the new feature or capability.
|
||||||
|
|
@ -189,7 +185,7 @@ New entries go under `[Unreleased]` at the top of the file. When a PR merges, `[
|
||||||
- LABEL: Description of the bug resolved.
|
- LABEL: Description of the bug resolved.
|
||||||
```
|
```
|
||||||
|
|
||||||
Use uppercase short labels for scanability: `ENGINE:`, `API:`, `PROTO:`, `DOCKER:`, `INFRA:`, `SECURITY:`, `ENV:`, `CONFIG:`, `DOCS:`, `FEATURE:`.
|
Use uppercase short labels for scanability: `API:`, `DOCKER:`, `INFRA:`, `SECURITY:`, `ENV:`, `CONFIG:`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -223,9 +219,7 @@ Update `README.md` (or the relevant doc file) if the PR includes any of the foll
|
||||||
| `docs/API_REFERENCE.md` | Complete gRPC API contract and examples |
|
| `docs/API_REFERENCE.md` | Complete gRPC API contract and examples |
|
||||||
| `docs/RUNBOOK.md` | Operational playbooks and incident response |
|
| `docs/RUNBOOK.md` | Operational playbooks and incident response |
|
||||||
| `docs/AVAP_CHUNKER_CONFIG.md` | `avap_config.json` reference — blocks, statements, semantic tags |
|
| `docs/AVAP_CHUNKER_CONFIG.md` | `avap_config.json` reference — blocks, statements, semantic tags |
|
||||||
| `docs/ADR/` | Architecture Decision Records |
|
| `docs/adr/` | Architecture Decision Records |
|
||||||
| `docs/product/` | Product Requirements Documents |
|
|
||||||
| `research/` | Experiment results, benchmarks, datasets |
|
|
||||||
|
|
||||||
> **PRs that change user-facing behavior or setup without updating documentation will be rejected.**
|
> **PRs that change user-facing behavior or setup without updating documentation will be rejected.**
|
||||||
|
|
||||||
|
|
@ -239,7 +233,7 @@ Architecture Decision Records document **significant technical decisions** — c
|
||||||
|
|
||||||
Write an ADR when a PR introduces or changes:
|
Write an ADR when a PR introduces or changes:
|
||||||
|
|
||||||
- A fundamental technology choice (communication protocol, storage backend, framework, model)
|
- A fundamental technology choice (communication protocol, storage backend, framework)
|
||||||
- A design pattern that other components will follow
|
- A design pattern that other components will follow
|
||||||
- A deliberate trade-off with known consequences
|
- A deliberate trade-off with known consequences
|
||||||
- A decision that future engineers might otherwise reverse without understanding the rationale
|
- A decision that future engineers might otherwise reverse without understanding the rationale
|
||||||
|
|
@ -250,11 +244,10 @@ Write an ADR when a PR introduces or changes:
|
||||||
- Bug fixes
|
- Bug fixes
|
||||||
- Dependency version bumps
|
- Dependency version bumps
|
||||||
- Configuration changes
|
- Configuration changes
|
||||||
- New user-facing features (use a PRD instead)
|
|
||||||
|
|
||||||
### ADR format
|
### ADR format
|
||||||
|
|
||||||
ADRs live in `docs/ADR/` and follow this naming convention:
|
ADRs live in `docs/adr/` and follow this naming convention:
|
||||||
|
|
||||||
```
|
```
|
||||||
ADR-XXXX-short-title.md
|
ADR-XXXX-short-title.md
|
||||||
|
|
@ -268,7 +261,7 @@ Each ADR must contain:
|
||||||
# ADR-XXXX: Title
|
# ADR-XXXX: Title
|
||||||
|
|
||||||
**Date:** YYYY-MM-DD
|
**Date:** YYYY-MM-DD
|
||||||
**Status:** Proposed | Under Evaluation | Accepted | Deprecated | Superseded by ADR-YYYY
|
**Status:** Proposed | Accepted | Deprecated | Superseded by ADR-YYYY
|
||||||
**Deciders:** Names or roles
|
**Deciders:** Names or roles
|
||||||
|
|
||||||
## Context
|
## Context
|
||||||
|
|
@ -288,106 +281,14 @@ What are the positive and negative results of this decision?
|
||||||
|
|
||||||
| ADR | Title | Status |
|
| ADR | Title | Status |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| [ADR-0001](docs/ADR/ADR-0001-grpc-primary-interface.md) | gRPC as the Primary Communication Interface | Accepted |
|
| [ADR-0001](docs/adr/ADR-0001-grpc-primary-interface.md) | gRPC as the Primary Communication Interface | Accepted |
|
||||||
| [ADR-0002](docs/ADR/ADR-0002-two-phase-streaming.md) | Two-Phase Streaming Design for AskAgentStream | Accepted |
|
| [ADR-0002](docs/adr/ADR-0002-two-phase-streaming.md) | Two-Phase Streaming Design for AskAgentStream | Accepted |
|
||||||
| [ADR-0003](docs/ADR/ADR-0003-hybrid-retrieval-rrf.md) | Hybrid Retrieval (BM25 + kNN) with RRF Fusion | Accepted |
|
| [ADR-0003](docs/adr/ADR-0003-hybrid-retrieval-rrf.md) | Hybrid Retrieval (BM25 + kNN) with RRF Fusion | Accepted |
|
||||||
| [ADR-0004](docs/ADR/ADR-0004-claude-eval-judge.md) | Claude as the RAGAS Evaluation Judge | Accepted |
|
| [ADR-0004](docs/adr/ADR-0004-claude-eval-judge.md) | Claude as the RAGAS Evaluation Judge | Accepted |
|
||||||
| [ADR-0005](docs/ADR/ADR-0005-embedding-model-selection.md) | Embedding Model Selection — BGE-M3 vs Qwen3-Embedding-0.6B | Under Evaluation |
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Product Requirements Documents (PRDs)
|
## 10. Incident & Blockage Reporting
|
||||||
|
|
||||||
Product Requirements Documents capture **user-facing features** — what is being built, why it is needed, and how it will be validated. Every feature that modifies the public API, the gRPC contract, or the user experience of any client (VS Code extension, OpenAI-compatible proxy, etc.) requires a PRD before implementation begins.
|
|
||||||
|
|
||||||
### When to write a PRD
|
|
||||||
|
|
||||||
Write a PRD when a PR introduces or changes:
|
|
||||||
|
|
||||||
- A new capability visible to any external consumer (extension, API client, proxy)
|
|
||||||
- A change to the gRPC contract (`brunix.proto`)
|
|
||||||
- A change to the HTTP proxy endpoints or behavior
|
|
||||||
- A feature requested by product or business stakeholders
|
|
||||||
|
|
||||||
### When NOT to write a PRD
|
|
||||||
|
|
||||||
- Internal architectural changes (use an ADR instead)
|
|
||||||
- Bug fixes with no change in user-visible behavior
|
|
||||||
- Infrastructure or tooling changes
|
|
||||||
|
|
||||||
### PRD format
|
|
||||||
|
|
||||||
PRDs live in `docs/product/` and follow this naming convention:
|
|
||||||
|
|
||||||
```
|
|
||||||
PRD-XXXX-short-title.md
|
|
||||||
```
|
|
||||||
|
|
||||||
Each PRD must contain:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
# PRD-XXXX: Title
|
|
||||||
|
|
||||||
**Date:** YYYY-MM-DD
|
|
||||||
**Status:** Proposed | Implemented
|
|
||||||
**Requested by:** Name / role
|
|
||||||
**Related ADR:** ADR-XXXX (if applicable)
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
What user or business problem does this solve?
|
|
||||||
|
|
||||||
## Solution
|
|
||||||
What are we building?
|
|
||||||
|
|
||||||
## Scope
|
|
||||||
What is in scope and explicitly out of scope?
|
|
||||||
|
|
||||||
## Technical design
|
|
||||||
Key implementation decisions.
|
|
||||||
|
|
||||||
## Validation
|
|
||||||
How do we know this works? Acceptance criteria.
|
|
||||||
|
|
||||||
## Impact on parallel workstreams
|
|
||||||
Does this affect any ongoing experiment or evaluation?
|
|
||||||
```
|
|
||||||
|
|
||||||
### Existing PRDs
|
|
||||||
|
|
||||||
| PRD | Title | Status |
|
|
||||||
|---|---|---|
|
|
||||||
| [PRD-0001](docs/product/PRD-0001-openai-compatible-proxy.md) | OpenAI-Compatible HTTP Proxy | Implemented |
|
|
||||||
| [PRD-0002](docs/product/PRD-0002-editor-context-injection.md) | Editor Context Injection for VS Code Extension | Proposed |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 11. Research & Experiments Policy
|
|
||||||
|
|
||||||
All scientific experiments, benchmark results, and dataset evaluations conducted by the research team must be documented and committed to the repository under `research/`.
|
|
||||||
|
|
||||||
### Rules
|
|
||||||
|
|
||||||
- Every experiment must have a corresponding result file in `research/` before any engineering decision based on that experiment is considered valid.
|
|
||||||
- Benchmark scripts, evaluation notebooks, and raw results must be committed alongside a summary README that explains the methodology, datasets used, metrics, and conclusions.
|
|
||||||
- Experiments that inform an ADR must be referenced from that ADR with a direct path to the result files.
|
|
||||||
- The golden dataset used by `EvaluateRAG` (`Docker/src/golden_dataset.json`) is a production artifact. Any modification requires explicit approval from the CTO and a new baseline EvaluateRAG run before the change is merged.
|
|
||||||
|
|
||||||
### Directory structure
|
|
||||||
|
|
||||||
```
|
|
||||||
research/
|
|
||||||
embeddings/ ← embedding model benchmarks (BEIR, MTEB)
|
|
||||||
experiments/ ← RAG architecture experiments
|
|
||||||
datasets/ ← synthetic datasets and golden datasets
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why this matters
|
|
||||||
|
|
||||||
An engineering decision based on an experiment that is not reproducible, not committed, or not peer-reviewable has no scientific validity. All decisions with impact on the production system must be traceable to documented, committed evidence.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 12. Incident & Blockage Reporting
|
|
||||||
|
|
||||||
If you encounter a technical blockage (connection timeouts, service downtime, tunnel failures):
|
If you encounter a technical blockage (connection timeouts, service downtime, tunnel failures):
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,28 +18,8 @@ service AssistanceEngine {
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
message AgentRequest {
|
message AgentRequest {
|
||||||
// ── Core fields (v1) ──────────────────────────────────────────────────────
|
|
||||||
string query = 1;
|
string query = 1;
|
||||||
string session_id = 2;
|
string session_id = 2;
|
||||||
|
|
||||||
// ── Editor context fields (v2 — PRD-0002) ────────────────────────────────
|
|
||||||
// All three fields are optional. Clients that do not send them default to
|
|
||||||
// empty string. Existing clients remain fully compatible without changes.
|
|
||||||
|
|
||||||
// Full content of the active file open in the editor at query time.
|
|
||||||
// Gives the assistant awareness of the complete code the user is working on.
|
|
||||||
string editor_content = 3;
|
|
||||||
|
|
||||||
// Text currently selected in the editor, if any.
|
|
||||||
// Most precise signal of user intent — if non-empty, the question almost
|
|
||||||
// certainly refers to this specific code block.
|
|
||||||
string selected_text = 4;
|
|
||||||
|
|
||||||
// Free-form additional context (e.g. file path, language identifier,
|
|
||||||
// open diagnostic errors). Extensible without requiring future proto changes.
|
|
||||||
string extra_context = 5;
|
|
||||||
|
|
||||||
string user_info = 6;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
message AgentResponse {
|
message AgentResponse {
|
||||||
|
|
|
||||||
|
|
@ -11,29 +11,12 @@ from ragas.llms import LangchainLLMWrapper
|
||||||
from ragas.embeddings import LangchainEmbeddingsWrapper
|
from ragas.embeddings import LangchainEmbeddingsWrapper
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
from langchain_anthropic import ChatAnthropic
|
from langchain_anthropic import ChatAnthropic
|
||||||
from ragas.run_config import RunConfig
|
|
||||||
import asyncio
|
|
||||||
import time
|
|
||||||
|
|
||||||
class RateLimitedChatAnthropic(ChatAnthropic):
|
|
||||||
|
|
||||||
# Ragas usa principalmente este método asíncrono internamente
|
|
||||||
async def _agenerate(self, messages, stop=None, run_manager=None, **kwargs):
|
|
||||||
await asyncio.sleep(3.0) # <-- PON AQUÍ LA PAUSA EN SEGUNDOS (ej: 3 segundos)
|
|
||||||
return await super()._agenerate(messages, stop=stop, run_manager=run_manager, **kwargs)
|
|
||||||
|
|
||||||
# Añadimos la pausa síncrona también por seguridad
|
|
||||||
def _generate(self, messages, stop=None, run_manager=None, **kwargs):
|
|
||||||
time.sleep(3.0) # <-- MISMA PAUSA AQUÍ
|
|
||||||
return super()._generate(messages, stop=stop, run_manager=run_manager, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json"
|
GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json"
|
||||||
CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
|
CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
|
||||||
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "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")
|
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
|
||||||
K_RETRIEVE = 5
|
K_RETRIEVE = 5
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -155,7 +138,7 @@ def run_evaluation( es_client, llm, embeddings, index_name, category = None, lim
|
||||||
|
|
||||||
logger.info(f"[eval] makind: {len(questions)} questions, index={index_name}")
|
logger.info(f"[eval] makind: {len(questions)} questions, index={index_name}")
|
||||||
|
|
||||||
claude_judge = RateLimitedChatAnthropic(
|
claude_judge = ChatAnthropic(
|
||||||
model=CLAUDE_MODEL,
|
model=CLAUDE_MODEL,
|
||||||
api_key=ANTHROPIC_API_KEY,
|
api_key=ANTHROPIC_API_KEY,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
|
|
@ -195,7 +178,7 @@ def run_evaluation( es_client, llm, embeddings, index_name, category = None, lim
|
||||||
"answer_preview": answer[:300],
|
"answer_preview": answer[:300],
|
||||||
"n_chunks": len(contexts),
|
"n_chunks": len(contexts),
|
||||||
})
|
})
|
||||||
time.sleep(2.5)
|
|
||||||
if not rows["question"]:
|
if not rows["question"]:
|
||||||
return {"error": "NO SAMPLES GENETARED"}
|
return {"error": "NO SAMPLES GENETARED"}
|
||||||
|
|
||||||
|
|
@ -209,33 +192,16 @@ def run_evaluation( es_client, llm, embeddings, index_name, category = None, lim
|
||||||
if hasattr(metric, "embeddings"):
|
if hasattr(metric, "embeddings"):
|
||||||
metric.embeddings = ragas_emb
|
metric.embeddings = ragas_emb
|
||||||
|
|
||||||
time.sleep(5)
|
|
||||||
logger.info("[eval] JUDGING BY CLAUDE...")
|
logger.info("[eval] JUDGING BY CLAUDE...")
|
||||||
run_config = RunConfig(max_workers=1)
|
result = ragas_evaluate(dataset, metrics=metrics)
|
||||||
result = ragas_evaluate(dataset, metrics=metrics, run_config=run_config)
|
|
||||||
|
|
||||||
elapsed = time.time() - t_start
|
elapsed = time.time() - t_start
|
||||||
|
|
||||||
# RAGAS >= 0.2 returns an EvaluationResult object, not a dict.
|
|
||||||
# Extract per-metric means from the underlying DataFrame.
|
|
||||||
try:
|
|
||||||
df = result.to_pandas()
|
|
||||||
def _mean(col):
|
|
||||||
return round(float(df[col].dropna().mean()), 4) if col in df.columns else 0.0
|
|
||||||
except Exception:
|
|
||||||
# Fallback: try legacy dict-style access
|
|
||||||
df = None
|
|
||||||
def _mean(col):
|
|
||||||
try:
|
|
||||||
return round(float(result[col]), 4)
|
|
||||||
except Exception:
|
|
||||||
return 0.0
|
|
||||||
|
|
||||||
scores = {
|
scores = {
|
||||||
"faithfulness": _mean("faithfulness"),
|
"faithfulness": round(float(result.get("faithfulness", 0)), 4),
|
||||||
"answer_relevancy": _mean("answer_relevancy"),
|
"answer_relevancy": round(float(result.get("answer_relevancy", 0)), 4),
|
||||||
"context_recall": _mean("context_recall"),
|
"context_recall": round(float(result.get("context_recall", 0)), 4),
|
||||||
"context_precision": _mean("context_precision"),
|
"context_precision": round(float(result.get("context_precision", 0)), 4),
|
||||||
}
|
}
|
||||||
|
|
||||||
valid_scores = [v for v in scores.values() if v > 0]
|
valid_scores = [v for v in scores.values() if v > 0]
|
||||||
|
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": "GD-001",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is AVAP and what is it designed for?",
|
|
||||||
"ground_truth": "AVAP (Advanced Virtual API Programming) is a Turing-complete Domain-Specific Language (DSL) architecturally designed for the secure, concurrent, and deterministic orchestration of microservices and HTTP I/O. It is not a general-purpose language; its hybrid engine and strict grammar are optimized for fast processing of HTTP transactions, in-memory data manipulation, and interaction with external connectors. AVAP does not have internal print commands — all data output is performed through the HTTP interface using commands like addResult()."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-002",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP handle conditional logic? What commands are used and how are blocks closed?",
|
|
||||||
"ground_truth": "AVAP uses a mixed structural grammar for conditional logic, combining keyword fluidity with strict mathematical closures. The if() / else() / end() structure evaluates a logical or comparison expression. Every conditional block requires a mandatory end() closing statement. The if() command compares two values using a comparator operator (e.g., '==', '!=', '>', '<', '>=', '<='). An optional else() block handles the false branch. Example: if(saldo, 0, \">\") executes the true branch when the variable 'saldo' is greater than zero, otherwise the else() block runs, and end() closes the structure."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-003",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'password' parameter, generates a SHA-256 hash of it, and returns the hash.",
|
|
||||||
"ground_truth": "The following AVAP script reads a 'password' query parameter, hashes it using SHA-256 via encodeSHA256(), and exposes the result via addResult():\n\naddParam(\"password\", password)\nencodeSHA256(password, hashed_password)\naddResult(hashed_password)\n\nKey commands used:\n- addParam(\"password\", password): reads the 'password' HTTP parameter into the variable 'password'.\n- encodeSHA256(password, hashed_password): computes the SHA-256 hash of the input and stores the 64-character hex digest in 'hashed_password'.\n- addResult(hashed_password): adds 'hashed_password' to the HTTP JSON response body."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-004",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Show an AVAP script that loops from 1 to 5, builds a JSON object with each iteration index as a key, and returns it.",
|
|
||||||
"ground_truth": "The following AVAP script iterates from 1 to 5 using startLoop/endLoop, dynamically builds a JSON object using AddvariableToJSON() on each iteration, and returns the result:\n\naddVar(mi_json, \"{}\")\nstartLoop(i, 1, 5)\n item = \"item_%s\" % i\n AddvariableToJSON(item, \"valor_generado\", mi_json)\nendLoop()\naddResult(mi_json)\n\nKey commands used:\n- addVar(mi_json, \"{}\"): initializes an empty JSON object.\n- startLoop(i, 1, 5) / endLoop(): iterates the variable 'i' from 1 to 5 inclusive.\n- AddvariableToJSON(item, \"valor_generado\", mi_json): inserts each generated key-value pair into the JSON object.\n- addResult(mi_json): exposes the final JSON in the HTTP response."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-005",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP support external HTTP calls? What commands are available and how is timeout handled?",
|
|
||||||
"ground_truth": "AVAP provides two commands for making external HTTP calls: RequestPost and RequestGet. To avoid blocking threads due to network latency, AVAP requires a mandatory timeout parameter (in milliseconds) for both commands. If the timeout is exceeded, the destination variable receives None. RequestPost(url, querystring, headers, body, destino, timeout) executes an HTTP POST and stores the response in 'destino'. RequestGet(url, querystring, headers, destino, timeout) executes an HTTP GET similarly. Both commands are part of AVAP's Section V (Third-Party Connectors and External HTTP Requests) and allow calling external APIs without additional drivers."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
@ -1,302 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": "GD-R-001",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is AVAP and what is it designed for?",
|
|
||||||
"ground_truth": "AVAP is a Turing-complete Domain-Specific Language (DSL) architecturally designed for the secure, concurrent, and deterministic orchestration of microservices and HTTP I/O. It is not a general-purpose language. Its hybrid engine and strict grammar are optimized for fast HTTP transaction processing, in-memory data manipulation, and interaction with external connectors. AVAP has no internal print commands — all data output is performed through the HTTP interface using addResult()."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-002",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does the if() conditional block work in AVAP? How are blocks closed?",
|
|
||||||
"ground_truth": "AVAP uses if() / else() / end() for conditional logic. The if() command evaluates a comparison between two values using a comparator operator (==, !=, >, <, >=, <=, in). Every conditional block must be closed with end(). The else() block is optional and handles the false branch. Example: if(saldo, 0, \">\") executes the true branch when saldo is greater than zero, otherwise the else() block runs, and end() closes the structure. AVAP also supports a mode 2 where a full Python-style expression is passed as a string: if(None, None, \"user_type == 'VIP' or compras > 100\")."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-003",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP handle external HTTP calls? What commands are available and how is timeout managed?",
|
|
||||||
"ground_truth": "AVAP provides RequestGet and RequestPost for external HTTP calls. To avoid blocking threads due to network latency, AVAP requires a mandatory timeout parameter in milliseconds. If the timeout is exceeded, the destination variable receives None. RequestPost(url, querystring, headers, body, destino, timeout) executes an HTTP POST storing the response in destino. RequestGet(url, querystring, headers, destino, timeout) executes an HTTP GET. Both commands allow calling external APIs without additional drivers."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-004",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How do functions work in AVAP? What is the scope of variables inside a function?",
|
|
||||||
"ground_truth": "Functions in AVAP are hermetic memory enclosures. When entering a function, AVAP creates a new dictionary of local variables isolated from the global context. The return() command acts as a flow switch: it injects the calculated value to the caller and releases local memory. If used inside a startLoop, it also breaks the iteration. Variables declared inside a function are only visible within that function — they are not accessible from the main flow or other functions. AVAP has three scope types: Global Scope, Main Local Scope, and Function Scope."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-005",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What are the three types of variable scopes in AVAP and what are their visibility rules?",
|
|
||||||
"ground_truth": "AVAP uses three scope types: Global Scope contains globally declared variables, accessible from anywhere in the program and persists for the entire interpreter process lifetime. Main Local Scope contains variables declared in the main flow — accessible within the main flow but not from functions or goroutines, and disappears when script execution ends. Function Scope is created independently for each function invocation and contains function parameters and locally created variables — only visible within that function, not from outside, and is destroyed when the function terminates. If a variable does not exist in the visible scopes, the engine produces a runtime error."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-006",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does concurrency work in AVAP? What are goroutines and how are they launched?",
|
|
||||||
"ground_truth": "AVAP implements an advanced system based on lightweight threads (goroutines), allowing the server to process long I/O operations without blocking the main thread. The go command launches a goroutine: identifier = go function_name(parameters). It creates a new isolated execution context and returns a unique identifier. Goroutines follow the same scope rules as normal functions — they can access Global Scope and their own Function Scope, but cannot access the Main Local Scope. The gather command is used to collect results from goroutines."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-007",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is the addParam command and how does it capture HTTP request parameters?",
|
|
||||||
"ground_truth": "addParam captures input parameters from HTTP requests (URL query parameters, request body, or form data) and assigns them to a variable. Syntax: addParam(\"paramName\", targetVar). It reads the value of paramName from the incoming HTTP request and stores it in targetVar. If the parameter is not present in the request, the variable receives None. It is the primary mechanism for reading external input in AVAP since the language has no direct access to the request object."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-008",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does the startLoop / endLoop construct work in AVAP?",
|
|
||||||
"ground_truth": "startLoop and endLoop define iteration blocks in AVAP. Syntax: startLoop(varName, from, to) where varName is the loop counter, from is the start value, and to is the end value inclusive. The loop counter increments by 1 on each iteration. endLoop() closes the block. Example: startLoop(i, 1, 10) iterates i from 1 to 10. Variables modified inside the loop are accessible after endLoop. To exit a loop early, you can set the counter variable beyond the end value (e.g. i = 11 inside a loop that goes to 10)."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-009",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is the addResult command and how does it build the HTTP response?",
|
|
||||||
"ground_truth": "addResult adds a variable to the HTTP JSON response body. Syntax: addResult(varName). Each call to addResult adds one key-value pair to the response object where the key is the variable name and the value is its current value. AVAP has no internal print commands — addResult is the only way to expose data to the caller. Multiple addResult calls build up a JSON object with multiple fields. The HTTP status code is set separately via the _status variable."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-010",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does error handling work in AVAP with try() and exception()?",
|
|
||||||
"ground_truth": "AVAP uses try() / exception() / end() for error handling. The try() block wraps code that may fail. If an exception occurs inside the try block, execution jumps to the exception() block instead of halting. exception(errorVar) captures the error message into errorVar. The end() command closes the structure. Without a try block, any unhandled exception stops script execution and returns a 400 error. With a try block, you can handle the error gracefully — for example by setting _status to 500 and returning a structured error message."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-011",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is the replace() command in AVAP and how is it used?",
|
|
||||||
"ground_truth": "The replace() command performs string substitution in AVAP. Syntax: replace(sourceString, searchValue, replaceValue, targetVar). It replaces all occurrences of searchValue in sourceString with replaceValue and stores the result in targetVar. Example: replace(\"REF_1234_OLD\", \"OLD\", \"NEW\", ref_actualizada) stores \"REF_1234_NEW\" in ref_actualizada. The source can be a literal string or a variable name. The command does not modify the original variable — it always writes to targetVar."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-012",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What are the reserved keywords in AVAP that cannot be used as identifiers?",
|
|
||||||
"ground_truth": "AVAP has the following reserved keywords that cannot be used as variable or function names: Control flow — if, else, end, startLoop, endLoop, try, exception, return. Function declaration — function. Concurrency — go, gather. Modularity — include, import. Logical operators — and, or, not, in, is. Literals — True, False, None. Using any of these as an identifier will cause a lexer or parser error."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-013",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP handle string formatting and concatenation?",
|
|
||||||
"ground_truth": "AVAP supports two main string operations. Concatenation uses the + operator: result = \"Hello, \" + name produces a concatenated string. String formatting uses Python-style % operator: log = \"Evento registrado por: %s\" % nombre substitutes the variable value into the format string. Strings support single and double quotes. Escape sequences supported include \\n (newline), \\t (tab), \\r (carriage return), \\\" (double quote), \\' (single quote), and \\\\ (backslash). Note that \\n inside a string is a data character, not a statement terminator — the physical EOL is the only statement terminator in AVAP."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-014",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does the encodeSHA256 command work in AVAP?",
|
|
||||||
"ground_truth": "encodeSHA256 computes the SHA-256 hash of an input value and stores the result in a destination variable. Syntax: encodeSHA256(inputValue, destVar). The result is a 64-character lowercase hexadecimal string representing the SHA-256 digest. Example: encodeSHA256(\"payload_data\", checksum) stores the hash of the string \"payload_data\" into the variable checksum. The input can be a string literal or a variable. It is commonly used for integrity verification, password hashing, and generating checksums."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-015",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP handle date and time operations?",
|
|
||||||
"ground_truth": "AVAP provides two date/time commands. getDateTime(format, offsetSeconds, timezone, destVar) gets the current date/time, optionally applying an offset in seconds and converting to the specified timezone. Example: getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date) stores the current Madrid time formatted for SQL. getDateTime(\"\", 86400, \"UTC\", expira) gets the current UTC time plus 86400 seconds (1 day ahead), useful for expiration timestamps. stampToDatetime(unixTimestamp, format, offset, destVar) converts a Unix timestamp to a human-readable string. Example: stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-016",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is the AddvariableToJSON command and how is it used to build JSON objects?",
|
|
||||||
"ground_truth": "AddvariableToJSON inserts a key-value pair into an existing JSON object variable. Syntax: AddvariableToJSON(key, value, jsonVar). The key can be a string literal or a variable. The value can be a string, number, or variable. The jsonVar must be an already-declared variable typically initialized as \"{}\" via addVar. Example: addVar(mi_json, \"{}\") then AddvariableToJSON(\"status\", \"ok\", mi_json) adds the key \"status\" with value \"ok\" to mi_json. It is commonly used inside loops to build dynamic JSON objects iteratively."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-017",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does the getListLen command work and what is it used for?",
|
|
||||||
"ground_truth": "getListLen retrieves the length of a list variable and stores it in a destination variable. Syntax: getListLen(listVar, destVar). Example: getListLen(registros, total) stores the number of elements in registros into total. It is commonly used before a startLoop to set the upper bound of iteration, enabling dynamic loops that adapt to the actual size of the data. Example pattern: getListLen(mi_lista, cantidad) followed by startLoop(i, 0, cantidad) to iterate over all elements."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-018",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does the randomString command work in AVAP?",
|
|
||||||
"ground_truth": "randomString generates a random string of a specified length using a character pattern. Syntax: randomString(pattern, length, destVar). The pattern is a regex-style character class defining which characters to use. Example: randomString(\"[A-Z]\\d\", 32, token_seguridad) generates a 32-character random string using uppercase letters and digits. Another example: randomString(\"[a-zA-Z0-9]\", 16, token) generates a 16-character alphanumeric token. It is commonly used for generating secure tokens, session identifiers, and temporary passwords."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-019",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "What is the $ dereference operator in AVAP and when is it used?",
|
|
||||||
"ground_truth": "The $ operator in AVAP is the dereference operator, used to access the value of a variable by reference at assignment time. Syntax: addVar(copia, $original) copies the current value of original into copia. The token is defined as DEREF in the lexer. It is used when you need to capture the current value of a variable into another variable, particularly useful when a variable may change later and you need to preserve its value at a specific point in execution."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-R-020",
|
|
||||||
"category": "RETRIEVAL",
|
|
||||||
"question": "How does AVAP handle ORM database operations? What commands are available?",
|
|
||||||
"ground_truth": "AVAP provides native ORM commands for database operations without requiring additional drivers. ormCheckTable(tableName, resultVar) checks if a table exists storing True or False in resultVar. ormCreateTable(columns, types, tableName, resultVar) creates a new table with the specified column names and types. ormDirect(query, resultVar) executes a raw SQL query directly. ormAccessSelect executes SELECT queries and ormAccessInsert executes INSERT operations. avapConnector is used to initialize the database connection. The connector and ORM commands are distinguished only by context — the UUID passed as argument determines whether the adapter resolves as a database ORM or a third-party service proxy."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-001",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'name' parameter and returns a personalized greeting.",
|
|
||||||
"ground_truth": "The following AVAP script reads a name parameter and returns a personalized greeting:\n\naddParam(\"name\", name)\nresult = \"Hello, \" + name\naddResult(result)\n\nKey commands: addParam reads the HTTP parameter 'name' into variable name. The + operator concatenates the greeting string with the name. addResult exposes result in the JSON response."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-002",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'password' parameter, generates a SHA-256 hash, and returns it.",
|
|
||||||
"ground_truth": "The following AVAP script hashes a password parameter using SHA-256:\n\naddParam(\"password\", password)\nencodeSHA256(password, hashed_password)\naddResult(hashed_password)\n\nKey commands: addParam reads the 'password' HTTP parameter. encodeSHA256 computes the SHA-256 hash and stores the 64-character hex digest in hashed_password. addResult exposes the hash in the JSON response."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-003",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that loops from 1 to 5, builds a JSON object with each index as a key, and returns it.",
|
|
||||||
"ground_truth": "The following AVAP script builds a JSON object iteratively:\n\naddVar(mi_json, \"{}\")\nstartLoop(i, 1, 5)\n item = \"item_%s\" % i\n AddvariableToJSON(item, \"valor_generado\", mi_json)\nendLoop()\naddResult(mi_json)\n\nKey commands: addVar initializes an empty JSON object. startLoop iterates i from 1 to 5 inclusive. The % operator formats the key name dynamically. AddvariableToJSON inserts each key-value pair into mi_json. addResult exposes the final object."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-004",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that validates if a 'role' parameter belongs to a list of allowed roles and returns the access result.",
|
|
||||||
"ground_truth": "The following AVAP script validates role membership:\n\naddParam(\"rol\", r)\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\n acceso = True\nelse()\n acceso = False\nend()\naddResult(acceso)\n\nKey commands: addParam reads the 'rol' parameter. The if() with \"in\" comparator checks list membership directly against a list literal. else() handles the false branch. end() closes the conditional block. addResult exposes the boolean result."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-005",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that makes a GET request to an external API and handles connection errors.",
|
|
||||||
"ground_truth": "The following AVAP script performs a GET request with error handling:\n\ntry()\n  RequestGet(\"https://api.test.com/data\", 0, 0, respuesta, 5000)\nexception(e)\n  addVar(error_trace, \"Fallo de conexion: %s\" % e)\n  addResult(error_trace)\nend()\naddResult(respuesta)\n\nKey commands: try() wraps the potentially failing operation. RequestGet fetches the URL with a mandatory timeout of 5000 milliseconds, storing the response in respuesta (or None if the timeout is exceeded). exception(e) captures any error message. The % operator formats the error string. addResult exposes either the response or the error."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-006",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP function that takes two numbers and returns their sum, then call it and return the result.",
|
|
||||||
"ground_truth": "The following AVAP script defines and calls a sum function:\n\nfunction suma(a, b){\n total = a + b\n return(total)\n}\nresultado = suma(10, 20)\naddResult(resultado)\n\nKey commands: function declares a named function with parameters a and b. The + operator adds the values. return() sends the result back to the caller and releases the function scope. The function is called with literal values 10 and 20. addResult exposes the result."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-007",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'subtotal' parameter, computes 21% VAT, and returns the total.",
|
|
||||||
"ground_truth": "The following AVAP script calculates the total with VAT:\n\naddParam(\"subtotal\", subtotal)\niva = subtotal * 0.21\ntotal = subtotal + iva\naddResult(total)\n\nKey commands: addParam reads the subtotal from the HTTP request. The * operator multiplies by the tax rate 0.21. The + operator adds subtotal and iva. addResult exposes the final total. AVAP supports float arithmetic natively."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-008",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads an 'api_key' parameter and returns status 403 if it is null.",
|
|
||||||
"ground_truth": "The following AVAP script validates that an API key is present:\n\naddParam(\"api_key\", key)\nif(key, None, \"==\")\n addVar(_status, 403)\n addVar(error, \"Acceso denegado: falta API KEY\")\n addResult(error)\nend()\n\nKey commands: addParam reads the api_key parameter — if not present it will be None. The if() with \"==\" and None checks for null. addVar sets _status to 403 which becomes the HTTP response code. addResult exposes the error message."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-009",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that generates a 32-character random alphanumeric token and returns it.",
|
|
||||||
"ground_truth": "The following AVAP script generates a secure random token:\n\nrandomString(\"[a-zA-Z0-9]\", 32, token_seguridad)\naddResult(token_seguridad)\n\nKey commands: randomString generates a random string using the character class [a-zA-Z0-9] at length 32 and stores it in token_seguridad. addResult exposes the token in the HTTP response."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-010",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'lang' parameter and returns 'Hola' if it is 'es' or 'Hello' if it is 'en'.",
|
|
||||||
"ground_truth": "The following AVAP script returns a greeting based on language:\n\naddParam(\"lang\", l)\nif(l, \"es\", \"=\")\n addVar(msg, \"Hola\")\nelse()\n addVar(msg, \"Hello\")\nend()\naddResult(msg)\n\nKey commands: addParam reads the lang parameter into l. The if() with \"=\" comparator checks string equality. else() handles all other cases. addVar sets the message. addResult exposes the localized greeting."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-011",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that checks if a database table exists and creates it if it does not.",
|
|
||||||
"ground_truth": "The following AVAP script checks and creates a database table:\n\normCheckTable(tabla_pruebas, resultado_comprobacion)\nif(resultado_comprobacion, False, \"==\")\n ormCreateTable(\"username,age\", \"VARCHAR,INTEGER\", tabla_pruebas, resultado_creacion)\nend()\naddResult(resultado_comprobacion)\naddResult(resultado_creacion)\n\nKey commands: ormCheckTable checks if the table exists storing True or False. The if() block only executes if the check returned False. ormCreateTable creates the table with the specified columns and types. Both results are exposed via addResult."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-012",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that gets the current UTC timestamp and adds 24 hours to compute an expiration time.",
|
|
||||||
"ground_truth": "The following AVAP script computes an expiration timestamp 24 hours from now:\n\ngetDateTime(\"\", 86400, \"UTC\", expira)\naddResult(expira)\n\nKey commands: getDateTime with an empty format string returns a raw timestamp. The second parameter 86400 is the offset in seconds (60 * 60 * 24 = 86400 = 1 day). The timezone is set to UTC. The result is stored in expira and exposed via addResult."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-013",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that receives a new password parameter, validates it is not equal to the old password, and returns a confirmation.",
|
|
||||||
"ground_truth": "The following AVAP script validates a password change:\n\naddParam(\"password\", pass_nueva)\npass_antigua = \"password\"\nif(pass_nueva, pass_antigua, \"!=\")\n addVar(cambio, \"Contrasena actualizada\")\nend()\naddResult(cambio)\n\nKey commands: addParam reads the new password. The old password is assigned as a literal. The if() with \"!=\" comparator checks inequality. addVar sets the confirmation message only if passwords differ. addResult exposes the message."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-014",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a list parameter and returns its element count.",
|
|
||||||
"ground_truth": "The following AVAP script reads a list parameter and returns its length:\n\naddParam(\"data_list\", mi_lista)\ngetListLen(mi_lista, cantidad)\naddResult(cantidad)\n\nKey commands: addParam reads the list from the HTTP request into mi_lista. getListLen computes the number of elements and stores it in cantidad. addResult exposes the count in the JSON response."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-015",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that uses a validation function to check a token parameter and returns the authorization result.",
|
|
||||||
"ground_truth": "The following AVAP script uses a function to validate a token:\n\nfunction es_valido(token){\n response = False\n if(token, \"SECRET\", \"=\")\n response = True\n end()\n return(response)\n}\naddParam(\"token\", t)\nautorizado = es_valido(t)\naddResult(autorizado)\n\nKey commands: function defines es_valido with a token parameter. response is initialized to False. The if() with \"=\" checks against the expected secret. return() sends the boolean back to the caller. addResult exposes the authorization result."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-016",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that returns two values in the HTTP response: a status code 200 and a message 'Success'.",
|
|
||||||
"ground_truth": "The following AVAP script returns multiple values in the HTTP response:\n\naddVar(_status, 200)\naddVar(status, \"Success\")\naddResult(status)\n\nOr returning both as JSON fields:\n\naddVar(code, 200)\naddVar(status, \"Success\")\naddResult(code)\naddResult(status)\n\nKey commands: _status is the special variable that sets the HTTP response status code. Multiple addResult calls build a JSON object with multiple fields in the response body."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-017",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that reads a 'saldo' parameter and returns True if it is greater than zero, False otherwise.",
|
|
||||||
"ground_truth": "The following AVAP script checks if a balance is positive:\n\naddParam(\"saldo\", saldo)\nif(saldo, 0, \">\")\n permitir = True\nelse()\n permitir = False\nend()\naddResult(permitir)\n\nKey commands: addParam reads the saldo parameter. The if() with \">\" comparator checks if saldo is greater than 0. else() handles the zero or negative case. end() closes the block. addResult exposes the boolean result."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-018",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that converts a Unix timestamp parameter to a human-readable date in dd/mm/yyyy format.",
|
|
||||||
"ground_truth": "The following AVAP script converts a Unix timestamp to a readable date:\n\naddParam(\"timestamp\", ts)\nstampToDatetime(ts, \"%d/%m/%Y\", 0, fecha_human)\naddResult(fecha_human)\n\nKey commands: addParam reads the timestamp from the HTTP request. stampToDatetime converts the Unix epoch integer to a formatted date string using \"%d/%m/%Y\" which produces day/month/year. The third parameter is a timezone offset in seconds. The result is stored in fecha_human and returned via addResult."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-019",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that replaces all spaces in a string parameter with hyphens and returns the result.",
|
|
||||||
"ground_truth": "The following AVAP script replaces spaces with hyphens:\n\naddParam(\"text\", input_text)\nreplace(input_text, \" \", \"-\", clean_text)\naddResult(clean_text)\n\nKey commands: addParam reads the text parameter. replace() substitutes all occurrences of space with hyphen in input_text and stores the result in clean_text. The original variable is not modified. addResult exposes the transformed string."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-C-020",
|
|
||||||
"category": "CODE_GENERATION",
|
|
||||||
"question": "Write an AVAP script that uses try/exception to execute a raw SQL query and return status 500 on database errors.",
|
|
||||||
"ground_truth": "The following AVAP script executes SQL with error handling:\n\ntry()\n ormDirect(\"UPDATE tabla SET col=1 WHERE id=1\", res)\nexception(e)\n addVar(_status, 500)\n addResult(\"Error de base de datos\")\nend()\naddResult(res)\n\nKey commands: try() wraps the database operation. ormDirect executes raw SQL storing the result in res. exception(e) catches any database error. addVar sets _status to 500 to signal a server error. The final addResult exposes the query result on success."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-001",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "Can you summarize what you just explained about AVAP scopes in fewer words?",
|
|
||||||
"ground_truth": "AVAP has three scopes: Global (visible everywhere, lives for the whole process), Main Local (visible only in the main script flow, not inside functions), and Function (created per function call, destroyed when the function returns). Functions cannot see main flow variables, and the main flow cannot see function-internal variables."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-002",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "You mentioned that addResult builds the JSON response — can you clarify how multiple addResult calls work together?",
|
|
||||||
"ground_truth": "Each addResult call adds one field to the JSON response object. The field name is the variable name passed to addResult and the field value is the current value of that variable. So calling addResult(code) and addResult(status) produces a JSON response like {\"code\": 200, \"status\": \"Success\"}. The fields are added in the order the addResult calls are executed during script execution."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-003",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "What is the difference between addVar and a plain assignment like x = 10 in AVAP?",
|
|
||||||
"ground_truth": "Both addVar and direct assignment declare variables. addVar(varName, value) is the explicit command form — it supports intelligent value resolution checking if the value is an existing variable, a number, or a literal. Direct assignment x = 10 is syntactic sugar that works identically for simple cases. addVar is preferred for declaring new variables with explicit intent, while direct assignment is more natural for updating values or computed expressions."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-004",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "Can you explain again the difference between the two modes of the if() command?",
|
|
||||||
"ground_truth": "Mode 1 is structured comparison: if(variable, value, comparator) — for example if(saldo, 0, \">\") directly compares the variable saldo against 0 using the > operator. Mode 2 is expression mode: if(None, None, \"expression\") — for example if(None, None, \"user_type == 'VIP' or compras > 100\") evaluates a full Python-style boolean expression passed as a string. Mode 2 is more flexible but requires passing None as the first two arguments."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-005",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "What happens if an error occurs in AVAP without a try block?",
|
|
||||||
"ground_truth": "Without a try block, any unhandled exception stops script execution immediately and the server returns a 400 Bad Request error with the error message in the response body. The remaining commands in the script are not executed. With a try block, the error is caught by exception(), the script continues running, and you can handle the error gracefully — for example by setting _status to 500 and returning a structured error message."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-006",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "Can you explain again how the timeout in RequestGet works?",
|
|
||||||
"ground_truth": "The timeout parameter in RequestGet and RequestPost is specified in milliseconds. If the external server does not respond within that time, the request is aborted and the destination variable receives None instead of a response. This prevents the AVAP thread from blocking indefinitely on a slow or unavailable external service. You should always check if the result variable is None after a request to handle timeout cases gracefully."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-007",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "Can I iterate over a list of items in AVAP instead of a numeric range?",
|
|
||||||
"ground_truth": "Yes, but AVAP loops are always numeric — startLoop uses a start and end integer. To iterate over a list, combine getListLen to get the total count, use that count as the loop boundary, and inside the loop use the index variable to access each element. Example: getListLen(mi_lista, total) then startLoop(i, 0, total) with list access inside. Lists are zero-indexed so the index starts at 0."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-008",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "What is the difference between RequestGet and RequestPost in practice?",
|
|
||||||
"ground_truth": "RequestGet sends an HTTP GET request — used for retrieving data, with parameters passed as query string. RequestPost sends an HTTP POST request — used for submitting data, with a body payload that can be JSON or form data. Both require a timeout parameter in milliseconds and store the response in a destination variable. Both return None in the destination variable if the request times out. The key structural difference is that RequestPost includes a body parameter while RequestGet does not."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-009",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "Goroutines cannot access Main Local Scope — can you give a practical example of why that matters?",
|
|
||||||
"ground_truth": "If you declare a variable in the main flow and launch a goroutine, the goroutine cannot read that variable. For example if you do addVar(counter, 0) in the main flow and then call go myFunction(), the function myFunction cannot access counter — it would get a runtime error. To share data with goroutines you must either pass the value as a function parameter, or declare the variable in Global Scope. This isolation prevents race conditions between concurrent goroutines and the main flow."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "GD-V-010",
|
|
||||||
"category": "CONVERSATIONAL",
|
|
||||||
"question": "What format does encodeSHA256 return its output in?",
|
|
||||||
"ground_truth": "encodeSHA256 always returns a 64-character lowercase hexadecimal string. This is the standard SHA-256 digest representation — 256 bits expressed as 64 hex characters (0-9 and a-f). The output is deterministic — the same input always produces the same hash — which is why SHA-256 is used for integrity verification rather than for generating unique identifiers."
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
@ -20,7 +20,6 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
session_store: dict[str, list] = defaultdict(list)
|
session_store: dict[str, list] = defaultdict(list)
|
||||||
|
|
||||||
|
|
||||||
def format_context(docs):
|
def format_context(docs):
|
||||||
chunks = []
|
chunks = []
|
||||||
for i, doc in enumerate(docs, 1):
|
for i, doc in enumerate(docs, 1):
|
||||||
|
|
@ -143,89 +142,6 @@ def hybrid_search_native(es_client, embeddings, query, index_name, k=8):
|
||||||
logger.info(f"[hybrid] RRF -> {len(docs)} final docs")
|
logger.info(f"[hybrid] RRF -> {len(docs)} final docs")
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
def _build_classify_prompt(question: str, history_text: str, selected_text: str) -> str:
    """Render the query-classifier prompt for one user turn.

    Fills the `{history}` and `{message}` slots of CLASSIFY_PROMPT_TEMPLATE.
    When the user has code selected in the editor, the selection is spliced
    in immediately before the <user_message> tag so deictic questions
    ('this', 'here', 'the code above') can be resolved against it.
    """
    rendered = (
        CLASSIFY_PROMPT_TEMPLATE
        .replace("{history}", history_text)
        .replace("{message}", question)
    )
    if not selected_text:
        return rendered

    editor_section = (
        "\n\n<editor_selection>\n"
        "The user currently has the following AVAP code selected in their editor. "
        "If the question refers to 'this', 'here', 'the code above', or similar, "
        "it is about this selection.\n"
        f"{selected_text}\n"
        "</editor_selection>"
    )
    # Anchor on the exact <user_message> tag produced by the template fill
    # above; if the tag is absent the prompt is returned unchanged.
    marker = f"<user_message>{question}</user_message>"
    return rendered.replace(marker, f"{editor_section}\n\n{marker}")
|
||||||
|
|
||||||
|
|
||||||
def _build_reformulate_query(question: str, selected_text: str) -> str:
|
|
||||||
if not selected_text:
|
|
||||||
return question
|
|
||||||
return f"{selected_text}\n\nUser question about the above: {question}"
|
|
||||||
|
|
||||||
|
|
||||||
def _build_generation_prompt(template_prompt: SystemMessage, context: str,
                             editor_content: str, selected_text: str,
                             extra_context: str) -> SystemMessage:
    """Assemble the final generation system prompt.

    Renders `template_prompt` with the RAG `context`, then prepends any
    editor-derived sections (selection, full file, extra context) so the
    model grounds its answer in the user's own code before the retrieved
    documentation.
    """
    rendered = template_prompt.content.format(context=context)

    blocks: list[str] = []

    if selected_text:
        blocks.append(
            "<selected_code>\n"
            "The user has the following AVAP code selected in their editor. "
            "Ground your answer in this code first. "
            "Use the RAG context as supplementary reference only.\n"
            f"{selected_text}\n"
            "</selected_code>"
        )

    if editor_content:
        blocks.append(
            "<editor_file>\n"
            "Full content of the active file open in the editor "
            "(use for broader context if needed):\n"
            f"{editor_content}\n"
            "</editor_file>"
        )

    if extra_context:
        blocks.append(
            "<extra_context>\n"
            f"{extra_context}\n"
            "</extra_context>"
        )

    if blocks:
        # Editor material goes BEFORE the templated body on purpose.
        rendered = "\n\n".join(blocks) + "\n\n" + rendered

    return SystemMessage(content=rendered)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_query_type(raw: str) -> tuple[str, bool]:
|
|
||||||
parts = raw.strip().upper().split()
|
|
||||||
query_type = "RETRIEVAL"
|
|
||||||
use_editor = False
|
|
||||||
if parts:
|
|
||||||
first = parts[0]
|
|
||||||
if first.startswith("CODE_GENERATION") or "CODE" in first:
|
|
||||||
query_type = "CODE_GENERATION"
|
|
||||||
elif first.startswith("CONVERSATIONAL"):
|
|
||||||
query_type = "CONVERSATIONAL"
|
|
||||||
if len(parts) > 1 and parts[1] == "EDITOR":
|
|
||||||
use_editor = True
|
|
||||||
return query_type, use_editor
|
|
||||||
|
|
||||||
def build_graph(llm, embeddings, es_client, index_name):
|
def build_graph(llm, embeddings, es_client, index_name):
|
||||||
|
|
||||||
def _persist(state: AgentState, response: BaseMessage):
|
def _persist(state: AgentState, response: BaseMessage):
|
||||||
|
|
@ -240,37 +156,43 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
user_msg.get("content", "")
|
user_msg.get("content", "")
|
||||||
if isinstance(user_msg, dict) else "")
|
if isinstance(user_msg, dict) else "")
|
||||||
history_msgs = messages[:-1]
|
history_msgs = messages[:-1]
|
||||||
selected_text = state.get("selected_text", "")
|
|
||||||
|
|
||||||
history_text = format_history_for_classify(history_msgs) if history_msgs else "(no history)"
|
if not history_msgs:
|
||||||
prompt_content = _build_classify_prompt(question, history_text, selected_text)
|
prompt_content = (
|
||||||
|
CLASSIFY_PROMPT_TEMPLATE
|
||||||
|
.replace("{history}", "(no history)")
|
||||||
|
.replace("{message}", question)
|
||||||
|
)
|
||||||
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
|
raw = resp.content.strip().upper()
|
||||||
|
query_type = _parse_query_type(raw)
|
||||||
|
logger.info(f"[classify] no historic content raw='{raw}' -> {query_type}")
|
||||||
|
return {"query_type": query_type}
|
||||||
|
|
||||||
|
history_text = format_history_for_classify(history_msgs)
|
||||||
|
prompt_content = (
|
||||||
|
CLASSIFY_PROMPT_TEMPLATE
|
||||||
|
.replace("{history}", history_text)
|
||||||
|
.replace("{message}", question)
|
||||||
|
)
|
||||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
raw = resp.content.strip().upper()
|
raw = resp.content.strip().upper()
|
||||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
query_type = _parse_query_type(raw)
|
||||||
logger.info(f"[classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
logger.info(f"[classify] raw='{raw}' -> {query_type}")
|
||||||
return {"query_type": query_type, "use_editor_context": use_editor_ctx}
|
return {"query_type": query_type}
|
||||||
|
|
||||||
|
def _parse_query_type(raw: str) -> str:
|
||||||
|
if raw.startswith("CODE_GENERATION") or "CODE" in raw:
|
||||||
|
return "CODE_GENERATION"
|
||||||
|
if raw.startswith("CONVERSATIONAL"):
|
||||||
|
return "CONVERSATIONAL"
|
||||||
|
return "RETRIEVAL"
|
||||||
|
|
||||||
def reformulate(state: AgentState) -> AgentState:
    """Rewrite the latest user message into a compact retrieval query.

    When editor text is selected, the selection is prepended as an anchor
    (via _build_reformulate_query) so the rewritten query stays grounded
    in that code; otherwise the classified query type is passed as a
    [MODE: ...] hint to the reformulation prompt.

    Returns a partial state update with 'reformulated_query'.
    """
    user_msg = state["messages"][-1]
    selected_text = state.get("selected_text", "")
    # Messages may arrive as LangChain message objects or as plain dicts.
    question = getattr(user_msg, "content",
                       user_msg.get("content", "")
                       if isinstance(user_msg, dict) else "")

    anchor = _build_reformulate_query(question, selected_text)

    if selected_text:
        # Fix: HumanMessage is already in module scope (it is used on the
        # else-branch below) — the previous function-local re-import
        # ('from langchain_core.messages import HumanMessage as HM') was
        # redundant shadowing and has been removed.
        resp = llm.invoke([REFORMULATE_PROMPT, HumanMessage(content=anchor)])
    else:
        query_type = state.get("query_type", "RETRIEVAL")
        mode_hint = HumanMessage(content=f"[MODE: {query_type}]\n{question}")
        resp = llm.invoke([REFORMULATE_PROMPT, mode_hint])

    reformulated = resp.content.strip()
    logger.info(f"[reformulate] selected={bool(selected_text)} -> '{reformulated}'")
    return {"reformulated_query": reformulated}
|
||||||
|
|
||||||
def retrieve(state: AgentState) -> AgentState:
|
def retrieve(state: AgentState) -> AgentState:
|
||||||
|
|
@ -287,13 +209,8 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
return {"context": context}
|
return {"context": context}
|
||||||
|
|
||||||
def generate(state):
|
def generate(state):
|
||||||
use_editor = state.get("use_editor_context", False)
|
prompt = SystemMessage(
|
||||||
prompt = _build_generation_prompt(
|
content=GENERATE_PROMPT.content.format(context=state["context"])
|
||||||
template_prompt = GENERATE_PROMPT,
|
|
||||||
context = state.get("context", ""),
|
|
||||||
editor_content = state.get("editor_content", "") if use_editor else "",
|
|
||||||
selected_text = state.get("selected_text", "") if use_editor else "",
|
|
||||||
extra_context = state.get("extra_context", ""),
|
|
||||||
)
|
)
|
||||||
resp = llm.invoke([prompt] + state["messages"])
|
resp = llm.invoke([prompt] + state["messages"])
|
||||||
logger.info(f"[generate] {len(resp.content)} chars")
|
logger.info(f"[generate] {len(resp.content)} chars")
|
||||||
|
|
@ -301,13 +218,8 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
return {"messages": [resp]}
|
return {"messages": [resp]}
|
||||||
|
|
||||||
def generate_code(state):
|
def generate_code(state):
|
||||||
use_editor = state.get("use_editor_context", False)
|
prompt = SystemMessage(
|
||||||
prompt = _build_generation_prompt(
|
content=CODE_GENERATION_PROMPT.content.format(context=state["context"])
|
||||||
template_prompt = CODE_GENERATION_PROMPT,
|
|
||||||
context = state.get("context", ""),
|
|
||||||
editor_content = state.get("editor_content", "") if use_editor else "",
|
|
||||||
selected_text = state.get("selected_text", "") if use_editor else "",
|
|
||||||
extra_context = state.get("extra_context", ""),
|
|
||||||
)
|
)
|
||||||
resp = llm.invoke([prompt] + state["messages"])
|
resp = llm.invoke([prompt] + state["messages"])
|
||||||
logger.info(f"[generate_code] {len(resp.content)} chars")
|
logger.info(f"[generate_code] {len(resp.content)} chars")
|
||||||
|
|
@ -315,16 +227,8 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
return {"messages": [resp]}
|
return {"messages": [resp]}
|
||||||
|
|
||||||
def respond_conversational(state):
    """Answer a conversational turn without retrieval.

    If the state carries extra_context (e.g. user profile facts), it is
    appended to the conversational system prompt inside an
    <extra_context> tag; otherwise the stock prompt is used as-is.
    Persists the exchange and returns the response message.
    """
    extra_context = state.get("extra_context", "")
    system_prompt = CONVERSATIONAL_PROMPT
    if extra_context:
        system_prompt = SystemMessage(content=(
            CONVERSATIONAL_PROMPT.content +
            f"\n\n<extra_context>\n{extra_context}\n</extra_context>"
        ))
    resp = llm.invoke([system_prompt] + state["messages"])
    logger.info("[conversational] from conversation")
    _persist(state, resp)
    return {"messages": [resp]}
|
||||||
|
|
||||||
|
|
@ -350,9 +254,9 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
"classify",
|
"classify",
|
||||||
route_by_type,
|
route_by_type,
|
||||||
{
|
{
|
||||||
"RETRIEVAL": "reformulate",
|
"RETRIEVAL": "reformulate",
|
||||||
"CODE_GENERATION": "reformulate",
|
"CODE_GENERATION": "reformulate",
|
||||||
"CONVERSATIONAL": "respond_conversational",
|
"CONVERSATIONAL": "respond_conversational",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -362,7 +266,7 @@ def build_graph(llm, embeddings, es_client, index_name):
|
||||||
"retrieve",
|
"retrieve",
|
||||||
route_after_retrieve,
|
route_after_retrieve,
|
||||||
{
|
{
|
||||||
"generate": "generate",
|
"generate": "generate",
|
||||||
"generate_code": "generate_code",
|
"generate_code": "generate_code",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
@ -380,39 +284,46 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
||||||
messages = state["messages"]
|
messages = state["messages"]
|
||||||
user_msg = messages[-1]
|
user_msg = messages[-1]
|
||||||
question = getattr(user_msg, "content",
|
question = getattr(user_msg, "content",
|
||||||
user_msg.get("content", "")
|
user_msg.get("content", "")
|
||||||
if isinstance(user_msg, dict) else "")
|
if isinstance(user_msg, dict) else "")
|
||||||
history_msgs = messages[:-1]
|
history_msgs = messages[:-1]
|
||||||
selected_text = state.get("selected_text", "")
|
|
||||||
|
|
||||||
history_text = format_history_for_classify(history_msgs) if history_msgs else "(no history)"
|
if not history_msgs:
|
||||||
prompt_content = _build_classify_prompt(question, history_text, selected_text)
|
prompt_content = (
|
||||||
|
CLASSIFY_PROMPT_TEMPLATE
|
||||||
|
.replace("{history}", "(no history)")
|
||||||
|
.replace("{message}", question)
|
||||||
|
)
|
||||||
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
|
raw = resp.content.strip().upper()
|
||||||
|
query_type = _parse_query_type(raw)
|
||||||
|
logger.info(f"[prepare/classify] no history raw='{raw}' -> {query_type}")
|
||||||
|
return {"query_type": query_type}
|
||||||
|
|
||||||
|
history_text = format_history_for_classify(history_msgs)
|
||||||
|
prompt_content = (
|
||||||
|
CLASSIFY_PROMPT_TEMPLATE
|
||||||
|
.replace("{history}", history_text)
|
||||||
|
.replace("{message}", question)
|
||||||
|
)
|
||||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
raw = resp.content.strip().upper()
|
raw = resp.content.strip().upper()
|
||||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
query_type = _parse_query_type(raw)
|
||||||
logger.info(f"[prepare/classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
logger.info(f"[prepare/classify] raw='{raw}' -> {query_type}")
|
||||||
return {"query_type": query_type, "use_editor_context": use_editor_ctx}
|
return {"query_type": query_type}
|
||||||
|
|
||||||
|
def _parse_query_type(raw: str) -> str:
|
||||||
|
if raw.startswith("CODE_GENERATION") or "CODE" in raw:
|
||||||
|
return "CODE_GENERATION"
|
||||||
|
if raw.startswith("CONVERSATIONAL"):
|
||||||
|
return "CONVERSATIONAL"
|
||||||
|
return "RETRIEVAL"
|
||||||
|
|
||||||
def reformulate(state: AgentState) -> AgentState:
    """Prepare-graph variant: rewrite the user message into a retrieval query.

    Mirrors the main graph's reformulate node: an editor selection is
    prepended as an anchor (via _build_reformulate_query); otherwise the
    classified query type is supplied as a [MODE: ...] hint.

    Returns a partial state update with 'reformulated_query'.
    """
    user_msg = state["messages"][-1]
    selected_text = state.get("selected_text", "")
    # Messages may arrive as LangChain message objects or as plain dicts.
    question = getattr(user_msg, "content",
                       user_msg.get("content", "")
                       if isinstance(user_msg, dict) else "")

    anchor = _build_reformulate_query(question, selected_text)

    if selected_text:
        # Fix: HumanMessage is already in module scope (used on the
        # else-branch below) — the function-local re-import
        # ('from langchain_core.messages import HumanMessage as HM') was
        # redundant shadowing and has been removed.
        resp = llm.invoke([REFORMULATE_PROMPT, HumanMessage(content=anchor)])
    else:
        query_type = state.get("query_type", "RETRIEVAL")
        mode_hint = HumanMessage(content=f"[MODE: {query_type}]\n{question}")
        resp = llm.invoke([REFORMULATE_PROMPT, mode_hint])

    reformulated = resp.content.strip()
    logger.info(f"[prepare/reformulate] selected={bool(selected_text)} -> '{reformulated}'")
    return {"reformulated_query": reformulated}
|
||||||
|
|
||||||
def retrieve(state: AgentState) -> AgentState:
|
def retrieve(state: AgentState) -> AgentState:
|
||||||
|
|
@ -455,7 +366,7 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
||||||
|
|
||||||
graph_builder.add_edge("reformulate", "retrieve")
|
graph_builder.add_edge("reformulate", "retrieve")
|
||||||
graph_builder.add_edge("retrieve", END)
|
graph_builder.add_edge("retrieve", END)
|
||||||
graph_builder.add_edge("skip_retrieve", END)
|
graph_builder.add_edge("skip_retrieve",END)
|
||||||
|
|
||||||
return graph_builder.compile()
|
return graph_builder.compile()
|
||||||
|
|
||||||
|
|
@ -464,37 +375,17 @@ def build_final_messages(state: AgentState) -> list:
|
||||||
query_type = state.get("query_type", "RETRIEVAL")
|
query_type = state.get("query_type", "RETRIEVAL")
|
||||||
context = state.get("context", "")
|
context = state.get("context", "")
|
||||||
messages = state.get("messages", [])
|
messages = state.get("messages", [])
|
||||||
editor_content = state.get("editor_content", "")
|
|
||||||
selected_text = state.get("selected_text", "")
|
|
||||||
extra_context = state.get("extra_context", "")
|
|
||||||
|
|
||||||
if query_type == "CONVERSATIONAL":
|
if query_type == "CONVERSATIONAL":
|
||||||
extra_context = state.get("extra_context", "")
|
return [CONVERSATIONAL_PROMPT] + messages
|
||||||
if extra_context:
|
|
||||||
enriched = SystemMessage(content=(
|
|
||||||
CONVERSATIONAL_PROMPT.content +
|
|
||||||
f"\n\n<extra_context>\n{extra_context}\n</extra_context>"
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
enriched = CONVERSATIONAL_PROMPT
|
|
||||||
return [enriched] + messages
|
|
||||||
|
|
||||||
use_editor = state.get("use_editor_context", False)
|
|
||||||
if query_type == "CODE_GENERATION":
|
if query_type == "CODE_GENERATION":
|
||||||
prompt = _build_generation_prompt(
|
prompt = SystemMessage(
|
||||||
template_prompt = CODE_GENERATION_PROMPT,
|
content=CODE_GENERATION_PROMPT.content.format(context=context)
|
||||||
context = context,
|
|
||||||
editor_content = editor_content if use_editor else "",
|
|
||||||
selected_text = selected_text if use_editor else "",
|
|
||||||
extra_context = extra_context,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt = _build_generation_prompt(
|
prompt = SystemMessage(
|
||||||
template_prompt = GENERATE_PROMPT,
|
content=GENERATE_PROMPT.content.format(context=context)
|
||||||
context = context,
|
|
||||||
editor_content = editor_content if use_editor else "",
|
|
||||||
selected_text= selected_text if use_editor else "",
|
|
||||||
extra_context = extra_context,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return [prompt] + messages
|
return [prompt] + messages
|
||||||
|
|
@ -154,42 +154,13 @@ def _query_from_messages(messages: list[ChatMessage]) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
async def _invoke_blocking(query: str, session_id: str, context = {}) -> str:
|
async def _invoke_blocking(query: str, session_id: str) -> str:
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
def _call():
|
def _call():
|
||||||
stub = get_stub()
|
stub = get_stub()
|
||||||
|
req = brunix_pb2.AgentRequest(query=query, session_id=session_id)
|
||||||
|
|
||||||
try:
|
|
||||||
ed_contxt = context["editor_content"] or ""
|
|
||||||
except Exception:
|
|
||||||
ed_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
sel_contxt = context["selected_text"] or ""
|
|
||||||
except Exception:
|
|
||||||
sel_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
ext_contxt = context["extra_context"] or ""
|
|
||||||
except Exception:
|
|
||||||
ext_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
us_info = str(context["user_info"]) or "{}"
|
|
||||||
except Exception:
|
|
||||||
us_info = "{}"
|
|
||||||
|
|
||||||
|
|
||||||
req = brunix_pb2.AgentRequest(query=query, session_id=session_id,
|
|
||||||
editor_content=ed_contxt,
|
|
||||||
selected_text=sel_contxt,
|
|
||||||
extra_context=ext_contxt,
|
|
||||||
user_info=us_info)
|
|
||||||
|
|
||||||
|
|
||||||
parts = []
|
parts = []
|
||||||
for resp in stub.AskAgent(req):
|
for resp in stub.AskAgent(req):
|
||||||
if resp.text:
|
if resp.text:
|
||||||
|
|
@ -199,7 +170,7 @@ async def _invoke_blocking(query: str, session_id: str, context = {}) -> str:
|
||||||
return await loop.run_in_executor(_thread_pool, _call)
|
return await loop.run_in_executor(_thread_pool, _call)
|
||||||
|
|
||||||
|
|
||||||
async def _iter_stream(query: str, session_id: str, context = {}) -> AsyncIterator[brunix_pb2.AgentResponse]:
|
async def _iter_stream(query: str, session_id: str) -> AsyncIterator[brunix_pb2.AgentResponse]:
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
queue: asyncio.Queue = asyncio.Queue()
|
queue: asyncio.Queue = asyncio.Queue()
|
||||||
|
|
@ -207,38 +178,8 @@ async def _iter_stream(query: str, session_id: str, context = {}) -> AsyncIterat
|
||||||
def _producer():
|
def _producer():
|
||||||
try:
|
try:
|
||||||
stub = get_stub()
|
stub = get_stub()
|
||||||
print("CONTEXT ====")
|
req = brunix_pb2.AgentRequest(query=query, session_id=session_id)
|
||||||
print(context)
|
for resp in stub.AskAgentStream(req): # ← AskAgentStream
|
||||||
print("======= ====")
|
|
||||||
|
|
||||||
try:
|
|
||||||
ed_contxt = context["editor_content"] or ""
|
|
||||||
except Exception:
|
|
||||||
ed_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
sel_contxt = context["selected_text"] or ""
|
|
||||||
except Exception:
|
|
||||||
sel_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
ext_contxt = context["extra_context"] or ""
|
|
||||||
except Exception:
|
|
||||||
ext_contxt = ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
us_info = str(context["user_info"]) or "{}"
|
|
||||||
except Exception:
|
|
||||||
us_info = "{}"
|
|
||||||
|
|
||||||
|
|
||||||
req = brunix_pb2.AgentRequest(query=query, session_id=session_id,
|
|
||||||
editor_content=ed_contxt,
|
|
||||||
selected_text=sel_contxt,
|
|
||||||
extra_context=ext_contxt,
|
|
||||||
user_info=us_info)
|
|
||||||
|
|
||||||
for resp in stub.AskAgentStream(req): # AskAgentStream
|
|
||||||
asyncio.run_coroutine_threadsafe(queue.put(resp), loop).result()
|
asyncio.run_coroutine_threadsafe(queue.put(resp), loop).result()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
asyncio.run_coroutine_threadsafe(queue.put(e), loop).result()
|
asyncio.run_coroutine_threadsafe(queue.put(e), loop).result()
|
||||||
|
|
@ -256,17 +197,16 @@ async def _iter_stream(query: str, session_id: str, context = {}) -> AsyncIterat
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
|
||||||
async def _stream_chat(query: str, session_id: str, req_id: str, context = {}) -> AsyncIterator[str]:
|
async def _stream_chat(query: str, session_id: str, req_id: str) -> AsyncIterator[str]:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for resp in _iter_stream(query, session_id, context):
|
async for resp in _iter_stream(query, session_id):
|
||||||
if resp.is_final:
|
if resp.is_final:
|
||||||
yield _sse(_chat_chunk("", req_id, finish="stop"))
|
yield _sse(_chat_chunk("", req_id, finish="stop"))
|
||||||
break
|
break
|
||||||
if resp.text:
|
if resp.text:
|
||||||
yield _sse(_chat_chunk(resp.text, req_id))
|
yield _sse(_chat_chunk(resp.text, req_id))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[stream_chat] error: {e}", exc_info=True)
|
logger.error(f"[stream_chat] error: {e}")
|
||||||
yield _sse(_chat_chunk(f"[Error: {e}]", req_id, finish="stop"))
|
yield _sse(_chat_chunk(f"[Error: {e}]", req_id, finish="stop"))
|
||||||
|
|
||||||
yield _sse_done()
|
yield _sse_done()
|
||||||
|
|
@ -345,17 +285,8 @@ async def list_models():
|
||||||
@app.post("/v1/chat/completions")
|
@app.post("/v1/chat/completions")
|
||||||
async def chat_completions(req: ChatCompletionRequest):
|
async def chat_completions(req: ChatCompletionRequest):
|
||||||
query = _query_from_messages(req.messages)
|
query = _query_from_messages(req.messages)
|
||||||
|
session_id = req.session_id or req.user or "default"
|
||||||
session_id = req.session_id or "default"
|
|
||||||
req_id = f"chatcmpl-{uuid.uuid4().hex}"
|
req_id = f"chatcmpl-{uuid.uuid4().hex}"
|
||||||
|
|
||||||
context = {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
context = json.loads(req.user)
|
|
||||||
except Exception as e:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
logger.info(f"[chat] session={session_id} stream={req.stream} query='{query[:80]}'")
|
logger.info(f"[chat] session={session_id} stream={req.stream} query='{query[:80]}'")
|
||||||
|
|
||||||
|
|
@ -365,7 +296,7 @@ async def chat_completions(req: ChatCompletionRequest):
|
||||||
if req.stream:
|
if req.stream:
|
||||||
|
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
_stream_chat(query, session_id, req_id, context),
|
_stream_chat(query, session_id, req_id),
|
||||||
media_type="text/event-stream",
|
media_type="text/event-stream",
|
||||||
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,7 @@ from langchain_core.messages import SystemMessage
|
||||||
CLASSIFY_PROMPT_TEMPLATE = (
|
CLASSIFY_PROMPT_TEMPLATE = (
|
||||||
"<role>\n"
|
"<role>\n"
|
||||||
"You are a query classifier for an AVAP language assistant. "
|
"You are a query classifier for an AVAP language assistant. "
|
||||||
"Your only job is to classify the user message into one of three categories "
|
"Your only job is to classify the user message into one of three categories.\n"
|
||||||
"and determine whether the user is explicitly asking about the editor code.\n"
|
|
||||||
"</role>\n\n"
|
"</role>\n\n"
|
||||||
|
|
||||||
"<categories>\n"
|
"<categories>\n"
|
||||||
|
|
@ -29,27 +28,9 @@ CLASSIFY_PROMPT_TEMPLATE = (
|
||||||
"'describe it in your own words', 'what did you mean?'\n"
|
"'describe it in your own words', 'what did you mean?'\n"
|
||||||
"</categories>\n\n"
|
"</categories>\n\n"
|
||||||
|
|
||||||
"<editor_rule>\n"
|
|
||||||
"The second word of your response indicates whether the user is explicitly "
|
|
||||||
"asking about the code in their editor or selected text.\n"
|
|
||||||
"Answer EDITOR only if the user message clearly refers to specific code "
|
|
||||||
"they are looking at — using expressions like: "
|
|
||||||
"'this code', 'este codigo', 'esto', 'this function', 'fix this', "
|
|
||||||
"'explain this', 'what does this do', 'que hace esto', "
|
|
||||||
"'como mejoro esto', 'el codigo del editor', 'lo que tengo aqui', "
|
|
||||||
"'this selection', 'lo seleccionado', or similar.\n"
|
|
||||||
"Answer NO_EDITOR in all other cases — including general AVAP questions, "
|
|
||||||
"code generation requests, and conversational follow-ups that do not "
|
|
||||||
"refer to specific editor code.\n"
|
|
||||||
"</editor_rule>\n\n"
|
|
||||||
|
|
||||||
"<output_rule>\n"
|
"<output_rule>\n"
|
||||||
"Your entire response must be exactly two words separated by a single space.\n"
|
"Your entire response must be exactly one word: "
|
||||||
"First word: RETRIEVAL, CODE_GENERATION, or CONVERSATIONAL.\n"
|
"RETRIEVAL, CODE_GENERATION, or CONVERSATIONAL. Nothing else.\n"
|
||||||
"Second word: EDITOR or NO_EDITOR.\n"
|
|
||||||
"Valid examples: 'RETRIEVAL NO_EDITOR', 'CODE_GENERATION EDITOR', "
|
|
||||||
"'CONVERSATIONAL NO_EDITOR'.\n"
|
|
||||||
"No other output. No punctuation. No explanation.\n"
|
|
||||||
"</output_rule>\n\n"
|
"</output_rule>\n\n"
|
||||||
|
|
||||||
"<conversation_history>\n"
|
"<conversation_history>\n"
|
||||||
|
|
@ -68,23 +49,10 @@ REFORMULATE_PROMPT = SystemMessage(
|
||||||
"into keyword queries that will find the right AVAP documentation chunks.\n"
|
"into keyword queries that will find the right AVAP documentation chunks.\n"
|
||||||
"</role>\n\n"
|
"</role>\n\n"
|
||||||
|
|
||||||
"<mode_rule>\n"
|
|
||||||
"The input starts with [MODE: X]. Follow these rules strictly:\n"
|
|
||||||
"- MODE RETRIEVAL: rewrite as compact keywords. DO NOT expand with AVAP commands. "
|
|
||||||
"DO NOT translate — preserve the original language.\n"
|
|
||||||
"- MODE CODE_GENERATION: apply the command expansion mapping in <task>.\n"
|
|
||||||
"- MODE CONVERSATIONAL: return the question as-is.\n"
|
|
||||||
"</mode_rule>\n\n"
|
|
||||||
|
|
||||||
"<language_rule>\n"
|
|
||||||
"NEVER translate the query. If the user writes in Spanish, rewrite in Spanish. "
|
|
||||||
"If the user writes in English, rewrite in English.\n"
|
|
||||||
"</language_rule>\n\n"
|
|
||||||
|
|
||||||
"<task>\n"
|
"<task>\n"
|
||||||
"Rewrite the user message into a compact keyword query for semantic search.\n\n"
|
"Rewrite the user message into a compact keyword query for semantic search.\n\n"
|
||||||
|
|
||||||
"SPECIAL RULE for CODE_GENERATION only:\n"
|
"SPECIAL RULE for code generation requests:\n"
|
||||||
"When the user asks to generate/create/build/show AVAP code, expand the query "
|
"When the user asks to generate/create/build/show AVAP code, expand the query "
|
||||||
"with the AVAP commands typically needed. Use this mapping:\n\n"
|
"with the AVAP commands typically needed. Use this mapping:\n\n"
|
||||||
|
|
||||||
|
|
@ -112,27 +80,21 @@ REFORMULATE_PROMPT = SystemMessage(
|
||||||
"- Remove filler words.\n"
|
"- Remove filler words.\n"
|
||||||
"- Output a single line.\n"
|
"- Output a single line.\n"
|
||||||
"- Never answer the question.\n"
|
"- Never answer the question.\n"
|
||||||
"- Never translate.\n"
|
|
||||||
"</rules>\n\n"
|
"</rules>\n\n"
|
||||||
|
|
||||||
"<examples>\n"
|
"<examples>\n"
|
||||||
"<example>\n"
|
"<example>\n"
|
||||||
"<input>[MODE: RETRIEVAL] Que significa AVAP?</input>\n"
|
"<input>What does AVAP stand for?</input>\n"
|
||||||
"<o>AVAP significado definición lenguaje DSL</o>\n"
|
"<o>AVAP stand for</o>\n"
|
||||||
"</example>\n\n"
|
"</example>\n\n"
|
||||||
|
|
||||||
"<example>\n"
|
"<example>\n"
|
||||||
"<input>[MODE: RETRIEVAL] What does AVAP stand for?</input>\n"
|
"<input>dime como seria un API que devuelva hello world con AVAP</input>\n"
|
||||||
"<o>AVAP definition language stands for</o>\n"
|
|
||||||
"</example>\n\n"
|
|
||||||
|
|
||||||
"<example>\n"
|
|
||||||
"<input>[MODE: CODE_GENERATION] dime como seria un API que devuelva hello world con AVAP</input>\n"
|
|
||||||
"<o>AVAP registerEndpoint addResult _status hello world example</o>\n"
|
"<o>AVAP registerEndpoint addResult _status hello world example</o>\n"
|
||||||
"</example>\n\n"
|
"</example>\n\n"
|
||||||
|
|
||||||
"<example>\n"
|
"<example>\n"
|
||||||
"<input>[MODE: CODE_GENERATION] generate an AVAP script that reads a parameter and queries the DB</input>\n"
|
"<input>generate an AVAP script that reads a parameter and queries the DB</input>\n"
|
||||||
"<o>AVAP addParam ormAccessSelect avapConnector registerEndpoint addResult</o>\n"
|
"<o>AVAP addParam ormAccessSelect avapConnector registerEndpoint addResult</o>\n"
|
||||||
"</example>\n"
|
"</example>\n"
|
||||||
"</examples>\n\n"
|
"</examples>\n\n"
|
||||||
|
|
@ -254,10 +216,8 @@ GENERATE_PROMPT = SystemMessage(
|
||||||
"</role>\n\n"
|
"</role>\n\n"
|
||||||
|
|
||||||
"<critical_constraint>\n"
|
"<critical_constraint>\n"
|
||||||
"AVAP is a new proprietary language. For AVAP technical questions, use ONLY "
|
"AVAP is a new proprietary language. Use ONLY content inside <context>. "
|
||||||
"content inside <context>. Treat any AVAP knowledge outside <context> as unreliable.\n"
|
"Treat any AVAP knowledge outside <context> as unreliable.\n"
|
||||||
"For user-specific information (name, role, preferences), use the <extra_context> "
|
|
||||||
"section if present — it overrides any retrieval result.\n"
|
|
||||||
"</critical_constraint>\n\n"
|
"</critical_constraint>\n\n"
|
||||||
|
|
||||||
"<task>\n"
|
"<task>\n"
|
||||||
|
|
@ -272,7 +232,6 @@ GENERATE_PROMPT = SystemMessage(
|
||||||
"</thinking_steps>\n\n"
|
"</thinking_steps>\n\n"
|
||||||
|
|
||||||
"<output_format>\n"
|
"<output_format>\n"
|
||||||
"Answer in the same language the user used.\n\n"
|
|
||||||
"Answer:\n"
|
"Answer:\n"
|
||||||
"<direct answer; include code blocks if context has relevant code>\n\n"
|
"<direct answer; include code blocks if context has relevant code>\n\n"
|
||||||
|
|
||||||
|
|
@ -288,4 +247,4 @@ GENERATE_PROMPT = SystemMessage(
|
||||||
"{context}\n"
|
"{context}\n"
|
||||||
"</context>"
|
"</context>"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
import base64
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from concurrent import futures
|
from concurrent import futures
|
||||||
|
|
@ -80,33 +79,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
def AskAgent(self, request, context):
|
def AskAgent(self, request, context):
|
||||||
session_id = request.session_id or "default"
|
session_id = request.session_id or "default"
|
||||||
query = request.query
|
query = request.query
|
||||||
|
logger.info(f"[AskAgent] session={session_id} query='{query[:80]}'")
|
||||||
try:
|
|
||||||
editor_content = base64.b64decode(request.editor_content).decode("utf-8") if request.editor_content else ""
|
|
||||||
except Exception:
|
|
||||||
editor_content = ""
|
|
||||||
logger.warning("[AskAgent] editor_content base64 decode failed")
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
selected_text = base64.b64decode(request.selected_text).decode("utf-8") if request.selected_text else ""
|
|
||||||
except Exception:
|
|
||||||
selected_text = ""
|
|
||||||
logger.warning("[AskAgent] selected_text base64 decode failed")
|
|
||||||
|
|
||||||
try:
|
|
||||||
extra_context = base64.b64decode(request.extra_context).decode("utf-8") if request.extra_context else ""
|
|
||||||
except Exception:
|
|
||||||
extra_context = ""
|
|
||||||
logger.warning("[AskAgent] extra_context base64 decode failed")
|
|
||||||
|
|
||||||
user_info = request.user_info or "{}"
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"[AskAgent] session={session_id} "
|
|
||||||
f"editor={bool(editor_content)} selected={bool(selected_text)} "
|
|
||||||
f"query='{query[:80]}'"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
history = list(session_store.get(session_id, []))
|
history = list(session_store.get(session_id, []))
|
||||||
|
|
@ -118,11 +91,6 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
"reformulated_query": "",
|
"reformulated_query": "",
|
||||||
"context": "",
|
"context": "",
|
||||||
"query_type": "",
|
"query_type": "",
|
||||||
|
|
||||||
"editor_content": editor_content,
|
|
||||||
"selected_text": selected_text,
|
|
||||||
"extra_context": extra_context,
|
|
||||||
"user_info": user_info
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final_state = self.graph.invoke(initial_state)
|
final_state = self.graph.invoke(initial_state)
|
||||||
|
|
@ -151,33 +119,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
def AskAgentStream(self, request, context):
|
def AskAgentStream(self, request, context):
|
||||||
session_id = request.session_id or "default"
|
session_id = request.session_id or "default"
|
||||||
query = request.query
|
query = request.query
|
||||||
|
logger.info(f"[AskAgentStream] session={session_id} query='{query[:80]}'")
|
||||||
try:
|
|
||||||
editor_content = base64.b64decode(request.editor_content).decode("utf-8") if request.editor_content else ""
|
|
||||||
except Exception:
|
|
||||||
editor_content = ""
|
|
||||||
logger.warning("[AskAgent] editor_content base64 decode failed")
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
selected_text = base64.b64decode(request.selected_text).decode("utf-8") if request.selected_text else ""
|
|
||||||
except Exception:
|
|
||||||
selected_text = ""
|
|
||||||
logger.warning("[AskAgent] selected_text base64 decode failed")
|
|
||||||
|
|
||||||
try:
|
|
||||||
extra_context = base64.b64decode(request.extra_context).decode("utf-8") if request.extra_context else ""
|
|
||||||
except Exception:
|
|
||||||
extra_context = ""
|
|
||||||
logger.warning("[AskAgent] extra_context base64 decode failed")
|
|
||||||
|
|
||||||
user_info = request.user_info or "{}"
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"[AskAgentStream] session={session_id} "
|
|
||||||
f"editor={bool(editor_content)} selected={bool(selected_text)} context={extra_context} "
|
|
||||||
f"query='{query[:80]}'"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
history = list(session_store.get(session_id, []))
|
history = list(session_store.get(session_id, []))
|
||||||
|
|
@ -189,11 +131,6 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
"reformulated_query": "",
|
"reformulated_query": "",
|
||||||
"context": "",
|
"context": "",
|
||||||
"query_type": "",
|
"query_type": "",
|
||||||
|
|
||||||
"editor_content": editor_content,
|
|
||||||
"selected_text": selected_text,
|
|
||||||
"extra_context": extra_context,
|
|
||||||
"user_info": user_info
|
|
||||||
}
|
}
|
||||||
|
|
||||||
prepared = self.prepare_graph.invoke(initial_state)
|
prepared = self.prepare_graph.invoke(initial_state)
|
||||||
|
|
|
||||||
|
|
@ -4,15 +4,8 @@ from langgraph.graph.message import add_messages
|
||||||
|
|
||||||
|
|
||||||
class AgentState(TypedDict):
|
class AgentState(TypedDict):
|
||||||
# -- CORE
|
|
||||||
messages: Annotated[list, add_messages]
|
messages: Annotated[list, add_messages]
|
||||||
reformulated_query: str
|
reformulated_query: str
|
||||||
context: str
|
context: str
|
||||||
query_type: str
|
query_type: str
|
||||||
session_id: str
|
session_id: str
|
||||||
# -- OPEN AI API
|
|
||||||
editor_content: str
|
|
||||||
selected_text: str
|
|
||||||
extra_context: str
|
|
||||||
user_info: str
|
|
||||||
use_editor_context: bool
|
|
||||||
|
|
@ -1,396 +0,0 @@
|
||||||
"""
|
|
||||||
tests/test_prd_0002.py
|
|
||||||
|
|
||||||
Unit tests for PRD-0002 — Editor Context Injection.
|
|
||||||
|
|
||||||
These tests run without any external dependencies (no Elasticsearch, no Ollama,
|
|
||||||
no gRPC server). They validate the logic of the components modified in PRD-0002:
|
|
||||||
|
|
||||||
- _parse_query_type — classifier output parser (graph.py)
|
|
||||||
- _parse_editor_context — user field parser (openai_proxy.py)
|
|
||||||
- _build_classify_prompt — classify prompt builder (graph.py)
|
|
||||||
- _build_reformulate_query — reformulate anchor builder (graph.py)
|
|
||||||
- _build_generation_prompt — generation prompt builder (graph.py)
|
|
||||||
- _decode_b64 — base64 decoder (server.py)
|
|
||||||
|
|
||||||
Run with:
|
|
||||||
pytest tests/test_prd_0002.py -v
|
|
||||||
"""
|
|
||||||
|
|
||||||
import base64
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Minimal stubs so we can import graph.py and openai_proxy.py without
|
|
||||||
# the full Docker/src environment loaded
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# Stub brunix_pb2 so openai_proxy imports cleanly
|
|
||||||
import types
|
|
||||||
|
|
||||||
brunix_pb2 = types.ModuleType("brunix_pb2")
|
|
||||||
brunix_pb2.AgentRequest = lambda **kw: kw
|
|
||||||
brunix_pb2.AgentResponse = lambda **kw: kw
|
|
||||||
sys.modules["brunix_pb2"] = brunix_pb2
|
|
||||||
sys.modules["brunix_pb2_grpc"] = types.ModuleType("brunix_pb2_grpc")
|
|
||||||
|
|
||||||
# Stub grpc
|
|
||||||
grpc_mod = types.ModuleType("grpc")
|
|
||||||
grpc_mod.insecure_channel = lambda *a, **kw: None
|
|
||||||
grpc_mod.Channel = object
|
|
||||||
grpc_mod.RpcError = Exception
|
|
||||||
sys.modules["grpc"] = grpc_mod
|
|
||||||
|
|
||||||
# Stub grpc_reflection
|
|
||||||
refl = types.ModuleType("grpc_reflection.v1alpha.reflection")
|
|
||||||
sys.modules["grpc_reflection"] = types.ModuleType("grpc_reflection")
|
|
||||||
sys.modules["grpc_reflection.v1alpha"] = types.ModuleType("grpc_reflection.v1alpha")
|
|
||||||
sys.modules["grpc_reflection.v1alpha.reflection"] = refl
|
|
||||||
|
|
||||||
# Add Docker/src to path so we can import the modules directly
|
|
||||||
DOCKER_SRC = os.path.join(os.path.dirname(__file__), "..", "Docker", "src")
|
|
||||||
sys.path.insert(0, os.path.abspath(DOCKER_SRC))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Import the functions under test
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# We import only the pure functions — no LLM, no ES, no gRPC calls
|
|
||||||
|
|
||||||
def _parse_query_type(raw: str):
|
|
||||||
"""Copy of _parse_query_type from graph.py — tested in isolation."""
|
|
||||||
parts = raw.strip().upper().split()
|
|
||||||
query_type = "RETRIEVAL"
|
|
||||||
use_editor = False
|
|
||||||
if parts:
|
|
||||||
first = parts[0]
|
|
||||||
if first.startswith("CODE_GENERATION") or "CODE" in first:
|
|
||||||
query_type = "CODE_GENERATION"
|
|
||||||
elif first.startswith("CONVERSATIONAL"):
|
|
||||||
query_type = "CONVERSATIONAL"
|
|
||||||
if len(parts) > 1 and parts[1] == "EDITOR":
|
|
||||||
use_editor = True
|
|
||||||
return query_type, use_editor
|
|
||||||
|
|
||||||
|
|
||||||
def _decode_b64(value: str) -> str:
|
|
||||||
"""Copy of _decode_b64 from server.py — tested in isolation."""
|
|
||||||
try:
|
|
||||||
return base64.b64decode(value).decode("utf-8") if value else ""
|
|
||||||
except Exception:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_editor_context(user):
|
|
||||||
"""Copy of _parse_editor_context from openai_proxy.py — tested in isolation."""
|
|
||||||
if not user:
|
|
||||||
return "", "", "", ""
|
|
||||||
try:
|
|
||||||
ctx = json.loads(user)
|
|
||||||
if isinstance(ctx, dict):
|
|
||||||
return (
|
|
||||||
ctx.get("editor_content", "") or "",
|
|
||||||
ctx.get("selected_text", "") or "",
|
|
||||||
ctx.get("extra_context", "") or "",
|
|
||||||
json.dumps(ctx.get("user_info", {})),
|
|
||||||
)
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
pass
|
|
||||||
return "", "", "", ""
|
|
||||||
|
|
||||||
|
|
||||||
def _build_reformulate_query(question: str, selected_text: str) -> str:
|
|
||||||
"""Copy of _build_reformulate_query from graph.py — tested in isolation."""
|
|
||||||
if not selected_text:
|
|
||||||
return question
|
|
||||||
return f"{selected_text}\n\nUser question about the above: {question}"
|
|
||||||
|
|
||||||
|
|
||||||
def _build_generation_prompt_injects(editor_content, selected_text, use_editor):
|
|
||||||
"""Helper — returns True if editor context would be injected."""
|
|
||||||
sections = []
|
|
||||||
if selected_text and use_editor:
|
|
||||||
sections.append("selected_code")
|
|
||||||
if editor_content and use_editor:
|
|
||||||
sections.append("editor_file")
|
|
||||||
return len(sections) > 0
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: _parse_query_type
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestParseQueryType:
|
|
||||||
|
|
||||||
def test_retrieval_no_editor(self):
|
|
||||||
qt, ue = _parse_query_type("RETRIEVAL NO_EDITOR")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_retrieval_editor(self):
|
|
||||||
qt, ue = _parse_query_type("RETRIEVAL EDITOR")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is True
|
|
||||||
|
|
||||||
def test_code_generation_no_editor(self):
|
|
||||||
qt, ue = _parse_query_type("CODE_GENERATION NO_EDITOR")
|
|
||||||
assert qt == "CODE_GENERATION"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_code_generation_editor(self):
|
|
||||||
qt, ue = _parse_query_type("CODE_GENERATION EDITOR")
|
|
||||||
assert qt == "CODE_GENERATION"
|
|
||||||
assert ue is True
|
|
||||||
|
|
||||||
def test_conversational_no_editor(self):
|
|
||||||
qt, ue = _parse_query_type("CONVERSATIONAL NO_EDITOR")
|
|
||||||
assert qt == "CONVERSATIONAL"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_single_token_defaults_no_editor(self):
|
|
||||||
"""If model returns only one token, use_editor defaults to False."""
|
|
||||||
qt, ue = _parse_query_type("RETRIEVAL")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_empty_defaults_retrieval_no_editor(self):
|
|
||||||
qt, ue = _parse_query_type("")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_case_insensitive(self):
|
|
||||||
qt, ue = _parse_query_type("retrieval editor")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is True
|
|
||||||
|
|
||||||
def test_code_shorthand(self):
|
|
||||||
"""'CODE' alone should map to CODE_GENERATION."""
|
|
||||||
qt, ue = _parse_query_type("CODE NO_EDITOR")
|
|
||||||
assert qt == "CODE_GENERATION"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_extra_whitespace(self):
|
|
||||||
qt, ue = _parse_query_type(" RETRIEVAL NO_EDITOR ")
|
|
||||||
assert qt == "RETRIEVAL"
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: _decode_b64
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestDecodeB64:
|
|
||||||
|
|
||||||
def test_valid_base64_spanish(self):
|
|
||||||
text = "addVar(mensaje, \"Hola mundo\")\naddResult(mensaje)"
|
|
||||||
encoded = base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
|
||||||
assert _decode_b64(encoded) == text
|
|
||||||
|
|
||||||
def test_valid_base64_english(self):
|
|
||||||
text = "registerEndpoint(\"GET\", \"/hello\", [], \"public\", handler, \"\")"
|
|
||||||
encoded = base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
|
||||||
assert _decode_b64(encoded) == text
|
|
||||||
|
|
||||||
def test_empty_string_returns_empty(self):
|
|
||||||
assert _decode_b64("") == ""
|
|
||||||
|
|
||||||
def test_none_equivalent_empty(self):
|
|
||||||
assert _decode_b64(None) == ""
|
|
||||||
|
|
||||||
def test_invalid_base64_returns_empty(self):
|
|
||||||
assert _decode_b64("not_valid_base64!!!") == ""
|
|
||||||
|
|
||||||
def test_unicode_content(self):
|
|
||||||
text = "// función de validación\nif(token, \"SECRET\", \"=\")"
|
|
||||||
encoded = base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
|
||||||
assert _decode_b64(encoded) == text
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: _parse_editor_context
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestParseEditorContext:
|
|
||||||
|
|
||||||
def _encode(self, text: str) -> str:
|
|
||||||
return base64.b64encode(text.encode()).decode()
|
|
||||||
|
|
||||||
def test_full_context_parsed(self):
|
|
||||||
editor = self._encode("addVar(x, 10)")
|
|
||||||
selected = self._encode("addResult(x)")
|
|
||||||
extra = self._encode("/path/to/file.avap")
|
|
||||||
user_json = json.dumps({
|
|
||||||
"editor_content": editor,
|
|
||||||
"selected_text": selected,
|
|
||||||
"extra_context": extra,
|
|
||||||
"user_info": {"dev_id": 1, "project_id": 2, "org_id": 3}
|
|
||||||
})
|
|
||||||
ec, st, ex, ui = _parse_editor_context(user_json)
|
|
||||||
assert ec == editor
|
|
||||||
assert st == selected
|
|
||||||
assert ex == extra
|
|
||||||
assert json.loads(ui) == {"dev_id": 1, "project_id": 2, "org_id": 3}
|
|
||||||
|
|
||||||
def test_empty_user_returns_empty_tuple(self):
|
|
||||||
ec, st, ex, ui = _parse_editor_context(None)
|
|
||||||
assert ec == st == ex == ""
|
|
||||||
|
|
||||||
def test_empty_string_returns_empty_tuple(self):
|
|
||||||
ec, st, ex, ui = _parse_editor_context("")
|
|
||||||
assert ec == st == ex == ""
|
|
||||||
|
|
||||||
def test_plain_string_not_json_returns_empty(self):
|
|
||||||
"""Non-JSON user field — backward compat, no error raised."""
|
|
||||||
ec, st, ex, ui = _parse_editor_context("plain string")
|
|
||||||
assert ec == st == ex == ""
|
|
||||||
|
|
||||||
def test_missing_fields_default_empty(self):
|
|
||||||
user_json = json.dumps({"editor_content": "abc"})
|
|
||||||
ec, st, ex, ui = _parse_editor_context(user_json)
|
|
||||||
assert ec == "abc"
|
|
||||||
assert st == ""
|
|
||||||
assert ex == ""
|
|
||||||
|
|
||||||
def test_user_info_missing_defaults_empty_object(self):
|
|
||||||
user_json = json.dumps({"editor_content": "abc"})
|
|
||||||
_, _, _, ui = _parse_editor_context(user_json)
|
|
||||||
assert json.loads(ui) == {}
|
|
||||||
|
|
||||||
def test_user_info_full_object(self):
|
|
||||||
user_json = json.dumps({
|
|
||||||
"editor_content": "",
|
|
||||||
"selected_text": "",
|
|
||||||
"extra_context": "",
|
|
||||||
"user_info": {"dev_id": 42, "project_id": 7, "org_id": 99}
|
|
||||||
})
|
|
||||||
_, _, _, ui = _parse_editor_context(user_json)
|
|
||||||
parsed = json.loads(ui)
|
|
||||||
assert parsed["dev_id"] == 42
|
|
||||||
assert parsed["project_id"] == 7
|
|
||||||
assert parsed["org_id"] == 99
|
|
||||||
|
|
||||||
def test_session_id_not_leaked_into_context(self):
|
|
||||||
"""session_id must NOT appear in editor context — it has its own field."""
|
|
||||||
user_json = json.dumps({
|
|
||||||
"editor_content": "",
|
|
||||||
"selected_text": "",
|
|
||||||
"extra_context": "",
|
|
||||||
"user_info": {}
|
|
||||||
})
|
|
||||||
ec, st, ex, ui = _parse_editor_context(user_json)
|
|
||||||
assert "session_id" not in ec
|
|
||||||
assert "session_id" not in st
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: _build_reformulate_query
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestBuildReformulateQuery:
|
|
||||||
|
|
||||||
def test_no_selected_text_returns_question(self):
|
|
||||||
q = "Que significa AVAP?"
|
|
||||||
assert _build_reformulate_query(q, "") == q
|
|
||||||
|
|
||||||
def test_selected_text_prepended_to_question(self):
|
|
||||||
q = "que hace esto?"
|
|
||||||
selected = "addVar(x, 10)\naddResult(x)"
|
|
||||||
result = _build_reformulate_query(q, selected)
|
|
||||||
assert result.startswith(selected)
|
|
||||||
assert q in result
|
|
||||||
|
|
||||||
def test_selected_text_anchor_format(self):
|
|
||||||
q = "fix this"
|
|
||||||
selected = "try()\n ormDirect(query, res)\nexception(e)\nend()"
|
|
||||||
result = _build_reformulate_query(q, selected)
|
|
||||||
assert "User question about the above:" in result
|
|
||||||
assert selected in result
|
|
||||||
assert q in result
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: editor context injection logic
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestEditorContextInjection:
|
|
||||||
|
|
||||||
def test_no_injection_when_use_editor_false(self):
|
|
||||||
"""Editor content must NOT be injected when use_editor_context is False."""
|
|
||||||
injected = _build_generation_prompt_injects(
|
|
||||||
editor_content = "addVar(x, 10)",
|
|
||||||
selected_text = "addResult(x)",
|
|
||||||
use_editor = False,
|
|
||||||
)
|
|
||||||
assert injected is False
|
|
||||||
|
|
||||||
def test_injection_when_use_editor_true_and_content_present(self):
|
|
||||||
"""Editor content MUST be injected when use_editor_context is True."""
|
|
||||||
injected = _build_generation_prompt_injects(
|
|
||||||
editor_content = "addVar(x, 10)",
|
|
||||||
selected_text = "addResult(x)",
|
|
||||||
use_editor = True,
|
|
||||||
)
|
|
||||||
assert injected is True
|
|
||||||
|
|
||||||
def test_no_injection_when_content_empty_even_if_flag_true(self):
|
|
||||||
"""Empty fields must never be injected even if flag is True."""
|
|
||||||
injected = _build_generation_prompt_injects(
|
|
||||||
editor_content = "",
|
|
||||||
selected_text = "",
|
|
||||||
use_editor = True,
|
|
||||||
)
|
|
||||||
assert injected is False
|
|
||||||
|
|
||||||
def test_partial_injection_selected_only(self):
|
|
||||||
"""selected_text alone triggers injection when flag is True."""
|
|
||||||
injected = _build_generation_prompt_injects(
|
|
||||||
editor_content = "",
|
|
||||||
selected_text = "addResult(x)",
|
|
||||||
use_editor = True,
|
|
||||||
)
|
|
||||||
assert injected is True
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tests: classifier routing — EDITOR signal
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestClassifierEditorSignal:
|
|
||||||
"""
|
|
||||||
These tests validate that the two-token output format is correctly parsed
|
|
||||||
for all combinations the classifier can produce.
|
|
||||||
"""
|
|
||||||
|
|
||||||
VALID_OUTPUTS = [
|
|
||||||
("RETRIEVAL NO_EDITOR", "RETRIEVAL", False),
|
|
||||||
("RETRIEVAL EDITOR", "RETRIEVAL", True),
|
|
||||||
("CODE_GENERATION NO_EDITOR", "CODE_GENERATION", False),
|
|
||||||
("CODE_GENERATION EDITOR", "CODE_GENERATION", True),
|
|
||||||
("CONVERSATIONAL NO_EDITOR", "CONVERSATIONAL", False),
|
|
||||||
("CONVERSATIONAL EDITOR", "CONVERSATIONAL", True),
|
|
||||||
]
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("raw,expected_qt,expected_ue", VALID_OUTPUTS)
|
|
||||||
def test_valid_two_token_output(self, raw, expected_qt, expected_ue):
|
|
||||||
qt, ue = _parse_query_type(raw)
|
|
||||||
assert qt == expected_qt
|
|
||||||
assert ue == expected_ue
|
|
||||||
|
|
||||||
def test_editor_flag_false_for_general_avap_question(self):
|
|
||||||
"""'Que significa AVAP?' -> RETRIEVAL NO_EDITOR."""
|
|
||||||
qt, ue = _parse_query_type("RETRIEVAL NO_EDITOR")
|
|
||||||
assert ue is False
|
|
||||||
|
|
||||||
def test_editor_flag_true_for_explicit_editor_reference(self):
|
|
||||||
"""'que hace este codigo?' with selected_text -> RETRIEVAL EDITOR."""
|
|
||||||
qt, ue = _parse_query_type("RETRIEVAL EDITOR")
|
|
||||||
assert ue is True
|
|
||||||
|
|
||||||
def test_editor_flag_false_for_code_generation_without_reference(self):
|
|
||||||
"""'dame un API de hello world' -> CODE_GENERATION NO_EDITOR."""
|
|
||||||
qt, ue = _parse_query_type("CODE_GENERATION NO_EDITOR")
|
|
||||||
assert ue is False
|
|
||||||
448
NOTICE
448
NOTICE
|
|
@ -1,448 +0,0 @@
|
||||||
NOTICE
|
|
||||||
======
|
|
||||||
|
|
||||||
Brunix Assistance Engine
|
|
||||||
Copyright (c) 2026 101OBEX Corp. All rights reserved.
|
|
||||||
|
|
||||||
This product includes software developed by third parties under open source
|
|
||||||
licenses. The following is a list of the open source components used in this
|
|
||||||
product, along with their respective licenses and copyright notices.
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
RUNTIME DEPENDENCIES (Docker/requirements.txt)
|
|
||||||
-----------------------------------------------
|
|
||||||
|
|
||||||
aiohttp (3.13.3)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: aio-libs contributors
|
|
||||||
https://github.com/aio-libs/aiohttp
|
|
||||||
|
|
||||||
annotated-types (0.7.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Adrian Garcia Badaracco, Samuel Colvin, Zac Hatfield-Dodds
|
|
||||||
https://github.com/annotated-types/annotated-types
|
|
||||||
|
|
||||||
anyio (4.12.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Alex Grönholm
|
|
||||||
https://github.com/agronholm/anyio
|
|
||||||
|
|
||||||
attrs (25.4.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Hynek Schlawack
|
|
||||||
https://github.com/python-attrs/attrs
|
|
||||||
|
|
||||||
boto3 (1.42.58)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Amazon Web Services
|
|
||||||
https://github.com/boto/boto3
|
|
||||||
|
|
||||||
botocore (1.42.58)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Amazon Web Services
|
|
||||||
https://github.com/boto/botocore
|
|
||||||
|
|
||||||
certifi
|
|
||||||
License: MPL 2.0
|
|
||||||
Copyright: Kenneth Reitz
|
|
||||||
https://github.com/certifi/python-certifi
|
|
||||||
|
|
||||||
charset-normalizer (3.4.4)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Ahmed TAHRI
|
|
||||||
https://github.com/Ousret/charset_normalizer
|
|
||||||
|
|
||||||
chonkie (1.5.6)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Bhavnick Minhas
|
|
||||||
https://github.com/chonkie-ai/chonkie
|
|
||||||
|
|
||||||
click (8.3.1)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Armin Ronacher
|
|
||||||
https://github.com/pallets/click
|
|
||||||
|
|
||||||
dataclasses-json (0.6.7)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Lídia Contreras, Radek Nohejl
|
|
||||||
https://github.com/lidatong/dataclasses-json
|
|
||||||
|
|
||||||
elastic-transport (8.17.1)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Elasticsearch B.V.
|
|
||||||
https://github.com/elastic/elastic-transport-python
|
|
||||||
|
|
||||||
elasticsearch (8.19.3)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Elasticsearch B.V.
|
|
||||||
https://github.com/elastic/elasticsearch-py
|
|
||||||
|
|
||||||
fastapi (0.111+)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Sebastián Ramírez
|
|
||||||
https://github.com/fastapi/fastapi
|
|
||||||
|
|
||||||
filelock (3.24.3)
|
|
||||||
License: Unlicense / Public Domain
|
|
||||||
https://github.com/tox-dev/filelock
|
|
||||||
|
|
||||||
grpcio (1.78.1)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: The gRPC Authors
|
|
||||||
https://github.com/grpc/grpc
|
|
||||||
|
|
||||||
grpcio-reflection (1.78.1)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: The gRPC Authors
|
|
||||||
https://github.com/grpc/grpc
|
|
||||||
|
|
||||||
grpcio-tools (1.78.1)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: The gRPC Authors
|
|
||||||
https://github.com/grpc/grpc
|
|
||||||
|
|
||||||
httpcore (1.0.9)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Tom Christie
|
|
||||||
https://github.com/encode/httpcore
|
|
||||||
|
|
||||||
httpx (0.28.1)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Tom Christie
|
|
||||||
https://github.com/encode/httpx
|
|
||||||
|
|
||||||
huggingface-hub (0.36.2)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: HuggingFace Inc.
|
|
||||||
https://github.com/huggingface/huggingface_hub
|
|
||||||
|
|
||||||
jinja2 (3.1.6)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Armin Ronacher
|
|
||||||
https://github.com/pallets/jinja
|
|
||||||
|
|
||||||
joblib (1.5.3)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Gael Varoquaux
|
|
||||||
https://github.com/joblib/joblib
|
|
||||||
|
|
||||||
jsonpatch (1.33)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Stefan Kögl
|
|
||||||
https://github.com/stefankoegl/python-json-patch
|
|
||||||
|
|
||||||
langchain (1.2.10)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-anthropic
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-aws (1.3.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-community (0.4.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-core (1.2.15)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-elasticsearch (1.0.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-huggingface (1.2.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langchain-ollama (1.0.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langchain
|
|
||||||
|
|
||||||
langgraph (1.0.9)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langgraph
|
|
||||||
|
|
||||||
langsmith (0.7.6)
|
|
||||||
License: MIT
|
|
||||||
Copyright: LangChain, Inc.
|
|
||||||
https://github.com/langchain-ai/langsmith
|
|
||||||
|
|
||||||
loguru (0.7.3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Delgan
|
|
||||||
https://github.com/Delgan/loguru
|
|
||||||
|
|
||||||
model2vec (0.7.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: MinishLab
|
|
||||||
https://github.com/MinishLab/model2vec
|
|
||||||
|
|
||||||
nltk (3.9.3)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: NLTK Project
|
|
||||||
https://github.com/nltk/nltk
|
|
||||||
|
|
||||||
numpy (2.4.2)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: NumPy Developers
|
|
||||||
https://github.com/numpy/numpy
|
|
||||||
|
|
||||||
ollama (0.6.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Ollama
|
|
||||||
https://github.com/ollama/ollama-python
|
|
||||||
|
|
||||||
orjson (3.11.7)
|
|
||||||
License: Apache 2.0 / MIT
|
|
||||||
Copyright: ijl
|
|
||||||
https://github.com/ijl/orjson
|
|
||||||
|
|
||||||
packaging (24.2)
|
|
||||||
License: Apache 2.0 / BSD 2-Clause
|
|
||||||
Copyright: PyPA
|
|
||||||
https://github.com/pypa/packaging
|
|
||||||
|
|
||||||
pandas (3.0.1)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: The Pandas Development Team
|
|
||||||
https://github.com/pandas-dev/pandas
|
|
||||||
|
|
||||||
protobuf (6.33.5)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Google LLC
|
|
||||||
https://github.com/protocolbuffers/protobuf
|
|
||||||
|
|
||||||
pydantic (2.12.5)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Samuel Colvin
|
|
||||||
https://github.com/pydantic/pydantic
|
|
||||||
|
|
||||||
pydantic-settings (2.13.1)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Samuel Colvin
|
|
||||||
https://github.com/pydantic/pydantic-settings
|
|
||||||
|
|
||||||
pygments (2.19.2)
|
|
||||||
License: BSD 2-Clause
|
|
||||||
Copyright: Georg Brandl
|
|
||||||
https://github.com/pygments/pygments
|
|
||||||
|
|
||||||
python-dateutil (2.9.0)
|
|
||||||
License: Apache 2.0 / BSD 3-Clause
|
|
||||||
Copyright: Gustavo Niemeyer
|
|
||||||
https://github.com/dateutil/dateutil
|
|
||||||
|
|
||||||
python-dotenv (1.2.1)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Saurabh Kumar
|
|
||||||
https://github.com/theskumar/python-dotenv
|
|
||||||
|
|
||||||
pyyaml (6.0.3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Kirill Simonov
|
|
||||||
https://github.com/yaml/pyyaml
|
|
||||||
|
|
||||||
ragas (0.4.3+)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Exploding Gradients
|
|
||||||
https://github.com/explodinggradients/ragas
|
|
||||||
|
|
||||||
rapidfuzz (3.14.3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Max Bachmann
|
|
||||||
https://github.com/rapidfuzz/RapidFuzz
|
|
||||||
|
|
||||||
regex (2026.2.19)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Matthew Barnett
|
|
||||||
https://github.com/mrabarnett/mrab-regex
|
|
||||||
|
|
||||||
requests (2.32.5)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Kenneth Reitz
|
|
||||||
https://github.com/psf/requests
|
|
||||||
|
|
||||||
rich (14.3.3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Will McGugan
|
|
||||||
https://github.com/Textualize/rich
|
|
||||||
|
|
||||||
s3transfer (0.16.0)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Amazon Web Services
|
|
||||||
https://github.com/boto/s3transfer
|
|
||||||
|
|
||||||
safetensors (0.7.0)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: HuggingFace Inc.
|
|
||||||
https://github.com/huggingface/safetensors
|
|
||||||
|
|
||||||
setuptools (82.0.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Jason R. Coombs
|
|
||||||
https://github.com/pypa/setuptools
|
|
||||||
|
|
||||||
six (1.17.0)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Benjamin Peterson
|
|
||||||
https://github.com/benjaminp/six
|
|
||||||
|
|
||||||
sqlalchemy (2.0.46)
|
|
||||||
License: MIT
|
|
||||||
Copyright: SQLAlchemy authors
|
|
||||||
https://github.com/sqlalchemy/sqlalchemy
|
|
||||||
|
|
||||||
tenacity (9.1.4)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Julien Danjou
|
|
||||||
https://github.com/jd/tenacity
|
|
||||||
|
|
||||||
tokenizers (0.22.2)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: HuggingFace Inc.
|
|
||||||
https://github.com/huggingface/tokenizers
|
|
||||||
|
|
||||||
tqdm (4.67.3)
|
|
||||||
License: MIT / MPL 2.0
|
|
||||||
Copyright: Casper da Costa-Luis
|
|
||||||
https://github.com/tqdm/tqdm
|
|
||||||
|
|
||||||
typing-extensions (4.15.0)
|
|
||||||
License: PSF 2.0
|
|
||||||
Copyright: Python Software Foundation
|
|
||||||
https://github.com/python/typing_extensions
|
|
||||||
|
|
||||||
urllib3 (2.6.3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Andrey Petrov
|
|
||||||
https://github.com/urllib3/urllib3
|
|
||||||
|
|
||||||
uvicorn (0.29+)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Tom Christie
|
|
||||||
https://github.com/encode/uvicorn
|
|
||||||
|
|
||||||
xxhash (3.6.0)
|
|
||||||
License: BSD 2-Clause
|
|
||||||
Copyright: Yue Du
|
|
||||||
https://github.com/ifduyue/python-xxhash
|
|
||||||
|
|
||||||
yarl (1.22.0)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: aio-libs contributors
|
|
||||||
https://github.com/aio-libs/yarl
|
|
||||||
|
|
||||||
zstandard (0.25.0)
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Gregory Szorc
|
|
||||||
https://github.com/indygreg/python-zstandard
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
DEVELOPMENT DEPENDENCIES (pyproject.toml — dev group)
|
|
||||||
------------------------------------------------------
|
|
||||||
These dependencies are used only during development and research.
|
|
||||||
They are not included in the production Docker image.
|
|
||||||
|
|
||||||
beir (2.2.0+)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: Nandan Thakur
|
|
||||||
https://github.com/beir-cellar/beir
|
|
||||||
|
|
||||||
datasets
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: HuggingFace Inc.
|
|
||||||
https://github.com/huggingface/datasets
|
|
||||||
|
|
||||||
jupyter
|
|
||||||
License: BSD 3-Clause
|
|
||||||
Copyright: Project Jupyter Contributors
|
|
||||||
https://github.com/jupyter/jupyter
|
|
||||||
|
|
||||||
langfuse (<3)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Langfuse GmbH
|
|
||||||
https://github.com/langfuse/langfuse
|
|
||||||
|
|
||||||
litellm (1.82.0+)
|
|
||||||
License: MIT
|
|
||||||
Copyright: BerriAI
|
|
||||||
https://github.com/BerriAI/litellm
|
|
||||||
|
|
||||||
mteb (2.8.8+)
|
|
||||||
License: Apache 2.0
|
|
||||||
Copyright: MTEB Authors
|
|
||||||
https://github.com/embeddings-benchmark/mteb
|
|
||||||
|
|
||||||
polars (1.38.1+)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Ritchie Vink
|
|
||||||
https://github.com/pola-rs/polars
|
|
||||||
|
|
||||||
ruff (0.15.1+)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Astral Software
|
|
||||||
https://github.com/astral-sh/ruff
|
|
||||||
|
|
||||||
tree-sitter-language-pack (0.13.0+)
|
|
||||||
License: MIT
|
|
||||||
Copyright: Various
|
|
||||||
https://github.com/Goldziher/tree-sitter-language-pack
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
EXTERNAL SERVICES (not bundled — accessed at runtime via API or network)
|
|
||||||
-------------------------------------------------------------------------
|
|
||||||
|
|
||||||
Ollama
|
|
||||||
License: MIT
|
|
||||||
Copyright: Ollama, Inc.
|
|
||||||
https://github.com/ollama/ollama
|
|
||||||
Note: Used as local LLM and embedding inference server.
|
|
||||||
Not bundled in this repository.
|
|
||||||
|
|
||||||
Elasticsearch (8.x)
|
|
||||||
License: SSPL / Elastic License 2.0
|
|
||||||
Copyright: Elasticsearch B.V.
|
|
||||||
https://github.com/elastic/elasticsearch
|
|
||||||
Note: Used as vector database and full-text search engine.
|
|
||||||
Not bundled in this repository. Deployed separately on Devaron Cluster.
|
|
||||||
|
|
||||||
Anthropic Claude API
|
|
||||||
Copyright: Anthropic, PBC.
|
|
||||||
https://www.anthropic.com
|
|
||||||
Note: Used as evaluation judge in the EvaluateRAG pipeline.
|
|
||||||
Accessed via API key. Not bundled in this repository.
|
|
||||||
|
|
||||||
Langfuse
|
|
||||||
License: MIT (self-hosted)
|
|
||||||
Copyright: Langfuse GmbH
|
|
||||||
https://github.com/langfuse/langfuse
|
|
||||||
Note: Used for LLM observability and tracing.
|
|
||||||
Deployed separately on Devaron Cluster.
|
|
||||||
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
DISCLAIMER
|
|
||||||
|
|
||||||
The licenses listed above are provided for informational purposes only.
|
|
||||||
101OBEX Corp makes no representations or warranties regarding the accuracy
|
|
||||||
of this list. Users of this software are responsible for ensuring compliance
|
|
||||||
with the applicable license terms of all third-party components.
|
|
||||||
|
|
||||||
For questions regarding licensing, contact: https://www.101obex.com
|
|
||||||
48
README.md
48
README.md
|
|
@ -63,8 +63,6 @@ graph TD
|
||||||
│ │ └── utils/
|
│ │ └── utils/
|
||||||
│ │ ├── emb_factory.py # Provider-agnostic embedding model factory
|
│ │ ├── emb_factory.py # Provider-agnostic embedding model factory
|
||||||
│ │ └── llm_factory.py # Provider-agnostic LLM factory
|
│ │ └── llm_factory.py # Provider-agnostic LLM factory
|
||||||
│ ├── tests/
|
|
||||||
│ │ └── test_prd_0002.py # Unit tests — editor context, classifier, proxy parsing
|
|
||||||
│ ├── Dockerfile # Multi-stage container build
|
│ ├── Dockerfile # Multi-stage container build
|
||||||
│ ├── docker-compose.yaml # Local dev orchestration
|
│ ├── docker-compose.yaml # Local dev orchestration
|
||||||
│ ├── entrypoint.sh # Starts gRPC server + HTTP proxy in parallel
|
│ ├── entrypoint.sh # Starts gRPC server + HTTP proxy in parallel
|
||||||
|
|
@ -77,26 +75,17 @@ graph TD
|
||||||
│ ├── API_REFERENCE.md # Complete gRPC & HTTP API contract with examples
|
│ ├── API_REFERENCE.md # Complete gRPC & HTTP API contract with examples
|
||||||
│ ├── RUNBOOK.md # Operational playbooks and incident response
|
│ ├── RUNBOOK.md # Operational playbooks and incident response
|
||||||
│ ├── AVAP_CHUNKER_CONFIG.md # avap_config.json reference — blocks, statements, semantic tags
|
│ ├── AVAP_CHUNKER_CONFIG.md # avap_config.json reference — blocks, statements, semantic tags
|
||||||
│ ├── ADR/ # Architecture Decision Records
|
│ ├── adr/ # Architecture Decision Records
|
||||||
│ │ ├── ADR-0001-grpc-primary-interface.md
|
│ │ ├── ADR-0001-grpc-primary-interface.md
|
||||||
│ │ ├── ADR-0002-two-phase-streaming.md
|
│ │ ├── ADR-0002-two-phase-streaming.md
|
||||||
│ │ ├── ADR-0003-hybrid-retrieval-rrf.md
|
│ │ ├── ADR-0003-hybrid-retrieval-rrf.md
|
||||||
│ │ ├── ADR-0004-claude-eval-judge.md
|
│ │ └── ADR-0004-claude-eval-judge.md
|
||||||
│ │ └── ADR-0005-embedding-model-selection.md
|
|
||||||
│ └── product/ # Product Requirements Documents
|
|
||||||
│ ├── PRD-0001-openai-compatible-proxy.md
|
|
||||||
│ └── PRD-0002-editor-context-injection.md
|
|
||||||
│ ├── avap_language_github_docs/ # AVAP language reference docs (GitHub source)
|
│ ├── avap_language_github_docs/ # AVAP language reference docs (GitHub source)
|
||||||
│ ├── developer.avapframework.com/ # AVAP developer portal docs
|
│ ├── developer.avapframework.com/ # AVAP developer portal docs
|
||||||
│ ├── LRM/
|
│ ├── LRM/
|
||||||
│ │ └── avap.md # AVAP Language Reference Manual (LRM)
|
│ │ └── avap.md # AVAP Language Reference Manual (LRM)
|
||||||
│ └── samples/ # AVAP code samples (.avap) used for ingestion
|
│ └── samples/ # AVAP code samples (.avap) used for ingestion
|
||||||
│
|
│
|
||||||
├── LICENSE # Proprietary license — 101OBEX Corp, Delaware
|
|
||||||
│
|
|
||||||
├── research/ # Experiment results, benchmarks, datasets (MrHouston)
|
|
||||||
│ └── embeddings/ # Embedding model benchmark results (BEIR)
|
|
||||||
│
|
|
||||||
├── ingestion/
|
├── ingestion/
|
||||||
│ └── chunks.json # Last export of ingested chunks (ES bulk output)
|
│ └── chunks.json # Last export of ingested chunks (ES bulk output)
|
||||||
│
|
│
|
||||||
|
|
@ -120,7 +109,6 @@ graph TD
|
||||||
│ └── ingestion/
|
│ └── ingestion/
|
||||||
│ └── chunks.jsonl # JSONL output from avap_chunker.py
|
│ └── chunks.jsonl # JSONL output from avap_chunker.py
|
||||||
│
|
│
|
||||||
├── research/ # Directory containing all research done alongside its documents, results and notebooks
|
|
||||||
└── src/ # Shared library (used by both Docker and scripts)
|
└── src/ # Shared library (used by both Docker and scripts)
|
||||||
├── config.py # Pydantic settings — reads all environment variables
|
├── config.py # Pydantic settings — reads all environment variables
|
||||||
└── utils/
|
└── utils/
|
||||||
|
|
@ -408,7 +396,7 @@ Returns the full answer as a single message with `is_final: true`. Suitable for
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
grpcurl -plaintext \
|
grpcurl -plaintext \
|
||||||
-d '{"query": "Que significa AVAP?", "session_id": "dev-001"}' \
|
-d '{"query": "What is addVar in AVAP?", "session_id": "dev-001"}' \
|
||||||
localhost:50052 \
|
localhost:50052 \
|
||||||
brunix.AssistanceEngine/AskAgent
|
brunix.AssistanceEngine/AskAgent
|
||||||
```
|
```
|
||||||
|
|
@ -416,7 +404,7 @@ grpcurl -plaintext \
|
||||||
Expected response:
|
Expected response:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"text": "AVAP (Advanced Virtual API Programming) es un DSL Turing Completo...",
|
"text": "addVar is an AVAP command used to declare a variable...",
|
||||||
"avap_code": "AVAP-2026",
|
"avap_code": "AVAP-2026",
|
||||||
"is_final": true
|
"is_final": true
|
||||||
}
|
}
|
||||||
|
|
@ -505,33 +493,17 @@ This enables integration with any tool that supports the OpenAI or Ollama API (c
|
||||||
| `POST` | `/v1/completions` | Legacy text completion — streaming and non-streaming |
|
| `POST` | `/v1/completions` | Legacy text completion — streaming and non-streaming |
|
||||||
| `GET` | `/health` | Health check — returns gRPC target and status |
|
| `GET` | `/health` | Health check — returns gRPC target and status |
|
||||||
|
|
||||||
**Non-streaming chat — general query:**
|
**Non-streaming chat:**
|
||||||
```bash
|
```bash
|
||||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
curl http://localhost:8000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "brunix",
|
"model": "brunix",
|
||||||
"messages": [{"role": "user", "content": "Que significa AVAP?"}],
|
"messages": [{"role": "user", "content": "What is AVAP?"}],
|
||||||
"stream": false,
|
"stream": false
|
||||||
"session_id": "dev-001"
|
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
**Non-streaming chat — with editor context (VS Code extension):**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"model": "brunix",
|
|
||||||
"messages": [{"role": "user", "content": "que hace este codigo?"}],
|
|
||||||
"stream": false,
|
|
||||||
"session_id": "dev-001",
|
|
||||||
"user": "{\"editor_content\":\"\",\"selected_text\":\"<base64>\",\"extra_context\":\"\",\"user_info\":{\"dev_id\":1,\"project_id\":2,\"org_id\":3}}"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
> **Editor context transport:** The `user` field carries editor context as a JSON string. `editor_content`, `selected_text`, and `extra_context` must be Base64-encoded. `user_info` is a JSON object with `dev_id`, `project_id`, and `org_id`. The engine only injects editor context into the response when the classifier detects the user is explicitly referring to their code. See [`docs/API_REFERENCE.md`](./docs/API_REFERENCE.md#6-openai-compatible-proxy) for full details.
|
|
||||||
|
|
||||||
**Streaming chat (SSE):**
|
**Streaming chat (SSE):**
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:8000/v1/chat/completions \
|
curl http://localhost:8000/v1/chat/completions \
|
||||||
|
|
@ -663,9 +635,7 @@ For the full set of contribution standards, see [CONTRIBUTING.md](./CONTRIBUTING
|
||||||
| [docs/API_REFERENCE.md](./docs/API_REFERENCE.md) | Complete gRPC API contract, message types, client examples |
|
| [docs/API_REFERENCE.md](./docs/API_REFERENCE.md) | Complete gRPC API contract, message types, client examples |
|
||||||
| [docs/RUNBOOK.md](./docs/RUNBOOK.md) | Operational playbooks, health checks, incident response |
|
| [docs/RUNBOOK.md](./docs/RUNBOOK.md) | Operational playbooks, health checks, incident response |
|
||||||
| [docs/AVAP_CHUNKER_CONFIG.md](./docs/AVAP_CHUNKER_CONFIG.md) | `avap_config.json` reference — blocks, statements, semantic tags, how to extend |
|
| [docs/AVAP_CHUNKER_CONFIG.md](./docs/AVAP_CHUNKER_CONFIG.md) | `avap_config.json` reference — blocks, statements, semantic tags, how to extend |
|
||||||
| [docs/ADR/](./docs/ADR/) | Architecture Decision Records |
|
| [docs/adr/](./docs/adr/) | Architecture Decision Records |
|
||||||
| [docs/product/](./docs/product/) | Product Requirements Documents |
|
|
||||||
| [research/](./research/) | Experiment results, benchmarks, and datasets |
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
74
changelog
74
changelog
|
|
@ -2,80 +2,6 @@
|
||||||
|
|
||||||
All notable changes to the **Brunix Assistance Engine** will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
All notable changes to the **Brunix Assistance Engine** will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
---
|
|
||||||
## [1.6.2] - 2026-03-26
|
|
||||||
### Changed
|
|
||||||
- RESEARCH: updated `embeddings/Embedding model selection.pdf`.
|
|
||||||
|
|
||||||
|
|
||||||
## [1.6.1] - 2026-03-20
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- FEATURE (PRD-0002): Extended `AgentRequest` in `brunix.proto` with four optional fields: `editor_content` (field 3), `selected_text` (field 4), `extra_context` (field 5), `user_info` (field 6) — enabling the VS Code extension to send active file content, selected code, free-form context, and client identity metadata alongside every query. Fields 3–5 are Base64-encoded; field 6 is a JSON string.
|
|
||||||
- FEATURE (PRD-0002): Extended `AgentState` with `editor_content`, `selected_text`, `extra_context`, `user_info`, and `use_editor_context` fields.
|
|
||||||
- FEATURE (PRD-0002): Extended classifier (`CLASSIFY_PROMPT_TEMPLATE`) to output two tokens — query type and editor signal (`EDITOR` / `NO_EDITOR`). `use_editor_context` flag set in state based on classifier output.
|
|
||||||
- FEATURE (PRD-0002): Editor context injected into generation prompt only when `use_editor_context=True` — prevents the model from referencing editor code when the question is unrelated.
|
|
||||||
- FEATURE (PRD-0002): `openai_proxy.py` — parses the standard OpenAI `user` field as a JSON string to extract `editor_content`, `selected_text`, `extra_context`, and `user_info`. Non-Brunix clients that send `user` as a plain string or omit it are handled gracefully with no error.
|
|
||||||
- FEATURE (PRD-0002): `server.py` — Base64 decoding of `editor_content`, `selected_text`, and `extra_context` on request arrival. Malformed Base64 is silently treated as empty string.
|
|
||||||
- TESTS: Added `Docker/tests/test_prd_0002.py` — 40 unit tests covering `_parse_query_type`, `_decode_b64`, `_parse_editor_context`, `_build_reformulate_query`, editor context injection logic, and all valid classifier output combinations. Runs without external dependencies (no Elasticsearch, no Ollama, no gRPC server required).
|
|
||||||
- DOCS: Added `docs/product/PRD-0001-openai-compatible-proxy.md` — product requirements document for the OpenAI-compatible HTTP proxy.
|
|
||||||
- DOCS: Added `docs/product/PRD-0002-editor-context-injection.md` — product requirements document for editor context injection (updated to Implemented status with full technical design).
|
|
||||||
- DOCS: Added `docs/ADR/ADR-0005-embedding-model-selection.md` — comparative evaluation of BGE-M3 vs Qwen3-Embedding-0.6B. Status: Under Evaluation.
|
|
||||||
- DOCS: Added `LICENSE` — proprietary license, 101OBEX Corp, Delaware.
|
|
||||||
- DOCS: Added `research/` directory structure for MrHouston experiment results and benchmarks.
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- FEATURE (PRD-0002): `session_id` in `openai_proxy.py` is now read exclusively from the dedicated `session_id` field — no longer falls back to the `user` field. Breaking change for any client that was using `user` as a `session_id` fallback.
|
|
||||||
- ENGINE: `CLASSIFY_PROMPT_TEMPLATE` extended with `<editor_rule>` and updated `<output_rule>` for two-token output format.
|
|
||||||
- ENGINE: `REFORMULATE_PROMPT` extended with `<mode_rule>` and `<language_rule>` — the reformulator now receives `[MODE: X]` prepended to the query and applies command expansion only in `CODE_GENERATION` mode.
|
|
||||||
- ENGINE: `GENERATE_PROMPT` — added "Answer in the same language the user used" to `<output_format>`. Fixes responses defaulting to English for Spanish queries.
|
|
||||||
- ENGINE: `hybrid_search_native` in `graph.py` — BM25 query now uses a `bool` query with `should` boost for `doc_type: spec` and `doc_type: narrative` chunks, improving retrieval of definitional and explanatory content over raw code examples.
|
|
||||||
- DOCS: Updated `docs/API_REFERENCE.md` — full `AgentRequest` table with all 6 fields, Base64 encoding notes, editor context behaviour section, and updated proxy examples.
|
|
||||||
- DOCS: Updated `docs/ARCHITECTURE.md` — new §6 Editor Context Pipeline, updated §4 LangGraph Workflow with two-token classifier, §4.6 reformulator mode-aware and language-preserving, updated component inventory and request lifecycle diagrams.
|
|
||||||
- DOCS: Updated `README.md` — project structure with `Docker/tests/`, `docs/product/`, `docs/ADR/ADR-0005`, `research/`, `LICENSE`. HTTP proxy section updated with editor context curl examples. Documentation index updated.
|
|
||||||
- DOCS: Updated `CONTRIBUTING.md` — added Section 10 (PRDs), Section 11 (Research & Experiments Policy), updated PR checklist, ADR table with ADR-0005.
|
|
||||||
- DOCS: Updated `docs/AVAP_CHUNKER_CONFIG.md` to v2.0 — five new commands (else, end, endLoop, exception, return), naming fix (AddvariableToJSON), nine dual assignment patterns, four new semantic tags.
|
|
||||||
- GOVERNANCE: Updated `.github/CODEOWNERS` — added `@BRUNIX-AI/engineering` and `@BRUNIX-AI/research` teams, explicit rules for proto, golden dataset, grammar config, ADRs and PRDs.
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- ENGINE: Fixed retrieval returning wrong chunks for Spanish definition queries — reformulator was translating Spanish queries to English, breaking BM25 lexical matching against Spanish LRM chunks. Root cause: missing language preservation rule in `REFORMULATE_PROMPT`.
|
|
||||||
- ENGINE: Fixed reformulator applying CODE_GENERATION command expansion to RETRIEVAL queries — caused "Que significa AVAP?" to reformulate as "AVAP registerEndpoint addResult _status". Root cause: reformulator had no awareness of query type. Fix: `[MODE: X]` prefix + mode-aware rules.
|
|
||||||
- ENGINE: Fixed responses defaulting to English regardless of query language. Root cause: `GENERATE_PROMPT` had no language instruction (unlike `CODE_GENERATION_PROMPT` which already had it).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## [1.6.0] - 2026-03-18
|
|
||||||
|
|
||||||
### Added
|
|
||||||
- ENGINE: Added `AskAgentStream` RPC — real token-by-token streaming directly from Ollama. Two-phase design: classify + reformulate + retrieve runs first via `build_prepare_graph`, then `llm.stream()` forwards tokens to the client as they arrive.
|
|
||||||
- ENGINE: Added `EvaluateRAG` RPC — RAGAS evaluation pipeline with Claude as judge. Runs faithfulness, answer_relevancy, context_recall and context_precision against a golden dataset and returns a global score with verdict (EXCELLENT / ACCEPTABLE / INSUFFICIENT).
|
|
||||||
- ENGINE: Added `openai_proxy.py` — OpenAI and Ollama compatible HTTP API running on port 8000. Routes `stream: false` to `AskAgent` and `stream: true` to `AskAgentStream`. Endpoints: `POST /v1/chat/completions`, `POST /v1/completions`, `GET /v1/models`, `POST /api/chat`, `POST /api/generate`, `GET /api/tags`, `GET /health`.
|
|
||||||
- ENGINE: Added `entrypoint.sh` — starts gRPC server and HTTP proxy as parallel processes with mutual watchdog. If either crashes, the container stops cleanly.
|
|
||||||
- ENGINE: Added session memory — `session_store` dict indexed by `session_id` accumulates full conversation history per session. Each request loads and persists history.
|
|
||||||
- ENGINE: Added query intent classifier — LangGraph node that classifies every query as `RETRIEVAL`, `CODE_GENERATION` or `CONVERSATIONAL` and routes to the appropriate subgraph.
|
|
||||||
- ENGINE: Added hybrid retrieval — replaced `ElasticsearchStore` (LangChain abstraction) with native Elasticsearch client. Each query runs BM25 `multi_match` and kNN in parallel, fused with Reciprocal Rank Fusion (k=60). Returns top-8 documents.
|
|
||||||
- ENGINE: Added `evaluate.py` — full RAGAS evaluation pipeline using the same hybrid retrieval as production, Claude as external judge, and the golden dataset in `Docker/src/golden_dataset.json`.
|
|
||||||
- PROTO: Added `AskAgentStream` and `EvaluateRAG` RPCs to `brunix.proto` with their message types (`EvalRequest`, `EvalResponse`, `QuestionDetail`).
|
|
||||||
- DOCS: Added `docs/ADR/ADR-0001-grpc-primary-interface.md`.
|
|
||||||
- DOCS: Added `docs/ADR/ADR-0002-two-phase-streaming.md`.
|
|
||||||
- DOCS: Added `docs/ADR/ADR-0003-hybrid-retrieval-rrf.md`.
|
|
||||||
- DOCS: Added `docs/ADR/ADR-0004-claude-eval-judge.md`.
|
|
||||||
- DOCS: Added `docs/samples/` — 30 representative `.avap` code samples covering all AVAP constructs.
|
|
||||||
|
|
||||||
### Changed
|
|
||||||
- ENGINE: Replaced `ElasticsearchStore` with native Elasticsearch client — fixes silent kNN failure caused by schema incompatibility between the Chonkie ingestion pipeline and the LangChain-managed index schema.
|
|
||||||
- ENGINE: Replaced single `GENERATE_PROMPT` with five specialised prompts — `CLASSIFY_PROMPT`, `REFORMULATE_PROMPT`, `GENERATE_PROMPT`, `CODE_GENERATION_PROMPT`, `CONVERSATIONAL_PROMPT` — each optimised for its routing path.
|
|
||||||
- ENGINE: Extended `REFORMULATE_PROMPT` with explicit AVAP command mapping — intent-to-command expansion for API, database, HTTP, loop and error handling query types.
|
|
||||||
- ENGINE: Extended `AgentState` with `query_type` and `session_id` fields required for conditional routing and session persistence.
|
|
||||||
- ENGINE: Fixed `session_id` ignored — `graph.invoke` now passes `session_id` into the graph state.
|
|
||||||
- ENGINE: Fixed double `is_final: True` — `AskAgent` previously emitted two closing messages. Now emits exactly one.
|
|
||||||
- ENGINE: Fixed embedding endpoint mismatch — server now uses the same `/api/embed` endpoint and payload format as both ingestion pipelines, ensuring vectors are comparable at query time.
|
|
||||||
- DEPENDENCIES: `requirements.txt` updated — added `ragas`, `datasets`, `langchain-anthropic`, `fastapi`, `uvicorn`.
|
|
||||||
|
|
||||||
### Fixed
|
|
||||||
- ENGINE: Fixed retrieval returning zero results — `ElasticsearchStore` assumed a LangChain-managed schema incompatible with the Chonkie-generated index. Replaced with native ES client querying actual field names.
|
|
||||||
- ENGINE: Fixed context always empty — consequence of the retrieval bug above. The generation prompt received an empty `{context}` on every request and always returned the fallback string.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## [1.5.1] - 2026-03-18
|
## [1.5.1] - 2026-03-18
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
**Date:** 2026-02-09
|
**Date:** 2026-02-09
|
||||||
**Status:** Accepted
|
**Status:** Accepted
|
||||||
**Deciders:** Rafael Ruiz (CTO, AVAP Technology)
|
**Deciders:** Rafael Ruiz (CTO, AVAP Technology), MrHouston Engineering
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
**Date:** 2026-03-05
|
**Date:** 2026-03-05
|
||||||
**Status:** Accepted
|
**Status:** Accepted
|
||||||
**Deciders:** Rafael Ruiz (CTO)
|
**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
**Date:** 2026-03-05
|
**Date:** 2026-03-05
|
||||||
**Status:** Accepted
|
**Status:** Accepted
|
||||||
**Deciders:** Rafael Ruiz (CTO)
|
**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
**Date:** 2026-03-10
|
**Date:** 2026-03-10
|
||||||
**Status:** Accepted
|
**Status:** Accepted
|
||||||
**Deciders:** Rafael Ruiz (CTO)
|
**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,276 +0,0 @@
|
||||||
# ADR-0005: Embedding Model Selection — Comparative Evaluation of BGE-M3 vs Qwen3-Embedding-0.6B
|
|
||||||
|
|
||||||
**Date:** 2026-03-19
|
|
||||||
**Status:** Under Evaluation
|
|
||||||
**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Context
|
|
||||||
|
|
||||||
The AVAP RAG pipeline requires an embedding model capable of mapping a hybrid corpus into a vector space suitable for semantic retrieval. Understanding the exact composition of this corpus is a prerequisite for model selection.
|
|
||||||
|
|
||||||
### Corpus characterisation (empirically measured)
|
|
||||||
|
|
||||||
A chunk-level audit was performed on the full indexable corpus: the AVAP Language Reference Manual (`avap.md`) and 40 representative `.avap` code samples. Results (`test_chunks.jsonl`, 190 chunks):
|
|
||||||
|
|
||||||
| Metric | Value |
|
|
||||||
| -------------------- | ----------- |
|
|
||||||
| Total chunks | 190 |
|
|
||||||
| Total tokens indexed | 11,498 |
|
|
||||||
| Minimum chunk size | 1 token |
|
|
||||||
| Maximum chunk size | 833 tokens |
|
|
||||||
| Mean chunk size | 60.5 tokens |
|
|
||||||
| Median chunk size | 29 tokens |
|
|
||||||
| p90 | 117 tokens |
|
|
||||||
| p95 | 204 tokens |
|
|
||||||
| p99 | 511 tokens |
|
|
||||||
|
|
||||||
**Corpus composition by type:**
|
|
||||||
|
|
||||||
| Type | Count | Description |
|
|
||||||
| ------------------------- | ----- | ---------------------------------------- |
|
|
||||||
| Narrative (Spanish prose) | 79 | LRM explanations, concept descriptions |
|
|
||||||
| Code chunks | 83 | AVAP `.avap` sample files |
|
|
||||||
| BNF formal grammar | 9 | Formal language specification in English |
|
|
||||||
| Code examples | 14 | Inline examples within LRM |
|
|
||||||
| Function signatures | 2 | Extracted function headers |
|
|
||||||
|
|
||||||
**Linguistic composition:** 55% of chunks originate from the LRM (`avap.md`), written in Spanish with embedded English DSL identifiers. 45% are `.avap` code files containing English command names (`addVar`, `addResult`, `registerEndpoint`, `ormDirect`) with Spanish-language string literals and variable names (`"Hola"`, `datos_cliente`, `mi_json_final`, `contraseña`, `fecha`). 18.9% of chunks (36 out of 190) contain both Spanish content and English DSL commands within the same chunk — intra-chunk multilingual mixing.
|
|
||||||
|
|
||||||
Representative examples of intra-chunk multilingual mixing:
|
|
||||||
|
|
||||||
```
|
|
||||||
// Narrative chunk (Spanish prose + English DSL terms):
|
|
||||||
"AVAP (Advanced Virtual API Programming) es un DSL (Domain-Specific Language)
|
|
||||||
Turing Completo, diseñado para la orquestación segura de microservicios e I/O."
|
|
||||||
|
|
||||||
// Code chunk (English commands + Spanish identifiers and literals):
|
|
||||||
addParam("lang", l)
|
|
||||||
if(l, "es", "=")
|
|
||||||
addVar(msg, "Hola")
|
|
||||||
end()
|
|
||||||
addResult(msg)
|
|
||||||
|
|
||||||
// BNF chunk (formal English grammar):
|
|
||||||
<program> ::= ( <line> | <block_comment> )*
|
|
||||||
<statement> ::= <assignment> | <method_call_stmt> | <io_command> | ...
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why the initial model was eliminated
|
|
||||||
|
|
||||||
The initial model provided was **Qwen2.5-1.5B**. Empirical evaluation by MrHouston Engineering (full results in `research/embeddings/`) demonstrated it is unsuitable for dense retrieval. Qwen2.5-1.5B generates embeddings via the **Last Token** method: the final token of the sequence is assumed to encode all preceding context. For AVAP code chunks, the last token is always a syntactic closer — `end()`, `}`, `endLoop()` — with zero semantic content. The resulting embeddings are effectively identical across functionally distinct chunks.
|
|
||||||
|
|
||||||
Benchmark confirmation (BEIR evaluation, three datasets):
|
|
||||||
|
|
||||||
**CodeXGLUE** (code retrieval from GitHub repositories):
|
|
||||||
|
|
||||||
| k | Qwen2.5-1.5B NDCG | Qwen2.5-1.5B Recall | Qwen3-Emb-0.6B NDCG | Qwen3-Emb-0.6B Recall |
|
|
||||||
| -- | ----------------- | ------------------- | ------------------- | --------------------- |
|
|
||||||
| 1 | 0.00031 | 0.00031 | **0.9497** | **0.9497** |
|
|
||||||
| 5 | 0.00086 | 0.00151 | **0.9716** | **0.9876** |
|
|
||||||
| 10 | 0.00118 | 0.00250 | **0.9734** | **0.9929** |
|
|
||||||
|
|
||||||
**CoSQA** (natural language queries over code — closest proxy to AVAP retrieval):
|
|
||||||
|
|
||||||
| k | Qwen2.5-1.5B NDCG | Qwen2.5-1.5B Recall | Qwen3-Emb-0.6B NDCG | Qwen3-Emb-0.6B Recall |
|
|
||||||
| --- | ----------------- | ------------------- | ------------------- | --------------------- |
|
|
||||||
| 1 | 0.00000 | 0.00000 | **0.1740** | **0.1740** |
|
|
||||||
| 10 | 0.00000 | 0.00000 | **0.3909** | **0.6700** |
|
|
||||||
| 100 | 0.00210 | 0.01000 | **0.4510** | **0.9520** |
|
|
||||||
|
|
||||||
**SciFact** (scientific prose — out-of-domain control):
|
|
||||||
|
|
||||||
| k | Qwen2.5-1.5B NDCG | Qwen2.5-1.5B Recall | Qwen3-Emb-0.6B NDCG | Qwen3-Emb-0.6B Recall |
|
|
||||||
| --- | ----------------- | ------------------- | ------------------- | --------------------- |
|
|
||||||
| 1 | 0.02333 | 0.02083 | **0.5633** | **0.5299** |
|
|
||||||
| 10 | 0.04619 | 0.07417 | **0.6855** | **0.8161** |
|
|
||||||
| 100 | 0.07768 | 0.23144 | **0.7129** | **0.9400** |
|
|
||||||
|
|
||||||
Qwen2.5-1.5B is eliminated. **Qwen3-Embedding-0.6B is the validated baseline.**
|
|
||||||
|
|
||||||
### Why a comparative evaluation was required before adopting Qwen3
|
|
||||||
|
|
||||||
Qwen3-Embedding-0.6B's benchmark results were obtained on English-only datasets. They eliminated Qwen2.5-1.5B decisively but did not characterise Qwen3's behaviour on the multilingual mixed corpus that AVAP represents. A second candidate — **BGE-M3** — presented theoretical advantages for this specific corpus that could not be assessed without empirical comparison.
|
|
||||||
|
|
||||||
The index rebuild required to adopt any model is destructive and must be done once. Given that the embedding model directly determines the quality of all RAG retrieval in production, adopting a model without a direct comparison between the two viable candidates would not have met the due diligence required for a decision of this impact.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Decision
|
|
||||||
|
|
||||||
A **head-to-head comparative evaluation** of BGE-M3 and Qwen3-Embedding-0.6B is being conducted under identical conditions before either is adopted as the production embedding model.
|
|
||||||
|
|
||||||
The model that demonstrates superior performance under the evaluation criteria defined below is adopted. This ADR moves to Accepted upon completion of that evaluation, with the selected model documented as the outcome.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Candidate Analysis
|
|
||||||
|
|
||||||
### Qwen3-Embedding-0.6B
|
|
||||||
|
|
||||||
**Strengths:**
|
|
||||||
|
|
||||||
- Already benchmarked on CodeXGLUE, CoSQA and SciFact — strong results documented
|
|
||||||
- 32,768 token context window — exceeds corpus requirements with large margin
|
|
||||||
- Same model family as the generation model (Qwen) — shared tokenizer vocabulary
|
|
||||||
- Lowest integration risk — already validated in the pipeline
|
|
||||||
|
|
||||||
**Limitations:**
|
|
||||||
|
|
||||||
- Benchmarks are English-only — multilingual performance on AVAP corpus unvalidated
|
|
||||||
- Not a dedicated multilingual model — training distribution weighted towards English and Chinese
|
|
||||||
- No native sparse retrieval support
|
|
||||||
|
|
||||||
**Corpus fit assessment:** The maximum chunk in the AVAP corpus is 833 tokens — well within both candidates' limits. Qwen3's 32,768 token context window provides no practical advantage over BGE-M3's 8,192 tokens for this corpus. Context window is not a differentiating criterion.
|
|
||||||
|
|
||||||
### BGE-M3
|
|
||||||
|
|
||||||
**Strengths:**
|
|
||||||
|
|
||||||
- Explicit multilingual contrastive training across 100+ languages including programming languages — direct architectural fit for the intra-chunk Spanish/English/DSL mixing observed in the corpus
|
|
||||||
- Supports dense, sparse and multi-vector ColBERT retrieval from a single model inference — future path to consolidating the current BM25+kNN dual-system architecture (ADR-0003)
|
|
||||||
- Higher MTEB retrieval score than Qwen3-Embedding-0.6B in the programming domain
|
|
||||||
|
|
||||||
**Limitations:**
|
|
||||||
|
|
||||||
- Not yet benchmarked on CodeXGLUE, CoSQA or SciFact at the time of candidate selection — no prior empirical results for this corpus
|
|
||||||
- 8,192 token context window — sufficient for current corpus (max chunk: 833 tokens, 10.2% utilization) but lower headroom for future corpus growth
|
|
||||||
- Requires tokenizer alignment: `HF_EMB_MODEL_NAME` must be updated to `BAAI/bge-m3` alongside `OLLAMA_EMB_MODEL_NAME` to keep chunk token counting consistent
|
|
||||||
|
|
||||||
**Corpus fit assessment:** The intra-chunk multilingual mixing (18.9% of chunks) and the Spanish prose component (79 narrative chunks) are the corpus characteristics most likely to differentiate BGE-M3 from Qwen3. The BEIR and EvaluateRAG evaluations determine whether this theoretical advantage translates to measurable retrieval improvement.
|
|
||||||
|
|
||||||
### VRAM
|
|
||||||
|
|
||||||
Both candidates require approximately 1.13 GiB at FP16 (BGE-M3: 567M parameters; Qwen3: 596M parameters). Combined with a quantized generation model and KV cache, total VRAM remains within the 4 GiB hardware constraint for both. VRAM is not a selection criterion.
|
|
||||||
|
|
||||||
### Embedding dimension
|
|
||||||
|
|
||||||
Both candidates output 1024-dimensional vectors. The Elasticsearch index mapping (`int8_hnsw`, `dims: 1024`, cosine similarity) is identical for both candidates. No mapping changes are required between them.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Evaluation Protocol
|
|
||||||
|
|
||||||
Both models are evaluated under identical conditions. All results are documented in `research/embeddings/`.
|
|
||||||
|
|
||||||
**Step 1 — BEIR benchmarks**
|
|
||||||
|
|
||||||
CodeXGLUE, CoSQA and SciFact were run with **BGE-M3** using the same BEIR evaluation scripts and configuration used for Qwen3-Embedding-0.6B. Qwen3-Embedding-0.6B results already existed in `research/embeddings/` and served as the baseline. Reported metrics: NDCG@k, MAP@k, Recall@k and Precision@k at k = 1, 3, 5, 10, 100.
|
|
||||||
|
|
||||||
**Step 2 — EvaluateRAG on AVAP corpus**
|
|
||||||
|
|
||||||
The Elasticsearch index is rebuilt twice — once with each model — and `EvaluateRAG` is run against the production AVAP golden dataset for both. Reported RAGAS scores: faithfulness, answer_relevancy, context_recall, context_precision, and global score with verdict.
|
|
||||||
|
|
||||||
**Selection criterion**
|
|
||||||
|
|
||||||
EvaluateRAG is the primary decision signal. It directly measures retrieval quality on the actual AVAP production corpus — including its intra-chunk multilingual mixing (18.9% of chunks) and domain-specific DSL syntax — and is therefore more representative than any external benchmark. The model with the higher global EvaluateRAG score is adopted.
|
|
||||||
|
|
||||||
BEIR results are the secondary signal. The primary BEIR metric is NDCG@10. Among the three datasets, **CoSQA is the most representative proxy** for the AVAP retrieval use case — it pairs natural language queries with code snippets, mirroring the Spanish prose query / AVAP DSL code retrieval pattern. CoSQA results are weighted accordingly in the comparison.
|
|
||||||
|
|
||||||
All margin comparisons use **absolute percentage points** in NDCG@10 (e.g., 0.39 vs 0.41 is a 2 absolute percentage point difference, not a 5.1% relative difference).
|
|
||||||
|
|
||||||
**Tiebreaker**
|
|
||||||
|
|
||||||
If the EvaluateRAG global scores are within 5 absolute percentage points of each other, the BEIR results determine the outcome under the following conditions:
|
|
||||||
|
|
||||||
- BGE-M3 exceeds Qwen3-Embedding-0.6B by more than 2 absolute percentage points on mean NDCG@10 across all three BEIR datasets, AND
|
|
||||||
- BGE-M3 does not underperform Qwen3-Embedding-0.6B by more than 2 absolute percentage points on CoSQA NDCG@10 specifically.
|
|
||||||
|
|
||||||
If neither condition is met — that is, if EvaluateRAG scores are within 5 points and BGE-M3 does not clear both BEIR thresholds — Qwen3-Embedding-0.6B is adopted. It carries lower integration risk, its benchmarks are already documented, and it is the validated baseline for the system.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Rationale
|
|
||||||
|
|
||||||
### Step 1 results — BEIR head-to-head comparison
|
|
||||||
|
|
||||||
BGE-M3 benchmarks were completed on the same three BEIR datasets using identical evaluation scripts and configuration. Full results are stored in `research/embeddings/embedding_eval_results/emb_models_result.json`. The following tables compare both candidates side by side.
|
|
||||||
|
|
||||||
**CodeXGLUE** (code retrieval from GitHub repositories):
|
|
||||||
|
|
||||||
| Metric | k | BGE-M3 | Qwen3-Emb-0.6B | Delta (BGE-M3 − Qwen3) |
|
|
||||||
| ------ | --- | ---------------- | ---------------- | ----------------------- |
|
|
||||||
| NDCG | 1 | **0.9520** | 0.9497 | +0.23 pp |
|
|
||||||
| NDCG | 5 | **0.9738** | 0.9717 | +0.21 pp |
|
|
||||||
| NDCG | 10 | **0.9749** | 0.9734 | +0.15 pp |
|
|
||||||
| NDCG | 100 | **0.9763** | 0.9745 | +0.18 pp |
|
|
||||||
| Recall | 1 | **0.9520** | 0.9497 | +0.23 pp |
|
|
||||||
| Recall | 5 | **0.9892** | 0.9876 | +0.16 pp |
|
|
||||||
| Recall | 10 | 0.9928 | **0.9930** | −0.02 pp |
|
|
||||||
| Recall | 100 | **0.9989** | 0.9981 | +0.08 pp |
|
|
||||||
|
|
||||||
Both models perform near-identically on CodeXGLUE. All deltas are below 0.25 absolute percentage points. This dataset does not differentiate the candidates.
|
|
||||||
|
|
||||||
**CoSQA** (natural language queries over code — most representative proxy for AVAP retrieval):
|
|
||||||
|
|
||||||
| Metric | k | BGE-M3 | Qwen3-Emb-0.6B | Delta (BGE-M3 − Qwen3) |
|
|
||||||
| ------ | --- | ------ | ---------------- | ----------------------- |
|
|
||||||
| NDCG | 1 | 0.1160 | **0.1740** | −5.80 pp |
|
|
||||||
| NDCG | 5 | 0.2383 | **0.3351** | −9.68 pp |
|
|
||||||
| NDCG | 10 | 0.2878 | **0.3909** | −10.31 pp |
|
|
||||||
| NDCG | 100 | 0.3631 | **0.4510** | −8.79 pp |
|
|
||||||
| Recall | 1 | 0.1160 | **0.1740** | −5.80 pp |
|
|
||||||
| Recall | 5 | 0.3660 | **0.5020** | −13.60 pp |
|
|
||||||
| Recall | 10 | 0.5160 | **0.6700** | −15.40 pp |
|
|
||||||
| Recall | 100 | 0.8740 | **0.9520** | −7.80 pp |
|
|
||||||
|
|
||||||
Qwen3-Embedding-0.6B outperforms BGE-M3 on CoSQA by a wide margin at every k. The NDCG@10 gap is 10.31 absolute percentage points. CoSQA is the most representative proxy for the AVAP retrieval use case — it pairs natural language queries with code snippets — making this the most significant BEIR result.
|
|
||||||
|
|
||||||
**SciFact** (scientific prose — out-of-domain control):
|
|
||||||
|
|
||||||
| Metric | k | BGE-M3 | Qwen3-Emb-0.6B | Delta (BGE-M3 − Qwen3) |
|
|
||||||
| ------ | --- | ------ | ---------------- | ----------------------- |
|
|
||||||
| NDCG | 1 | 0.5100 | **0.5533** | −4.33 pp |
|
|
||||||
| NDCG | 5 | 0.6190 | **0.6593** | −4.03 pp |
|
|
||||||
| NDCG | 10 | 0.6431 | **0.6785** | −3.54 pp |
|
|
||||||
| NDCG | 100 | 0.6705 | **0.7056** | −3.51 pp |
|
|
||||||
| Recall | 1 | 0.4818 | **0.5243** | −4.25 pp |
|
|
||||||
| Recall | 5 | 0.7149 | **0.7587** | −4.38 pp |
|
|
||||||
| Recall | 10 | 0.7834 | **0.8144** | −3.10 pp |
|
|
||||||
| Recall | 100 | 0.9037 | **0.9367** | −3.30 pp |
|
|
||||||
|
|
||||||
Qwen3-Embedding-0.6B leads BGE-M3 on SciFact by 3–4 absolute percentage points across all metrics. The gap is consistent but narrower than on CoSQA. (NOTE(review): the Qwen3 SciFact figures in this table differ slightly from those in the earlier Qwen2.5 elimination section — e.g. NDCG@10 0.6785 here vs 0.6855 there — confirm against `research/embeddings/`.)
|
|
||||||
|
|
||||||
### BEIR summary — NDCG@10 comparison
|
|
||||||
|
|
||||||
| Dataset | BGE-M3 | Qwen3-Emb-0.6B | Delta | Leader |
|
|
||||||
| -------------- | ---------------- | ---------------- | ------------------- | ----------------- |
|
|
||||||
| CodeXGLUE | 0.9749 | 0.9734 | +0.15 pp | BGE-M3 (marginal) |
|
|
||||||
| CoSQA | 0.2878 | **0.3909** | −10.31 pp | **Qwen3** |
|
|
||||||
| SciFact | 0.6431 | **0.6785** | −3.54 pp | **Qwen3** |
|
|
||||||
| **Mean** | **0.6353** | **0.6809** | **−4.56 pp** | **Qwen3** |
|
|
||||||
|
|
||||||
Qwen3-Embedding-0.6B leads on mean NDCG@10 by 4.56 absolute percentage points, driven primarily by a 10.31 pp advantage on CoSQA.
|
|
||||||
|
|
||||||
### Application of tiebreaker criteria to BEIR results
|
|
||||||
|
|
||||||
Per the evaluation protocol, if EvaluateRAG global scores are within 5 absolute percentage points, the BEIR tiebreaker applies. The tiebreaker requires BGE-M3 to meet **both** conditions:
|
|
||||||
|
|
||||||
1. **BGE-M3 must exceed Qwen3 by more than 2 pp on mean NDCG@10.** Result: BGE-M3 trails by 4.56 pp. **Condition not met.**
|
|
||||||
2. **BGE-M3 must not underperform Qwen3 by more than 2 pp on CoSQA NDCG@10.** Result: BGE-M3 trails by 10.31 pp. **Condition not met.**
|
|
||||||
|
|
||||||
Neither tiebreaker condition is satisfied. Under the defined protocol, if the EvaluateRAG evaluation results in a tie (within 5 pp), the BEIR tiebreaker defaults to Qwen3-Embedding-0.6B.
|
|
||||||
|
|
||||||
### Step 2 results — EvaluateRAG on AVAP corpus
|
|
||||||
|
|
||||||
At this moment, we are not in possession of the golden dataset, so we cannot proceed with Step 2.
|
|
||||||
|
|
||||||
_Pending. Results will be documented here upon completion of the EvaluateRAG evaluation for both models._
|
|
||||||
|
|
||||||
### Preliminary assessment
|
|
||||||
|
|
||||||
The BEIR benchmarks — the secondary decision signal — favour Qwen3-Embedding-0.6B across both the most representative dataset (CoSQA, −10.31 pp) and the out-of-domain control (SciFact, −3.54 pp), with CodeXGLUE effectively tied. BGE-M3's theoretical advantage from multilingual contrastive training does not translate to superior performance on these English-only benchmarks.
|
|
||||||
|
|
||||||
The EvaluateRAG evaluation — the primary decision signal — remains pending. It is the only evaluation that directly measures retrieval quality on the actual AVAP corpus with its intra-chunk multilingual mixing. BGE-M3's architectural fit for multilingual content could still produce a measurable advantage on the production corpus that the English-only BEIR benchmarks cannot capture. No final model selection will be made until EvaluateRAG results are available for both candidates.
|
|
||||||
|
|
||||||
We have found that Qwen3-Embedding is multilingual, with good scores on multilingual benchmarks. Its documentation says so, but the definitive answer will be provided by the scores of the evaluation on the AVAP corpus.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Consequences
|
|
||||||
|
|
||||||
- **Index rebuild required** regardless of which model is adopted. Vectors from Qwen2.5-1.5B are incompatible with either candidate. The existing index is deleted before re-ingestion.
|
|
||||||
- **Two index rebuilds required for the evaluation.** One per candidate for the EvaluateRAG step. Given the current corpus size (190 chunks, 11,498 tokens), rebuild time is not a meaningful constraint.
|
|
||||||
- **Tokenizer alignment for BGE-M3.** If BGE-M3 is selected, both `OLLAMA_EMB_MODEL_NAME` and `HF_EMB_MODEL_NAME` are updated. Updating only `OLLAMA_EMB_MODEL_NAME` causes the chunker to estimate token counts using the wrong vocabulary — a silent bug that produces inconsistent chunk sizes without raising any error.
|
|
||||||
- **Future model changes.** Any future replacement of the embedding model follows the same evaluation protocol — BEIR benchmarks on the same three datasets plus EvaluateRAG — before an ADR update is accepted. Results are documented in `research/embeddings/`.
|
|
||||||
|
|
@ -1,363 +0,0 @@
|
||||||
# ADR-0006: Reward Algorithm for Self-Improving Dataset Synthesis
|
|
||||||
|
|
||||||
**Date:** 2026-03-25
|
|
||||||
**Status:** Under Evaluation — Primary comparison: Candidate A vs Candidate E vs Candidate F
|
|
||||||
**Deciders:** Rafael Ruiz (CTO), MrHouston Engineering (AI Team)
|
|
||||||
**Research lead:** Ivar Zapata
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Context
|
|
||||||
|
|
||||||
The AVAP dataset synthesis pipeline (Track A) generates AVAP code examples using a large language model, filtered by a three-stage quality pipeline: parser validation (Stage 1), Execution Coverage Score (Stage 2), and semantic novelty (Stage 3). The current pipeline has two structural limitations that the reward mechanism must address.
|
|
||||||
|
|
||||||
### Limitation 1 — Static generation
|
|
||||||
|
|
||||||
Each batch is generated from the same static prompt (LRM + category description). The generator has no memory of what it has already produced and no model of what "good" looks like for the constructs it hasn't explored yet.
|
|
||||||
|
|
||||||
### Limitation 2 — Distribution bias (the fundamental problem)
|
|
||||||
|
|
||||||
The generator (Claude Sonnet) has its own internal distribution over what AVAP code "looks like", derived from its training on mainstream languages. It naturally gravitates toward the simplest patterns — linear code, basic conditionals, single-construct examples — because those are closest to what it knows. Any reward mechanism based on selecting the best from what the model spontaneously produces and feeding those back as few-shots **amplifies this bias**: the pool fills with what the model does easily, and the model never explores what it does poorly.
|
|
||||||
|
|
||||||
This is not model collapse in the classical sense (weights are not updated), but it is **cumulative distribution bias** — the effective generation distribution narrows toward the model's comfort zone with each iteration.
|
|
||||||
|
|
||||||
### The correct framing
|
|
||||||
|
|
||||||
The solution is not to reward what the model produces spontaneously. It is to **specify externally what must be produced** and evaluate quality relative to that specification. Coverage of the DSL's grammar space must be guaranteed by construction, not hoped for through probabilistic exploration.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Decision
|
|
||||||
|
|
||||||
**Conduct a primary comparative evaluation of Candidate A (CW-Reward, reward-driven pool), Candidate E (MAP-Elites, externally-specified coverage cells), and Candidate F (MAP-Elites with ConstructPrior transfer from real production code)** before selecting the production algorithm. Candidates B, C, D are secondary alternatives evaluated only if none of A, E, or F meets quality thresholds.
|
|
||||||
|
|
||||||
The fundamental research question has two layers:
|
|
||||||
1. **Does forced external specification of construct combinations produce a less biased, higher-quality dataset than reward-driven spontaneous exploration?** (A vs E)
|
|
||||||
2. **Does seeding cell selection with real production code co-occurrence distributions further improve coverage quality and downstream RAG performance over blind MAP-Elites?** (E vs F)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Candidate Analysis
|
|
||||||
|
|
||||||
### Candidate A — CW-Reward (Composite Weighted Reward)
|
|
||||||
|
|
||||||
**Algorithm class:** In-context reward — no parameter updates.
|
|
||||||
|
|
||||||
**Mechanism:** A composite reward is computed for each parser-valid example:
|
|
||||||
|
|
||||||
```
|
|
||||||
reward(e) = w_ecs · ECS(e) + w_novelty · Jaccard_novelty(e, Pool) + w_tests · test_quality(e)
|
|
||||||
```
|
|
||||||
|
|
||||||
High-reward examples enter a GoldPool (top-K). The pool is injected as few-shot context in subsequent generation calls. Coverage summary steers the prompt toward underrepresented constructs.
|
|
||||||
|
|
||||||
**Known bias risk:** The pool amplifies the model's natural generation distribution. Examples that are easy for the model (simple patterns, single constructs) tend to enter the pool first and persist. The Jaccard novelty metric penalises structural similarity but cannot detect semantic simplicity — two examples with different node type sets can both be trivially shallow.
|
|
||||||
|
|
||||||
**Appropriate when:** The base LLM has strong prior knowledge of the target language (mainstream languages). For AVAP, where the model has zero prior knowledge, the bias risk is materially higher.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Candidate E — MAP-Elites with Externally-Defined Coverage Cells (Proposed Primary)
|
|
||||||
|
|
||||||
**Algorithm class:** Quality-Diversity algorithm — no parameter updates, coverage guaranteed by construction.
|
|
||||||
|
|
||||||
**Core insight:** Instead of rewarding the best examples from spontaneous generation, define the coverage space externally from the grammar and direct the generator to fill specific cells. The model's distribution bias is neutralised because it is never asked to "explore freely" — it is always given a precise specification.
|
|
||||||
|
|
||||||
**Coverage space definition:**
|
|
||||||
|
|
||||||
The behavior space is defined over **pairs and trios of AVAP node types** drawn from the full grammar vocabulary. Each cell represents a construct combination that must be represented in the dataset:
|
|
||||||
|
|
||||||
```
|
|
||||||
Cell key = frozenset of 2 or 3 AVAP node types
|
|
||||||
Cell value = (best_example_so_far, quality_score)
|
|
||||||
|
|
||||||
Example cells:
|
|
||||||
{"startLoop", "ormAccessSelect"} → best example using both
|
|
||||||
{"try", "go", "RequestPost"} → best example using all three
|
|
||||||
{"function", "if_mode2", "encodeSHA256"} → best example using all three
|
|
||||||
```
|
|
||||||
|
|
||||||
**Space size:**
|
|
||||||
- Pairs: C(38, 2) = 703 cells
|
|
||||||
- Trios: C(38, 3) = 8,436 cells
|
|
||||||
- Total: 9,139 cells
|
|
||||||
|
|
||||||
With 5,000 examples targeted, average coverage is ~0.55 examples per cell — statistical coverage of pairwise and triadic construct combinations is achievable with a focused cell-selection strategy. Full coverage of high-prior cells is expected within budget; tail cells are addressed in Phase 3.
|
|
||||||
|
|
||||||
**Generation protocol:**
|
|
||||||
|
|
||||||
```
|
|
||||||
1. SELECT target cell:
|
|
||||||
- Empty cells first (exploration phase)
|
|
||||||
- Then lowest-quality cells (exploitation phase)
|
|
||||||
- Interleave: every 10 calls, select a cell adjacent to a
|
|
||||||
recently improved cell (local neighborhood search)
|
|
||||||
|
|
||||||
2. SPECIFY in the prompt:
|
|
||||||
"Generate an AVAP example that MUST use ALL of these constructs:
|
|
||||||
{cell_constructs}. Use additional constructs where natural."
|
|
||||||
|
|
||||||
3. VALIDATE:
|
|
||||||
a. Parser: syntactically valid? (Stage 1)
|
|
||||||
b. Construct presence: all cell constructs in AST? (cell gate)
|
|
||||||
c. If both pass → compute cell quality score
|
|
||||||
|
|
||||||
4. UPDATE cell:
|
|
||||||
If quality > current cell quality → replace cell entry
|
|
||||||
```
|
|
||||||
|
|
||||||
**Cell quality score:**
|
|
||||||
|
|
||||||
```
|
|
||||||
cell_quality(e, cell) =
|
|
||||||
construct_fidelity(e, cell) # fraction of cell constructs actually present
|
|
||||||
+ α · bonus_constructs(e, cell) # extra constructs beyond cell specification
|
|
||||||
+ β · test_quality(e) # quality of test assertions
|
|
||||||
+ γ · code_length_norm(e) # normalised code length (longer = richer)
|
|
||||||
```
|
|
||||||
|
|
||||||
`construct_fidelity` is the primary gate: an example that does not contain all cell constructs scores 0 regardless of other criteria.
|
|
||||||
|
|
||||||
**Why this eliminates distribution bias:**
|
|
||||||
|
|
||||||
The model is never asked what it "wants" to generate. It receives a precise specification: "you must use these three constructs." If it produces something that satisfies the specification, it enters the map. If not, it is discarded and the cell remains available for the next attempt. The coverage trajectory is determined by the cell selection strategy, not by the model's natural distribution.
|
|
||||||
|
|
||||||
The only residual bias is the model's ability to satisfy arbitrary construct specifications — some cells may be harder to fill than others. This is empirically measurable (fill rate per cell) and is itself a research finding about the generator's capabilities.
|
|
||||||
|
|
||||||
**Appropriate when:** The target language is novel or partially unknown to the generator. The external specification mechanism compensates for the model's lack of prior knowledge.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Candidate F — MAP-Elites with ConstructPrior Transfer (Proposed Disruptive Extension)
|
|
||||||
|
|
||||||
**Algorithm class:** Quality-Diversity algorithm with informed cell selection — no parameter updates, coverage guaranteed by construction.
|
|
||||||
|
|
||||||
**Core insight:** Candidate E specifies *which* constructs must appear but treats all cells as equally valuable. Real production code does not use constructs uniformly: some combinations (e.g., `ormAccessSelect` + `try`) appear in virtually every real API endpoint; others (e.g., `encodeSHA256` + `startLoop`) appear rarely. A golden dataset that mirrors production code distributions will retrieve more relevant examples for real developer queries. The ConstructPrior module transfers this knowledge from large public codebases to weight MAP-Elites cell selection.
|
|
||||||
|
|
||||||
**ConstructPrior design:**
|
|
||||||
|
|
||||||
```
|
|
||||||
ConstructPrior = weighted combination of 4 domain sources:
|
|
||||||
|
|
||||||
Source 1 — The Stack (BigCode, 50% weight)
|
|
||||||
Filter: paths matching /api/, /routes/, /handlers/, /endpoints/
|
|
||||||
Languages: Python, Go, JavaScript/TypeScript, Java
|
|
||||||
Process: extract function-level code blocks → map language constructs
|
|
||||||
to AVAP semantic equivalents → compute co-occurrence frequency
|
|
||||||
per (construct_a, construct_b) and (construct_a, construct_b, construct_c)
|
|
||||||
Rationale: real microservice API code; largest and most representative source
|
|
||||||
|
|
||||||
Source 2 — CodeSearchNet (30% weight)
|
|
||||||
Filter: semantic search for "api endpoint", "http handler", "database query"
|
|
||||||
Languages: Python, Go, Java, JavaScript
|
|
||||||
Process: same mapping pipeline as Source 1
|
|
||||||
Rationale: function-docstring pairs provide semantic context for mapping quality
|
|
||||||
|
|
||||||
Source 3 — HumanEval-X Go (10% weight)
|
|
||||||
Filter: problems using goroutines, channels, wait groups
|
|
||||||
Process: map Go concurrency primitives → AVAP {go, gather, startLoop}
|
|
||||||
Rationale: AVAP's concurrency model mirrors Go's; coverage of concurrent patterns
|
|
||||||
|
|
||||||
Source 4 — Spider SQL Dataset (10% weight)
|
|
||||||
Filter: multi-table joins, aggregations, nested queries
|
|
||||||
Process: map SQL operations → AVAP {ormAccessSelect, ormAccessInsert, ormAccessUpdate}
|
|
||||||
Rationale: AVAP ORM constructs semantically equivalent to SQL clauses
|
|
||||||
```
|
|
||||||
|
|
||||||
**Construct mapping table (AVAP ← source constructs):**
|
|
||||||
|
|
||||||
| AVAP construct | Python equivalent | Go equivalent | SQL equivalent |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `ormAccessSelect` | `cursor.fetchall()`, `session.query()` | `db.Query()`, `rows.Scan()` | `SELECT` |
|
|
||||||
| `ormAccessInsert` | `session.add()`, `cursor.execute(INSERT)` | `db.Exec(INSERT)` | `INSERT INTO` |
|
|
||||||
| `ormAccessUpdate` | `session.merge()`, `cursor.execute(UPDATE)` | `db.Exec(UPDATE)` | `UPDATE` |
|
|
||||||
| `RequestGet` | `requests.get()`, `httpx.get()` | `http.Get()`, `client.Get()` | — |
|
|
||||||
| `RequestPost` | `requests.post()`, `httpx.post()` | `http.Post()`, `client.Post()` | — |
|
|
||||||
| `startLoop` | `for item in list:` | `for _, v := range` | `CURSOR LOOP` |
|
|
||||||
| `go` + `gather` | `asyncio.gather()`, `ThreadPoolExecutor` | `go func()`, `sync.WaitGroup` | — |
|
|
||||||
| `try` + `exception` | `try: except:` | `if err != nil` | — |
|
|
||||||
| `encodeSHA256` | `hashlib.sha256()` | `sha256.New()` | — |
|
|
||||||
| `function` | `def func():` | `func name()` | `CREATE FUNCTION` |
|
|
||||||
|
|
||||||
**Cell weighting formula:**
|
|
||||||
|
|
||||||
```
|
|
||||||
cell_prior_weight(cell) =
|
|
||||||
Σ_{s ∈ Sources} weight_s · freq_s(cell_constructs)
|
|
||||||
|
|
||||||
where freq_s(cell) = co-occurrence frequency of the construct set in source s,
|
|
||||||
normalized to [0, 1] within each source.
|
|
||||||
|
|
||||||
Cells with prior_weight = 0 (no source coverage) receive a minimum weight ε = 0.05
|
|
||||||
to ensure all cells remain reachable.
|
|
||||||
```
|
|
||||||
|
|
||||||
**Modified cell selection with ConstructPrior:**
|
|
||||||
|
|
||||||
```
|
|
||||||
PHASE 1 (exploration):
|
|
||||||
Select empty cells, weighted by cell_prior_weight.
|
|
||||||
High-prior cells filled first — these are patterns real developers use.
|
|
||||||
|
|
||||||
PHASE 2 (exploitation):
|
|
||||||
Select lowest-quality filled cells, UCB-weighted,
|
|
||||||
also weighted by cell_prior_weight.
|
|
||||||
High-prior, low-quality cells prioritized — production-relevant cells are improved first.
|
|
||||||
|
|
||||||
PHASE 3 (tail coverage):
|
|
||||||
Cells with prior_weight = ε are visited last, after all
|
|
||||||
production-relevant cells reach quality > 0.7.
|
|
||||||
Ensures complete mathematical coverage without wasting
|
|
||||||
early generation budget on rare combinations.
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why this is disruptive:**
|
|
||||||
|
|
||||||
1. **First formal connection between DSL dataset synthesis and production code distributions.** Prior dataset synthesis work (MBPP, HumanEval, APPS) uses human-authored problems or scrapes competitive programming sites. For novel DSLs with no prior human authors, this approach provides the first principled method to bootstrap coverage from semantically equivalent languages.
|
|
||||||
|
|
||||||
2. **Eliminates the uniform sampling assumption.** Standard Quality-Diversity algorithms treat all niches as equally valuable. The ConstructPrior breaks this assumption: cells that correspond to real production patterns are assigned higher value, producing a dataset whose distribution mirrors real developer usage rather than mathematical combinatorial completeness.
|
|
||||||
|
|
||||||
3. **Zero human annotation required.** The prior is derived automatically from public datasets under permissive licenses (The Stack: Apache 2.0; CodeSearchNet: MIT; HumanEval-X: MIT; Spider: CC BY-SA 4.0).
|
|
||||||
|
|
||||||
4. **Residual bias is semantic, not structural.** Candidate E's residual bias is the model's ability to satisfy arbitrary construct specifications (some cells may be hard to fill). Candidate F's residual bias is the construct mapping quality (how faithfully Python/Go/SQL constructs map to AVAP equivalents). The latter is measurable, improvable, and fully transparent.
|
|
||||||
|
|
||||||
**Expected improvement over Candidate E:**
|
|
||||||
|
|
||||||
- RAGAS Composite: +0.03–0.08 (hypothesis: production-weighted cells retrieve more relevant examples for real queries)
|
|
||||||
- Distribution entropy: similar or slightly lower than E (intentionally non-uniform — mirrors production distribution)
|
|
||||||
- Downstream task success: +5–15% on held-out real developer queries (hypothesis: high-prior cells produce examples that match actual query patterns)
|
|
||||||
|
|
||||||
**Appropriate when:** Target DSL has identifiable semantic equivalents in mainstream languages, and a production-weighted dataset is preferred over a mathematically uniform one.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Out of Scope — Fine-tuning Approaches (GRPO, DPO)
|
|
||||||
|
|
||||||
Gradient-based approaches (GRPO, DPO) address a **different problem**: fine-tuning the inference model after the dataset is built. This ADR concerns dataset synthesis algorithm design. Fine-tuning the inference model is a separate architectural decision, tracked separately, and is not evaluated here.
|
|
||||||
|
|
||||||
Per-iteration fine-tuning of the generator (training the generator on its own outputs between batches) is explicitly rejected as a design choice. Iteratively training a model on its own outputs produces cumulative distribution narrowing. The generator (Claude API) and any future inference model must be trained on separate, independently validated datasets.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Candidate D — UCB Bandit over Coverage Regions
|
|
||||||
|
|
||||||
**Algorithm class:** Multi-armed bandit.
|
|
||||||
|
|
||||||
Coverage regions are arms. UCB selects which region to target via exploration-exploitation tradeoff. Theoretically well-understood convergence guarantees but does not provide construct-level specification — it targets regions, not specific combinations. Less precise than Candidate E.
|
|
||||||
|
|
||||||
**Superseded by Candidate E** for the same computational cost with stronger guarantees.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Comparative Summary
|
|
||||||
|
|
||||||
| Property | A: CW-Reward | E: MAP-Elites | F: MAP-Elites+Prior |
|
|
||||||
|---|---|---|---|
|
|
||||||
| Distribution bias risk | **High** | **None** | **None** |
|
|
||||||
| Coverage guarantee | Probabilistic | **By construction** | **By construction** |
|
|
||||||
| Production code alignment | None | None | **Yes (weighted)** |
|
|
||||||
| LLM parameter updates | No | No | No |
|
|
||||||
| GPU requirement | None | None | None |
|
|
||||||
| Works with API-only LLM | Yes | Yes | Yes |
|
|
||||||
| Interpretability | High | **Very high** | **Very high** |
|
|
||||||
| Implementation complexity | Low | Medium | **Medium-High** |
|
|
||||||
| Convergence guarantee | No | **Yes (fill rate)** | **Yes (fill rate)** |
|
|
||||||
| Residual bias | Model distribution | Cell fill difficulty | Mapping quality |
|
|
||||||
| External data required | No | No | Yes (public, free) |
|
|
||||||
| Novel contribution | Low | Medium | **High** |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Evaluation Protocol
|
|
||||||
|
|
||||||
### Phase 1 — Candidate A vs Candidate E vs Candidate F
|
|
||||||
|
|
||||||
Run all three candidates for 500 generated examples each, same LRM, same parser, same Stage 1 filter. Fixed random seed for reproducibility.
|
|
||||||
|
|
||||||
**Primary metrics:**
|
|
||||||
|
|
||||||
| Metric | Definition | Expected winner |
|
|
||||||
|---|---|---|
|
|
||||||
| Cell fill rate | Fraction of 9,139 cells with ≥1 example (E/F only) | E≈F by construction |
|
|
||||||
| Coverage breadth | Distinct node types covered / total | E≈F |
|
|
||||||
| Distribution uniformity | Entropy of node type frequency distribution | E (flatter = better) |
|
|
||||||
| Production alignment | KL divergence between dataset and ConstructPrior distribution | **F** (by design) |
|
|
||||||
| Mean cell quality | Average quality score across filled cells | TBD empirically |
|
|
||||||
| Parser pass rate trend | Pass rate across iterations | A (if few-shots help) |
|
|
||||||
| Downstream RAGAS | RAGAS Composite on 50 held-out AVAP queries | **Primary decision signal** |
|
|
||||||
|
|
||||||
**Distribution uniformity** is the key metric for bias detection (A vs E). Plot node type frequency as a histogram. Candidate A will show a long-tail distribution. Candidate E should show a near-uniform distribution. Candidate F will show a production-weighted distribution (intentionally non-uniform — this is a feature, not a bug).
|
|
||||||
|
|
||||||
**Production alignment** is the key metric for F vs E. A dataset with low KL divergence from ConstructPrior produces examples that match real developer usage patterns. If RAGAS(F) > RAGAS(E), this validates the transfer prior hypothesis.
|
|
||||||
|
|
||||||
**Selection criterion:**
|
|
||||||
- A vs E: Candidate E wins if entropy > 3.0 bits AND RAGAS(E) ≥ RAGAS(A).
|
|
||||||
- E vs F: Candidate F wins if RAGAS(F) > RAGAS(E) by margin ≥ 0.02.
|
|
||||||
- If F wins both comparisons, F is the production algorithm.
|
|
||||||
- Fallback: if RAGAS margin F vs E < 0.02, use E (simpler, no external data dependency).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Weight and Hyperparameter Grids
|
|
||||||
|
|
||||||
### Candidate A weight grid
|
|
||||||
|
|
||||||
| Config | w_ecs | w_novelty | w_tests | Hypothesis |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| A1 | 0.50 | 0.35 | 0.15 | Balanced (baseline) |
|
|
||||||
| A2 | 0.70 | 0.20 | 0.10 | Coverage-heavy |
|
|
||||||
| A3 | 0.30 | 0.60 | 0.10 | Novelty-heavy |
|
|
||||||
| A4 | 0.85 | 0.00 | 0.15 | No novelty (ablation) |
|
|
||||||
|
|
||||||
A4 is the critical ablation: does novelty weighting reduce distribution bias, or is ECS alone sufficient?
|
|
||||||
|
|
||||||
### Candidate E hyperparameter grid
|
|
||||||
|
|
||||||
| Config | Cell size | Selection strategy | α (bonus constructs) |
|
|
||||||
|---|---|---|---|
|
|
||||||
| E1 | Pairs only | Empty-first | 0.2 |
|
|
||||||
| E2 | Pairs + Trios | Empty-first | 0.2 |
|
|
||||||
| E3 | Pairs + Trios | UCB-weighted | 0.2 |
|
|
||||||
| E4 | Pairs + Trios | Empty-first | 0.5 |
|
|
||||||
|
|
||||||
E2 is the baseline. E3 tests whether UCB cell selection improves quality over simple empty-first ordering. E4 tests whether a higher bonus for extra constructs produces richer examples.
|
|
||||||
|
|
||||||
### Candidate F hyperparameter grid
|
|
||||||
|
|
||||||
| Config | Prior sources | Phase 3 threshold | ε (tail minimum) | Mapping strictness |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| F1 | All 4 sources (50/30/10/10) | q > 0.7 | 0.05 | Lenient (keyword match) |
|
|
||||||
| F2 | All 4 sources (50/30/10/10) | q > 0.7 | 0.05 | Strict (AST-level match) |
|
|
||||||
| F3 | Stack only (100%) | q > 0.7 | 0.05 | Lenient |
|
|
||||||
| F4 | All 4 sources (50/30/10/10) | q > 0.5 | 0.10 | Lenient |
|
|
||||||
|
|
||||||
F1 is the baseline. F2 tests whether strict construct mapping (requiring AST-level evidence vs keyword presence) improves prior quality. F3 is the ablation: does the multi-source mixture add value over The Stack alone? F4 tests earlier phase transition and higher minimum tail weight.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Open Questions for the Scientific Team
|
|
||||||
|
|
||||||
1. **Cell selection with difficulty weighting:** Some cells may be intrinsically hard to fill (e.g., combining `go` + `avapConnector` + `ormAccessSelect` in a single coherent example). Should the cell selection strategy account for historical fill difficulty, or treat all cells equally?
|
|
||||||
|
|
||||||
2. **Cross-cell quality:** An example generated for cell {A, B} may also be a high-quality example for cell {A, C} if it happens to use C as well. Should examples be indexed against all cells they satisfy, or only the cell they were generated for?
|
|
||||||
|
|
||||||
3. **Minimum example length per cell:** Short examples (3–5 lines) can technically satisfy a cell specification with minimal semantic content. Should a minimum code complexity threshold (e.g., minimum AST depth, minimum number of statements) be required for cell admission?
|
|
||||||
|
|
||||||
4. **Cell retirement:** Once a cell reaches quality score > 0.90, should it be retired from the selection pool to focus generation effort on harder cells?
|
|
||||||
|
|
||||||
5. **Generalisation to KCL:** The KCL grammar has different node types. Does the MAP-Elites cell space need to be redefined per language, or can a universal cell structure be derived from shared construct categories (type_definition, validation, control_flow, io)?
|
|
||||||
|
|
||||||
6. **ConstructPrior mapping quality:** The construct mapping (e.g., Python `session.query()` → AVAP `ormAccessSelect`) is heuristic. Should mapping quality be validated against a small manually annotated equivalence set before running the full generation pipeline? If the mapping is noisy, the prior weights may be misleading — a high-frequency Python pattern that maps incorrectly to a rare AVAP pattern would over-weight a non-representative cell.
|
|
||||||
|
|
||||||
7. **Prior refresh cadence:** The Stack and CodeSearchNet are static snapshots. If AVAP adoption grows and native AVAP code becomes available, should the ConstructPrior be retrained on AVAP-native data, effectively transitioning from transfer learning to self-supervised learning? Define the minimum corpus size threshold at which native data supersedes the cross-language prior.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Consequences
|
|
||||||
|
|
||||||
- `generate_mbap_v2.py` is rewritten to implement Candidate F (MAP-Elites + ConstructPrior) as the primary algorithm. Candidate E (MAP-Elites without prior) is available via `--mode map-elites`. Candidate A (CW-Reward) is available via `--mode reward`. All three modes use identical parser, stage filters, and cell definitions to ensure fair comparison.
|
|
||||||
- A `ConstructPrior` module (`construct_prior.py`) handles multi-source data download, construct extraction, language-to-AVAP mapping, and co-occurrence matrix construction. This module is isolated from the core MAP-Elites loop and can be updated independently.
|
|
||||||
- The construct mapping table (language construct → AVAP equivalent) is maintained as a versioned configuration file (`construct_map.yaml`) and must not be modified after generation begins for a given dataset version.
|
|
||||||
- Results must be documented in `research/reward/` before this ADR is closed. Required artefacts: entropy histograms for A/E/F, KL divergence plots, RAGAS Composite comparison table, cell fill rate heatmaps.
|
|
||||||
- Any change to cell definitions, quality metrics, or the construct mapping table requires full dataset regeneration.
|
|
||||||
- Per-iteration fine-tuning of the generator is rejected and will not be re-evaluated without new evidence addressing the distribution narrowing risk.
|
|
||||||
|
|
@ -1,242 +0,0 @@
|
||||||
# ADR-0007: Mandatory Syntactic Validation Layer (MSVL) for RAG Evaluation
|
|
||||||
|
|
||||||
**Date:** 2026-04-06
|
|
||||||
**Status:** Proposed
|
|
||||||
**Deciders:** Rafael Ruiz (CTO), Pablo (AI Team)
|
|
||||||
**Related ADRs:** ADR-0003 (Hybrid Retrieval RRF), ADR-0004 (Claude as RAGAS Evaluation Judge), ADR-0005 (Embedding Model Selection), ADR-0006 (Reward Algorithm for Dataset Synthesis)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Context
|
|
||||||
|
|
||||||
### The evaluation campaign that triggered this ADR
|
|
||||||
|
|
||||||
On 2026-04-06, the AI Team ran six evaluation suites using `EvaluateRAG` on the 50-question golden dataset, covering three embedding models across two index configurations each. All six runs returned a verdict of **ACCEPTABLE** from the RAGAS pipeline. The scores are reproduced below:
|
|
||||||
|
|
||||||
| Embedding Model | Index | Faithfulness | Answer Relevancy | Context Recall | Context Precision | Global Score |
|
|
||||||
|---|---|---|---|---|---|---|
|
|
||||||
| qwen3-0.6B-emb | avap-knowledge-v2-qwen | 0.5329 | 0.8393 | 0.5449 | 0.5843 | **0.6254** |
|
|
||||||
| qwen3-0.6B-emb | avap-docs-test-v4-qwen | 0.5781 | 0.8472 | 0.6451 | 0.6633 | **0.6834** |
|
|
||||||
| bge-m3 | avap-knowledge-v2-bge | 0.5431 | 0.8507 | 0.5057 | 0.5689 | **0.6171** |
|
|
||||||
| bge-m3 | avap-docs-test-v4-bge | 0.5843 | 0.8400 | 0.6067 | 0.6384 | **0.6681** |
|
|
||||||
| harrier-oss-v1:0.6b | avap-knowledge-v2-harrier | 0.5328 | 0.8424 | 0.4898 | 0.5634 | **0.6071** |
|
|
||||||
| harrier-oss-v1:0.6b | avap-docs-test-v4-harrier | 0.6829 | 0.8457 | 0.6461 | 0.6688 | **0.7109** |
|
|
||||||
|
|
||||||
*Judge model for all runs: `claude-sonnet-4-20250514`. All runs: 50 questions, Docker container on shared EC2.*
|
|
||||||
|
|
||||||
### Why these scores are not valid for architectural decisions
|
|
||||||
|
|
||||||
Manual inspection of the `answer_preview` fields reveals a systematic pattern that invalidates all six verdicts: **models are generating syntactically invalid AVAP code while receiving ACCEPTABLE scores from the LLM judge.**
|
|
||||||
|
|
||||||
The root cause is architectural. The RAGAS judge (Claude Sonnet) evaluates *semantic coherence* — whether the answer is logically consistent with the retrieved context. It does not evaluate *syntactic validity* — whether the generated code would execute on the PLATON kernel. For a proprietary DSL like AVAP, these two properties are independent. A response can score high on faithfulness while containing complete Go syntax.
|
|
||||||
|
|
||||||
**Forensic analysis of the six evaluation traces** identifies three distinct failure modes.
|
|
||||||
|
|
||||||
#### Failure Mode 1 — Foreign language injection
|
|
||||||
|
|
||||||
Models produce complete syntax from Go, Python, or JavaScript inside code blocks labelled `avap`. These responses are not AVAP and would fail at parse time.
|
|
||||||
|
|
||||||
| Entry | Model / Index | Language injected | Evidence |
|
|
||||||
|---|---|---|---|
|
|
||||||
| GD-V-009 | harrier / avap-knowledge-v2 | **Go** | `package main`, `import "fmt"`, `func main()` inside an `avap` block |
|
|
||||||
| GD-V-009 | qwen3 / avap-knowledge-v2 | **Go** | `package main`, `import (..."fmt"...)` |
|
|
||||||
| GD-C-003 | harrier / avap-knowledge-v2 | **Python** | `for i in range(1, 6):` with Python dict literal |
|
|
||||||
| GD-C-003 | bge-m3 / avap-knowledge-v2 | **Python** | `for i in range(1, 6):` with `# Build the JSON object` comment |
|
|
||||||
| GD-C-004 | bge-m3 / avap-knowledge-v2 | **JavaScript** | `let allowedRoles = ["admin", ...]`, `.includes(rol)` |
|
|
||||||
| GD-V-007 | qwen3 / avap-docs-test-v4 | **JS / PHP / Python** | `foreach(item in items)`, Python `print()` |
|
|
||||||
|
|
||||||
GD-V-009 is the most critical case. The question asks about AVAP goroutine scope. The model answers with a complete Go program. Claude-Sonnet scored this ACCEPTABLE because the prose surrounding the code is semantically consistent with the retrieved context — the code block itself is never validated.
|
|
||||||
|
|
||||||
#### Failure Mode 2 — Hallucinated AVAP commands
|
|
||||||
|
|
||||||
Models invent command names that do not exist in the AVAP grammar. These are not foreign languages — they appear syntactically plausible — but would fail at the parser's symbol resolution stage.
|
|
||||||
|
|
||||||
| Invented command | Observed in | Real AVAP equivalent |
|
|
||||||
|---|---|---|
|
|
||||||
| `getSHA256(x)` | qwen3 | `encodeSHA256(origen, destino)` |
|
|
||||||
| `generateSHA256Hash(x)` | bge-m3, harrier | `encodeSHA256(origen, destino)` |
|
|
||||||
| `readParam("x")` | qwen3, bge-m3 | `addParam("x", destino)` |
|
|
||||||
| `ifParam("x", dest)` | qwen3 | `addParam("x", dest)` + `if(...)` |
|
|
||||||
| `returnResult(x)` | bge-m3 | `addResult(x)` |
|
|
||||||
| `getTimeStamp(...)` | qwen3 | `getDateTime(...)` |
|
|
||||||
| `except(e)` | qwen3 | `exception(e)` |
|
|
||||||
| `getListParamList(...)` | harrier | Does not exist |
|
|
||||||
| `variableFromJSON(...)` | harrier | Does not exist |
|
|
||||||
| `confirmPassword(...)` | bge-m3 | Does not exist |
|
|
||||||
| `httpGet(...)` | bge-m3 | `RequestGet(...)` |
|
|
||||||
|
|
||||||
#### Failure Mode 3 — Structural foreign syntax
|
|
||||||
|
|
||||||
Beyond identifiable code blocks, some responses embed structural constructs that are not part of the AVAP grammar: curly-brace function bodies, `while` loops, `let`/`var` declarations, `for`/`foreach` statements. These appear in entries where no foreign language is explicitly named.
|
|
||||||
|
|
||||||
#### Summary by model and index
|
|
||||||
|
|
||||||
| Model | Index | Foreign syntax (entries) | Hallucinated cmds (entries) | Estimated invalid / 50 |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| qwen3-0.6B-emb | avap-knowledge-v2 | 3 | 2 | ~5 (10%) |
|
|
||||||
| qwen3-0.6B-emb | avap-docs-test-v4 | 3 | 3 | ~6 (12%) |
|
|
||||||
| bge-m3 | avap-knowledge-v2 | 6 | 3 | ~8 (16%) |
|
|
||||||
| bge-m3 | avap-docs-test-v4 | 5 | 1 | ~6 (12%) |
|
|
||||||
| harrier-oss-v1:0.6b | avap-knowledge-v2 | 2 | 3 | ~5 (10%) |
|
|
||||||
| harrier-oss-v1:0.6b | avap-docs-test-v4 | 1 | 0 | ~1 (2%) |
|
|
||||||
|
|
||||||
*Counts are conservative lower bounds: `answer_preview` fields are truncated at ~300 characters. Full response bodies may contain additional violations not visible in the preview.*
|
|
||||||
|
|
||||||
### Relative ordering within this campaign
|
|
||||||
|
|
||||||
The data supports a *relative* — not absolute — ordering. **harrier / avap-docs-test-v4** shows the fewest syntactic violations and the highest global score (0.7109). It is the least-bad model in this run. However, this does not make it production-ready: a model that generates correct AVAP in 98% of responses can still fail for a user on a critical code generation query.
|
|
||||||
|
|
||||||
**bge-m3** failures are predominantly well-known foreign syntaxes (Python, JavaScript), which makes them identifiable without a parser. **qwen3** introduces invented commands that look like valid AVAP idioms (`ifParam`, `getSHA256`, `getTimeStamp`) — these are harder to detect precisely because they are superficially plausible.
|
|
||||||
|
|
||||||
The CTO's conclusion: no model can be selected or rejected based on these six runs. The measurement instrument is not fit for purpose.
|
|
||||||
|
|
||||||
### Evaluation environment issues identified in this campaign
|
|
||||||
|
|
||||||
Three additional issues compromise reproducibility independently of model quality:
|
|
||||||
|
|
||||||
**Mixed execution environments.** Parts of the team ran `run_evaluation` from local notebooks. Notebook runs do not record temperature or random seeds, making score reproduction impossible across machines and Python environments.
|
|
||||||
|
|
||||||
**Undocumented index re-creation.** Bugs were discovered in the existing indices and they were re-indexed with corrected pipelines (`avap_ingestor.py` for `avap-knowledge-v2-*`, `elasticsearch_ingestion.py` for `avap-docs-test-v4-*`). The pre-processing delta between old and new indices was not documented before the evaluation was run, making it impossible to determine whether score differences reflect model quality or index quality.
|
|
||||||
|
|
||||||
**BM25 contamination in embedding comparisons.** The pipeline uses Hybrid Retrieval (BM25 + kNN, per ADR-0003). When the goal is to compare embedding models, BM25 acts as a confounding variable: a weaker embedding model can compensate with BM25 recall, masking the true quality differential. Evaluations intended to select an embedding model require a kNN-only retrieval mode that does not exist yet.
|
|
||||||
|
|
||||||
### The few-shot gap
|
|
||||||
|
|
||||||
The 190 validated AVAP examples from ADR-0006 are not currently injected into the generation prompt. The syntactic failure rates above — roughly 2 to 16% of responses per run — are consistent with a model that has no valid AVAP examples in its prompt context and falls back on pre-training distributions. This is the expected behaviour of a base LLM encountering an unfamiliar DSL without few-shot grounding.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Decision
|
|
||||||
|
|
||||||
Establish the **Mandatory Syntactic Validation Layer (MSVL)** as a non-optional prerequisite gate in the `EvaluateRAG` pipeline. Any evaluation score produced without MSVL is classified as non-binding and cannot be cited in architectural decisions.
|
|
||||||
|
|
||||||
### 1. Parser integration in `EvaluateRAG`
|
|
||||||
|
|
||||||
Every code block in a generated response must be submitted to the AVAP Parser via gRPC before RAGAS scoring. The parser returns a binary result: `VALID` or `INVALID` with a failure category (`unknown_token`, `unexpected_construct`, `foreign_keyword`, `syntax_error`).
|
|
||||||
|
|
||||||
### 2. `syntactic_validity` as an independent metric
|
|
||||||
|
|
||||||
Introduce `syntactic_validity` (float 0.0–1.0): the fraction of code-bearing responses that pass parser validation within a run. This metric is reported alongside RAGAS scores, not as a replacement.
|
|
||||||
|
|
||||||
For entries that fail parser validation, `faithfulness` and `answer_relevancy` are **overridden to 0.0** regardless of the LLM judge's qualitative assessment. The raw RAGAS scores are preserved in the evaluation record for audit.
|
|
||||||
|
|
||||||
```
|
|
||||||
final_faithfulness(entry) =
|
|
||||||
0.0 if parser(entry) = INVALID
|
|
||||||
ragas_faithfulness(entry) otherwise
|
|
||||||
|
|
||||||
final_answer_relevancy(entry) =
|
|
||||||
0.0 if parser(entry) = INVALID
|
|
||||||
ragas_answer_relevancy(entry) otherwise
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Parser SLA and fallback policy
|
|
||||||
|
|
||||||
The AVAP Parser gRPC service must respond within 2 seconds per call. If the parser is unreachable or times out, the evaluation run is **aborted** with an explicit error. Silent fallback to RAGAS-only scoring is prohibited.
|
|
||||||
|
|
||||||
```python
|
|
||||||
if parser_status == UNAVAILABLE:
|
|
||||||
raise EvaluationAbortedError(
|
|
||||||
"AVAP Parser unreachable. MSVL cannot be bypassed."
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. Standardised evaluation protocol
|
|
||||||
|
|
||||||
Local notebook environments are **prohibited** for official evaluation reports. All evaluations cited in architectural decisions must be executed within the `EvaluateRAG` Docker container in Staging with:
|
|
||||||
|
|
||||||
- Fixed random seeds via `EVAL_SEED` environment variable
|
|
||||||
- `temperature=0` for all generation calls
|
|
||||||
- `ANTHROPIC_MODEL` pinned to a specific version string, not `latest`
|
|
||||||
- Index version and the exact ingestion pipeline used documented in the evaluation record *before* the run starts
|
|
||||||
|
|
||||||
### 5. Few-shot context injection
|
|
||||||
|
|
||||||
The 190 validated AVAP examples from ADR-0006 must be injected as few-shot context into the generation prompt. Injection protocol:
|
|
||||||
|
|
||||||
- Examples are selected by **semantic similarity** to the current query (top-K retrieval from the validated pool), not injected wholesale
|
|
||||||
- K defaults to 5; effective K per run is logged in the evaluation record
|
|
||||||
- If the few-shot retrieval service is unavailable, the run proceeds without injection and this is flagged as `few_shot_injection: degraded` in the report
|
|
||||||
|
|
||||||
This directly targets Failure Mode 1: a model that has seen 5 valid AVAP examples before generating code is substantially less likely to default to Go or Python syntax.
|
|
||||||
|
|
||||||
### 6. Embedding-only evaluation mode
|
|
||||||
|
|
||||||
A separate `knn_only` retrieval mode must be implemented in `EvaluateRAG` for evaluations whose explicit purpose is embedding model comparison. This mode disables BM25 and uses only kNN retrieval. Results from this mode are tagged `retrieval_mode: knn_only` and are not comparable with standard hybrid retrieval scores. This mode must be used for any future embedding model selection decision.
|
|
||||||
|
|
||||||
### 7. Statistical measurement requirements
|
|
||||||
|
|
||||||
| Requirement | Specification | Rationale |
|
|
||||||
|---|---|---|
|
|
||||||
| **Bootstrap stability** | N ≥ 5 runs per suite | N=3 provides only 2 degrees of freedom for variance estimation; N=5 is the minimum to detect bimodal operating modes |
|
|
||||||
| **Reported statistics** | Mean (μ) and standard deviation (σ) | Single-run scores cannot be used for decision-making |
|
|
||||||
| **Leakage audit** | Token distribution analysis per model | Quantifies how much syntactic correctness derives from pre-training bias vs. AVAP documentation retrieval |
|
|
||||||
| **Syntactic confusion matrix** | Parse failures broken down by category and question ID | Identifies which AVAP constructs (`startLoop`, `ormAccess`, `encodeSHA256`, etc.) require additional documentation or few-shot coverage |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Rationale
|
|
||||||
|
|
||||||
### Why 0.0 override rather than a graduated penalty?
|
|
||||||
|
|
||||||
For AVAP, syntactic validity is a binary production gate: code either executes on the PLATON kernel or it does not. A graduated penalty would imply partial credit for non-executable output, which has no operational meaning. The override to 0.0 aligns the metric with the actual production outcome. Raw RAGAS scores are preserved for post-hoc analysis if the policy needs to be revised.
|
|
||||||
|
|
||||||
### Why abort on parser unavailability rather than degrade?
|
|
||||||
|
|
||||||
Silent fallback to RAGAS-only scoring produces evaluation reports that are visually identical to MSVL-validated reports. The purpose of the layer is to prevent false positives. An infrastructure failure that silently removes the gate defeats that purpose entirely. Failing loudly is the only policy consistent with the layer's goal.
|
|
||||||
|
|
||||||
### Why few-shot injection by similarity rather than full pool injection?
|
|
||||||
|
|
||||||
Injecting all 190 examples wholesale would consume the majority of the generation context window, compressing the retrieved documentation that RAGAS evaluates. Similarity-based top-K selection preserves the most relevant examples while protecting retrieval context fidelity. Coverage of rare construct combinations depends on query distribution — this is measurable via the confusion matrix.
|
|
||||||
|
|
||||||
### Why N ≥ 5 runs?
|
|
||||||
|
|
||||||
`temperature=0` reduces run-to-run variance but does not eliminate it. Retrieval non-determinism from kNN approximate search and prompt token ordering effects can produce different results at zero temperature. N=3 provides only 2 degrees of freedom for variance estimation — too few for a stable spread estimate. N=5 is the minimum that allows detection of a bimodal distribution (two distinct operating modes) with elementary statistical reliability.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Status of prior evaluations
|
|
||||||
|
|
||||||
The six evaluation runs from 2026-04-06 documented in this ADR's Context section are **classified as non-binding**. They may be cited as qualitative evidence of relative model behaviour but cannot be used to select an embedding model for production.
|
|
||||||
|
|
||||||
Any evaluation report generated before this ADR's acceptance date that does not include a `syntactic_validity` score is retroactively classified as non-binding.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Alternatives Considered
|
|
||||||
|
|
||||||
| Alternative | Rejected because |
|
|
||||||
|---|---|
|
|
||||||
| **Post-hoc validation** (flag but do not override scores) | Does not prevent false positives from propagating into decision metrics |
|
|
||||||
| **Raise RAGAS threshold to ≥ 0.80** | A model could pass at 0.80 with 10% Go injection; does not address the structural misalignment between semantic scoring and syntactic validity |
|
|
||||||
| **Manual code review per evaluation run** | Not reproducible or scalable; reintroduces evaluator subjectivity |
|
|
||||||
| **Fine-tuning with AVAP-only data** | Addresses the generation problem but not the measurement problem; MSVL is needed regardless |
|
|
||||||
| **Disable BM25 for all evaluations** | Removes a production component defined in ADR-0003; the correct solution is an explicit `knn_only` mode for embedding comparisons, not removing hybrid retrieval globally |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Consequences
|
|
||||||
|
|
||||||
**Positive:**
|
|
||||||
- Eliminates the false positive class definitively demonstrated in this ADR's Context: semantically coherent but syntactically invalid responses will no longer receive ACCEPTABLE verdicts.
|
|
||||||
- `syntactic_validity` becomes a first-class longitudinal metric enabling tracking of DSL fidelity independently of semantic quality.
|
|
||||||
- Standardised Docker execution with documented seeds ensures scores are reproducible and comparable across team members and time.
|
|
||||||
- The syntactic confusion matrix creates a direct feedback loop into documentation priorities and few-shot pool expansion.
|
|
||||||
|
|
||||||
**Negative:**
|
|
||||||
- Evaluation latency increases by one gRPC call per generated response. At the 2-second SLA for a 50-question dataset, this adds approximately 100 seconds per run.
|
|
||||||
- The AVAP Parser becomes a hard dependency of the evaluation pipeline and must be versioned and kept in sync with the LRM. Parser upgrades may alter score comparability across historical runs.
|
|
||||||
- N ≥ 5 runs multiplies evaluation cost (API calls, compute time) approximately 5×.
|
|
||||||
- The `knn_only` retrieval mode and the few-shot retrieval service are engineering work not currently scheduled.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Open Questions
|
|
||||||
|
|
||||||
1. **Acceptance threshold for `syntactic_validity`:** This ADR defines how to measure syntactic validity but does not specify the minimum score required for production readiness. A subsequent amendment must define this threshold (e.g., `syntactic_validity ≥ 0.95` for `CODE_GENERATION` questions) before MSVL scores can be used as a hard CI/CD gate.
|
|
||||||
|
|
||||||
2. **Parser version pinning policy:** When a parser upgrade changes accepted constructs, historical scores become incomparable. A policy for when upgrades require re-running historical evaluations has not been defined.
|
|
||||||
|
|
||||||
3. **Few-shot pool adequacy for the confusion matrix tail:** Whether 190 examples provide adequate coverage of rare construct combinations visible in the confusion matrix has not been empirically tested.
|
|
||||||
|
|
||||||
4. **BM25 contamination remediation for existing results:** The `knn_only` evaluation mode should be scheduled before the next embedding model comparison campaign to produce a clean comparative baseline.
|
|
||||||
|
|
@ -45,7 +45,16 @@ Both `AskAgent` and `AskAgentStream` return a **server-side stream** of `AgentRe
|
||||||
|
|
||||||
**Use case:** Clients that do not support streaming or need a single atomic response.
|
**Use case:** Clients that do not support streaming or need a single atomic response.
|
||||||
|
|
||||||
**Request:** See [`AgentRequest`](#agentrequest) in §3.
|
**Request:**
|
||||||
|
|
||||||
|
```protobuf
|
||||||
|
message AgentRequest {
|
||||||
|
string query = 1; // The user's question. Required. Max recommended: 4096 chars.
|
||||||
|
string session_id = 2; // Conversation session identifier. Optional.
|
||||||
|
// If empty, defaults to "default" (shared session).
|
||||||
|
// Use a UUID per user/conversation for isolation.
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
**Response stream:**
|
**Response stream:**
|
||||||
|
|
||||||
|
|
@ -61,7 +70,7 @@ Both `AskAgent` and `AskAgentStream` return a **server-side stream** of `AgentRe
|
||||||
|
|
||||||
**Behaviour:** Runs `prepare_graph` (classify → reformulate → retrieve), then calls `llm.stream()` directly. Emits one `AgentResponse` per token from Ollama, followed by a terminal message.
|
**Behaviour:** Runs `prepare_graph` (classify → reformulate → retrieve), then calls `llm.stream()` directly. Emits one `AgentResponse` per token from Ollama, followed by a terminal message.
|
||||||
|
|
||||||
**Use case:** Interactive clients (chat UIs, VS Code extension) that need progressive rendering.
|
**Use case:** Interactive clients (chat UIs, terminal tools) that need progressive rendering.
|
||||||
|
|
||||||
**Request:** Same `AgentRequest` as `AskAgent`.
|
**Request:** Same `AgentRequest` as `AskAgent`.
|
||||||
|
|
||||||
|
|
@ -143,40 +152,10 @@ message QuestionDetail {
|
||||||
|
|
||||||
### `AgentRequest`
|
### `AgentRequest`
|
||||||
|
|
||||||
```protobuf
|
| Field | Type | Required | Description |
|
||||||
message AgentRequest {
|
|---|---|---|---|
|
||||||
string query = 1;
|
| `query` | `string` | Yes | User's natural language question |
|
||||||
string session_id = 2;
|
| `session_id` | `string` | No | Conversation identifier for multi-turn context. Use a stable UUID per user session. |
|
||||||
string editor_content = 3;
|
|
||||||
string selected_text = 4;
|
|
||||||
string extra_context = 5;
|
|
||||||
string user_info = 6;
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
| Field | Type | Required | Encoding | Description |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| `query` | `string` | Yes | Plain text | User's natural language question. Max recommended: 4096 chars. |
|
|
||||||
| `session_id` | `string` | No | Plain text | Conversation identifier for multi-turn context. Use a stable UUID per user session. Defaults to `"default"` if empty. |
|
|
||||||
| `editor_content` | `string` | No | Base64 | Full content of the active file open in the editor at query time. Decoded server-side before entering the graph. |
|
|
||||||
| `selected_text` | `string` | No | Base64 | Text currently selected in the editor. Primary anchor for query reformulation and generation when the classifier detects an explicit editor reference. |
|
|
||||||
| `extra_context` | `string` | No | Base64 | Free-form additional context (e.g. file path, language identifier, open diagnostic errors). |
|
|
||||||
| `user_info` | `string` | No | JSON string | Client identity metadata. Expected format: `{"dev_id": <int>, "project_id": <int>, "org_id": <int>}`. Available in graph state for future routing or personalisation — not yet consumed by the graph. |
|
|
||||||
|
|
||||||
**Editor context behaviour:**
|
|
||||||
|
|
||||||
Fields 3–6 are all optional. If none are provided the assistant behaves exactly as it does without them — full backward compatibility. When `editor_content` or `selected_text` are provided, the graph classifier determines whether the user's question explicitly refers to that code. Only if the classifier returns `EDITOR` are the context fields injected into the generation prompt. This prevents the model from referencing editor code when the question is unrelated to it.
|
|
||||||
|
|
||||||
**Base64 encoding:**
|
|
||||||
|
|
||||||
`editor_content`, `selected_text` and `extra_context` must be Base64-encoded before sending. The server decodes them with UTF-8. Malformed Base64 is silently treated as an empty string — no error is raised.
|
|
||||||
|
|
||||||
```python
|
|
||||||
import base64
|
|
||||||
encoded = base64.b64encode(content.encode("utf-8")).decode("utf-8")
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### `AgentResponse`
|
### `AgentResponse`
|
||||||
|
|
||||||
|
|
@ -186,8 +165,6 @@ encoded = base64.b64encode(content.encode("utf-8")).decode("utf-8")
|
||||||
| `avap_code` | `string` | Currently always `"AVAP-2026"` in non-streaming mode, empty in streaming |
|
| `avap_code` | `string` | Currently always `"AVAP-2026"` in non-streaming mode, empty in streaming |
|
||||||
| `is_final` | `bool` | `true` only on the last message of the stream |
|
| `is_final` | `bool` | `true` only on the last message of the stream |
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### `EvalRequest`
|
### `EvalRequest`
|
||||||
|
|
||||||
| Field | Type | Required | Default | Description |
|
| Field | Type | Required | Default | Description |
|
||||||
|
|
@ -196,8 +173,6 @@ encoded = base64.b64encode(content.encode("utf-8")).decode("utf-8")
|
||||||
| `limit` | `int32` | No | `0` (all) | Max questions to evaluate |
|
| `limit` | `int32` | No | `0` (all) | Max questions to evaluate |
|
||||||
| `index` | `string` | No | `$ELASTICSEARCH_INDEX` | ES index to evaluate |
|
| `index` | `string` | No | `$ELASTICSEARCH_INDEX` | ES index to evaluate |
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### `EvalResponse`
|
### `EvalResponse`
|
||||||
|
|
||||||
See full definition in [§2.3](#23-evaluaterag).
|
See full definition in [§2.3](#23-evaluaterag).
|
||||||
|
|
@ -217,7 +192,7 @@ The engine catches all exceptions and returns them as terminal `AgentResponse` m
|
||||||
{"text": "[ENG] Error: Connection refused connecting to Ollama", "is_final": true}
|
{"text": "[ENG] Error: Connection refused connecting to Ollama", "is_final": true}
|
||||||
```
|
```
|
||||||
|
|
||||||
**`EvaluateRAG` error response:**
|
**`EvaluateRAG` error response:**
|
||||||
Returned as a single `EvalResponse` with `status` set to the error description:
|
Returned as a single `EvalResponse` with `status` set to the error description:
|
||||||
```json
|
```json
|
||||||
{"status": "ANTHROPIC_API_KEY no configurada en .env", ...}
|
{"status": "ANTHROPIC_API_KEY no configurada en .env", ...}
|
||||||
|
|
@ -236,11 +211,11 @@ grpcurl -plaintext localhost:50052 list
|
||||||
grpcurl -plaintext localhost:50052 describe brunix.AssistanceEngine
|
grpcurl -plaintext localhost:50052 describe brunix.AssistanceEngine
|
||||||
```
|
```
|
||||||
|
|
||||||
### `AskAgent` — basic query
|
### `AskAgent` — full response
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
grpcurl -plaintext \
|
grpcurl -plaintext \
|
||||||
-d '{"query": "Que significa AVAP?", "session_id": "dev-001"}' \
|
-d '{"query": "What is addVar in AVAP?", "session_id": "dev-001"}' \
|
||||||
localhost:50052 \
|
localhost:50052 \
|
||||||
brunix.AssistanceEngine/AskAgent
|
brunix.AssistanceEngine/AskAgent
|
||||||
```
|
```
|
||||||
|
|
@ -248,47 +223,12 @@ grpcurl -plaintext \
|
||||||
Expected response:
|
Expected response:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"text": "AVAP (Advanced Virtual API Programming) es un DSL Turing Completo...",
|
"text": "addVar is an AVAP command that declares a new variable...",
|
||||||
"avap_code": "AVAP-2026",
|
"avap_code": "AVAP-2026",
|
||||||
"is_final": true
|
"is_final": true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### `AskAgent` — with editor context
|
|
||||||
|
|
||||||
```python
|
|
||||||
import base64, json, grpc
|
|
||||||
import brunix_pb2, brunix_pb2_grpc
|
|
||||||
|
|
||||||
def encode(text: str) -> str:
|
|
||||||
return base64.b64encode(text.encode("utf-8")).decode("utf-8")
|
|
||||||
|
|
||||||
channel = grpc.insecure_channel("localhost:50052")
|
|
||||||
stub = brunix_pb2_grpc.AssistanceEngineStub(channel)
|
|
||||||
|
|
||||||
editor_code = """
|
|
||||||
try()
|
|
||||||
ormDirect("UPDATE users SET active=1", res)
|
|
||||||
exception(e)
|
|
||||||
addVar(_status, 500)
|
|
||||||
addResult("Error")
|
|
||||||
end()
|
|
||||||
"""
|
|
||||||
|
|
||||||
request = brunix_pb2.AgentRequest(
|
|
||||||
query = "why is this not catching the error?",
|
|
||||||
session_id = "dev-001",
|
|
||||||
editor_content = encode(editor_code),
|
|
||||||
selected_text = encode(editor_code), # same block selected
|
|
||||||
extra_context = encode("file: handler.avap"),
|
|
||||||
user_info = json.dumps({"dev_id": 1, "project_id": 2, "org_id": 3}),
|
|
||||||
)
|
|
||||||
|
|
||||||
for response in stub.AskAgent(request):
|
|
||||||
if response.is_final:
|
|
||||||
print(response.text)
|
|
||||||
```
|
|
||||||
|
|
||||||
### `AskAgentStream` — token streaming
|
### `AskAgentStream` — token streaming
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -310,6 +250,7 @@ Expected response (truncated):
|
||||||
### `EvaluateRAG` — run evaluation
|
### `EvaluateRAG` — run evaluation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# Evaluate first 10 questions from the "core_syntax" category
|
||||||
grpcurl -plaintext \
|
grpcurl -plaintext \
|
||||||
-d '{"category": "core_syntax", "limit": 10}' \
|
-d '{"category": "core_syntax", "limit": 10}' \
|
||||||
localhost:50052 \
|
localhost:50052 \
|
||||||
|
|
@ -323,7 +264,7 @@ Expected response:
|
||||||
"questions_evaluated": 10,
|
"questions_evaluated": 10,
|
||||||
"elapsed_seconds": 142.3,
|
"elapsed_seconds": 142.3,
|
||||||
"judge_model": "claude-sonnet-4-20250514",
|
"judge_model": "claude-sonnet-4-20250514",
|
||||||
"index": "avap-knowledge-v1",
|
"index": "avap-docs-test",
|
||||||
"faithfulness": 0.8421,
|
"faithfulness": 0.8421,
|
||||||
"answer_relevancy": 0.7913,
|
"answer_relevancy": 0.7913,
|
||||||
"context_recall": 0.7234,
|
"context_recall": 0.7234,
|
||||||
|
|
@ -334,7 +275,7 @@ Expected response:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Multi-turn conversation
|
### Multi-turn conversation example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Turn 1
|
# Turn 1
|
||||||
|
|
@ -342,7 +283,7 @@ grpcurl -plaintext \
|
||||||
-d '{"query": "What is registerEndpoint?", "session_id": "user-abc"}' \
|
-d '{"query": "What is registerEndpoint?", "session_id": "user-abc"}' \
|
||||||
localhost:50052 brunix.AssistanceEngine/AskAgentStream
|
localhost:50052 brunix.AssistanceEngine/AskAgentStream
|
||||||
|
|
||||||
# Turn 2 — engine has history from Turn 1
|
# Turn 2 — the engine has history from Turn 1
|
||||||
grpcurl -plaintext \
|
grpcurl -plaintext \
|
||||||
-d '{"query": "Can you show me an example?", "session_id": "user-abc"}' \
|
-d '{"query": "Can you show me an example?", "session_id": "user-abc"}' \
|
||||||
localhost:50052 brunix.AssistanceEngine/AskAgentStream
|
localhost:50052 brunix.AssistanceEngine/AskAgentStream
|
||||||
|
|
@ -362,90 +303,37 @@ python -m grpc_tools.protoc \
|
||||||
|
|
||||||
## 6. OpenAI-Compatible Proxy
|
## 6. OpenAI-Compatible Proxy
|
||||||
|
|
||||||
The container also exposes an HTTP server on port `8000` (`openai_proxy.py`) that wraps the gRPC interface under an OpenAI-compatible API. This allows integration with any tool that supports the OpenAI Chat Completions API — `continue.dev`, LiteLLM, Open WebUI, or any custom client.
|
The container also exposes an HTTP server on port `8000` (`openai_proxy.py`) that wraps `AskAgentStream` under an OpenAI-compatible endpoint. This allows integration with any tool that supports the OpenAI Chat Completions API.
|
||||||
|
|
||||||
**Base URL:** `http://localhost:8000`
|
**Base URL:** `http://localhost:8000`
|
||||||
|
|
||||||
### Available endpoints
|
|
||||||
|
|
||||||
| Method | Endpoint | Description |
|
|
||||||
|---|---|---|
|
|
||||||
| `POST` | `/v1/chat/completions` | OpenAI Chat Completions. Routes to `AskAgent` or `AskAgentStream`. |
|
|
||||||
| `POST` | `/v1/completions` | OpenAI Completions (legacy). |
|
|
||||||
| `GET` | `/v1/models` | Lists available models. Returns `brunix`. |
|
|
||||||
| `POST` | `/api/chat` | Ollama chat format (NDJSON streaming). |
|
|
||||||
| `POST` | `/api/generate` | Ollama generate format (NDJSON streaming). |
|
|
||||||
| `GET` | `/api/tags` | Ollama model list. |
|
|
||||||
| `GET` | `/health` | Health check. Returns `{"status": "ok"}`. |
|
|
||||||
|
|
||||||
### `POST /v1/chat/completions`
|
### `POST /v1/chat/completions`
|
||||||
|
|
||||||
**Routing:** `stream: false` → `AskAgent` (single response). `stream: true` → `AskAgentStream` (SSE token stream).
|
|
||||||
|
|
||||||
**Request body:**
|
**Request body:**
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "brunix",
|
"model": "brunix",
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "user", "content": "Que significa AVAP?"}
|
{"role": "user", "content": "What is addVar in AVAP?"}
|
||||||
],
|
],
|
||||||
"stream": false,
|
"stream": true
|
||||||
"session_id": "uuid-per-conversation",
|
|
||||||
"user": "{\"editor_content\":\"\",\"selected_text\":\"\",\"extra_context\":\"\",\"user_info\":{\"dev_id\":1,\"project_id\":2,\"org_id\":3}}"
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**The `user` field (editor context transport):**
|
**Notes:**
|
||||||
|
- The `model` field is ignored; the engine always uses the configured `OLLAMA_MODEL_NAME`.
|
||||||
|
- Session management is handled internally by the proxy. Conversation continuity across separate HTTP requests is not guaranteed.
|
||||||
|
- Only `stream: true` is fully supported. Non-streaming mode may be available but is not the primary use case.
|
||||||
|
|
||||||
The standard OpenAI `user` field is used to transport editor context as a JSON string. This allows the VS Code extension to send context without requiring API changes. Non-Brunix clients can omit `user` or set it to a plain string — both are handled gracefully.
|
**Example with curl:**
|
||||||
|
|
||||||
| Key in `user` JSON | Encoding | Description |
|
|
||||||
|---|---|---|
|
|
||||||
| `editor_content` | Base64 | Full content of the active editor file |
|
|
||||||
| `selected_text` | Base64 | Currently selected text in the editor |
|
|
||||||
| `extra_context` | Base64 | Free-form additional context |
|
|
||||||
| `user_info` | JSON object | `{"dev_id": int, "project_id": int, "org_id": int}` |
|
|
||||||
|
|
||||||
**Important:** `session_id` must be sent as a top-level field — never inside the `user` JSON. The proxy reads `session_id` exclusively from the dedicated field.
|
|
||||||
|
|
||||||
**Example — general query (no editor context):**
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
curl http://localhost:8000/v1/chat/completions \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{
|
-d '{
|
||||||
"model": "brunix",
|
"model": "brunix",
|
||||||
"messages": [{"role": "user", "content": "Que significa AVAP?"}],
|
"messages": [{"role": "user", "content": "Explain AVAP loops"}],
|
||||||
"stream": false,
|
"stream": true
|
||||||
"session_id": "test-001"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example — query with editor context (VS Code extension):**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"model": "brunix",
|
|
||||||
"messages": [{"role": "user", "content": "que hace este codigo?"}],
|
|
||||||
"stream": true,
|
|
||||||
"session_id": "test-001",
|
|
||||||
"user": "{\"editor_content\":\"<base64>\",\"selected_text\":\"<base64>\",\"extra_context\":\"\",\"user_info\":{\"dev_id\":1,\"project_id\":2,\"org_id\":3}}"
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example — empty editor context fields:**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:8000/v1/chat/completions \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"model": "brunix",
|
|
||||||
"messages": [{"role": "user", "content": "como funciona addVar?"}],
|
|
||||||
"stream": false,
|
|
||||||
"session_id": "test-002",
|
|
||||||
"user": "{\"editor_content\":\"\",\"selected_text\":\"\",\"extra_context\":\"\",\"user_info\":{}}"
|
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
# Brunix Assistance Engine — Architecture Reference
|
# Brunix Assistance Engine — Architecture Reference
|
||||||
|
|
||||||
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
|
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
|
||||||
> **Last updated:** 2026-03-20
|
> **Last updated:** 2026-03-18
|
||||||
> **Version:** 1.6.x
|
> **Version:** 1.5.x
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -13,15 +13,14 @@
|
||||||
3. [Request Lifecycle](#3-request-lifecycle)
|
3. [Request Lifecycle](#3-request-lifecycle)
|
||||||
4. [LangGraph Workflow](#4-langgraph-workflow)
|
4. [LangGraph Workflow](#4-langgraph-workflow)
|
||||||
5. [RAG Pipeline — Hybrid Search](#5-rag-pipeline--hybrid-search)
|
5. [RAG Pipeline — Hybrid Search](#5-rag-pipeline--hybrid-search)
|
||||||
6. [Editor Context Pipeline](#6-editor-context-pipeline)
|
6. [Streaming Architecture (AskAgentStream)](#6-streaming-architecture-askagentstream)
|
||||||
7. [Streaming Architecture (AskAgentStream)](#7-streaming-architecture-askagentstream)
|
7. [Evaluation Pipeline (EvaluateRAG)](#7-evaluation-pipeline-evaluaterag)
|
||||||
8. [Evaluation Pipeline (EvaluateRAG)](#8-evaluation-pipeline-evaluaterag)
|
8. [Data Ingestion Pipeline](#8-data-ingestion-pipeline)
|
||||||
9. [Data Ingestion Pipeline](#9-data-ingestion-pipeline)
|
9. [Infrastructure Layout](#9-infrastructure-layout)
|
||||||
10. [Infrastructure Layout](#10-infrastructure-layout)
|
10. [Session State & Conversation Memory](#10-session-state--conversation-memory)
|
||||||
11. [Session State & Conversation Memory](#11-session-state--conversation-memory)
|
11. [Observability Stack](#11-observability-stack)
|
||||||
12. [Observability Stack](#12-observability-stack)
|
12. [Security Boundaries](#12-security-boundaries)
|
||||||
13. [Security Boundaries](#13-security-boundaries)
|
13. [Known Limitations & Future Work](#13-known-limitations--future-work)
|
||||||
14. [Known Limitations & Future Work](#14-known-limitations--future-work)
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -34,7 +33,6 @@ The **Brunix Assistance Engine** is a stateful, streaming-capable AI service tha
|
||||||
- **Hybrid RAG** (BM25 + kNN with RRF fusion) over an Elasticsearch vector index
|
- **Hybrid RAG** (BM25 + kNN with RRF fusion) over an Elasticsearch vector index
|
||||||
- **Ollama** as the local LLM and embedding backend
|
- **Ollama** as the local LLM and embedding backend
|
||||||
- **RAGAS + Claude** as the automated evaluation judge
|
- **RAGAS + Claude** as the automated evaluation judge
|
||||||
- **Editor context injection** — the VS Code extension can send active file content and selected code alongside each query; the engine decides whether to use it based on the user's intent
|
|
||||||
|
|
||||||
A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI/Uvicorn, enabling integration with tools that expect the OpenAI API format.
|
A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI/Uvicorn, enabling integration with tools that expect the OpenAI API format.
|
||||||
|
|
||||||
|
|
@ -42,7 +40,6 @@ A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI
|
||||||
┌─────────────────────────────────────────────────────────────┐
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
│ External Clients │
|
│ External Clients │
|
||||||
│ grpcurl / App SDK │ OpenAI-compatible client │
|
│ grpcurl / App SDK │ OpenAI-compatible client │
|
||||||
│ VS Code extension │ (continue.dev, LiteLLM) │
|
|
||||||
└────────────┬────────────────┴──────────────┬────────────────┘
|
└────────────┬────────────────┴──────────────┬────────────────┘
|
||||||
│ gRPC :50052 │ HTTP :8000
|
│ gRPC :50052 │ HTTP :8000
|
||||||
▼ ▼
|
▼ ▼
|
||||||
|
|
@ -77,19 +74,19 @@ A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI
|
||||||
|
|
||||||
| Component | File / Service | Responsibility |
|
| Component | File / Service | Responsibility |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| **gRPC Server** | `Docker/src/server.py` | Entry point. Implements the `AssistanceEngine` servicer. Initializes LLM, embeddings, ES client, and both graphs. Decodes Base64 editor context fields from incoming requests. |
|
| **gRPC Server** | `Docker/src/server.py` | Entry point. Implements the `AssistanceEngine` servicer. Initializes LLM, embeddings, ES client, and both graphs. |
|
||||||
| **Full Graph** | `Docker/src/graph.py` → `build_graph()` | Complete workflow: classify → reformulate → retrieve → generate. Used by `AskAgent` and `EvaluateRAG`. |
|
| **Full Graph** | `Docker/src/graph.py` → `build_graph()` | Complete workflow: classify → reformulate → retrieve → generate. Used by `AskAgent` and `EvaluateRAG`. |
|
||||||
| **Prepare Graph** | `Docker/src/graph.py` → `build_prepare_graph()` | Partial workflow: classify → reformulate → retrieve. Does **not** call the LLM for generation. Used by `AskAgentStream` to enable manual token streaming. |
|
| **Prepare Graph** | `Docker/src/graph.py` → `build_prepare_graph()` | Partial workflow: classify → reformulate → retrieve. Does **not** call the LLM for generation. Used by `AskAgentStream` to enable manual token streaming. |
|
||||||
| **Message Builder** | `Docker/src/graph.py` → `build_final_messages()` | Reconstructs the final prompt list from prepared state for `llm.stream()`. Injects editor context when `use_editor_context` is `True`. |
|
| **Message Builder** | `Docker/src/graph.py` → `build_final_messages()` | Reconstructs the final prompt list from prepared state for `llm.stream()`. |
|
||||||
| **Prompt Library** | `Docker/src/prompts.py` | Centralized definitions for `CLASSIFY`, `REFORMULATE`, `GENERATE`, `CODE_GENERATION`, and `CONVERSATIONAL` prompts. |
|
| **Prompt Library** | `Docker/src/prompts.py` | Centralized definitions for `CLASSIFY`, `REFORMULATE`, `GENERATE`, `CODE_GENERATION`, and `CONVERSATIONAL` prompts. |
|
||||||
| **Agent State** | `Docker/src/state.py` | `AgentState` TypedDict shared across all graph nodes. Includes editor context fields and `use_editor_context` flag. |
|
| **Agent State** | `Docker/src/state.py` | `AgentState` TypedDict shared across all graph nodes. |
|
||||||
| **Evaluation Suite** | `Docker/src/evaluate.py` | RAGAS-based pipeline. Uses the production retriever + Ollama LLM for generation, and Claude as the impartial judge. |
|
| **Evaluation Suite** | `Docker/src/evaluate.py` | RAGAS-based pipeline. Uses the production retriever + Ollama LLM for generation, and Claude as the impartial judge. |
|
||||||
| **OpenAI Proxy** | `Docker/src/openai_proxy.py` | FastAPI application that wraps `AskAgent` / `AskAgentStream` under OpenAI and Ollama compatible endpoints. Parses editor context from the `user` field. |
|
| **OpenAI Proxy** | `Docker/src/openai_proxy.py` | FastAPI application that wraps `AskAgentStream` under an `/v1/chat/completions` endpoint. |
|
||||||
| **LLM Factory** | `Docker/src/utils/llm_factory.py` | Provider-agnostic factory for chat models (Ollama, AWS Bedrock). |
|
| **LLM Factory** | `Docker/src/utils/llm_factory.py` | Provider-agnostic factory for chat models (Ollama, AWS Bedrock). |
|
||||||
| **Embedding Factory** | `Docker/src/utils/emb_factory.py` | Provider-agnostic factory for embedding models (Ollama, HuggingFace). |
|
| **Embedding Factory** | `Docker/src/utils/emb_factory.py` | Provider-agnostic factory for embedding models (Ollama, HuggingFace). |
|
||||||
| **Ingestion Pipeline** | `scripts/pipelines/flows/elasticsearch_ingestion.py` | Chunks and ingests AVAP documents into Elasticsearch with embeddings. |
|
| **Ingestion Pipeline** | `scripts/pipelines/flows/elasticsearch_ingestion.py` | Chunks and ingests AVAP documents into Elasticsearch with embeddings. |
|
||||||
| **AVAP Chunker** | `scripts/pipelines/ingestion/avap_chunker.py` | Semantic chunker for `.avap` source files using `avap_config.json` as grammar. |
|
| **Dataset Generator** | `scripts/pipelines/flows/generate_mbap.py` | Generates synthetic MBPP-style AVAP problems using Claude. |
|
||||||
| **Unit Tests** | `Docker/tests/test_prd_0002.py` | 40 unit tests covering editor context parsing, Base64 decoding, classifier output, reformulate anchor, and injection logic. |
|
| **MBPP Translator** | `scripts/pipelines/flows/translate_mbpp.py` | Translates MBPP Python dataset into AVAP equivalents. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -98,21 +95,16 @@ A secondary **OpenAI-compatible HTTP proxy** (port `8000`) is served via FastAPI
|
||||||
### 3.1 `AskAgent` (non-streaming)
|
### 3.1 `AskAgent` (non-streaming)
|
||||||
|
|
||||||
```
|
```
|
||||||
Client → gRPC AgentRequest{query, session_id, editor_content*, selected_text*, extra_context*, user_info*}
|
Client → gRPC AgentRequest{query, session_id}
|
||||||
│ (* Base64-encoded; user_info is JSON string)
|
|
||||||
│
|
│
|
||||||
├─ Decode Base64 fields (editor_content, selected_text, extra_context)
|
|
||||||
├─ Load conversation history from session_store[session_id]
|
├─ Load conversation history from session_store[session_id]
|
||||||
├─ Build initial_state = {messages, session_id, editor_content, selected_text, extra_context, user_info}
|
├─ Build initial_state = {messages: history + [user_msg], ...}
|
||||||
│
|
│
|
||||||
└─ graph.invoke(initial_state)
|
└─ graph.invoke(initial_state)
|
||||||
├─ classify → query_type ∈ {RETRIEVAL, CODE_GENERATION, CONVERSATIONAL}
|
├─ classify → query_type ∈ {RETRIEVAL, CODE_GENERATION, CONVERSATIONAL}
|
||||||
│ use_editor_context ∈ {True, False}
|
├─ reformulate → reformulated_query (keyword-optimized for semantic search)
|
||||||
├─ reformulate → reformulated_query
|
├─ retrieve → context (top-8 hybrid RRF chunks from Elasticsearch)
|
||||||
│ (anchored to selected_text if use_editor_context=True)
|
└─ generate → final AIMessage (llm.invoke)
|
||||||
├─ retrieve → context (top-8 hybrid RRF chunks)
|
|
||||||
└─ generate → final AIMessage
|
|
||||||
(editor context injected only if use_editor_context=True)
|
|
||||||
│
|
│
|
||||||
├─ Persist updated history to session_store[session_id]
|
├─ Persist updated history to session_store[session_id]
|
||||||
└─ yield AgentResponse{text, avap_code="AVAP-2026", is_final=True}
|
└─ yield AgentResponse{text, avap_code="AVAP-2026", is_final=True}
|
||||||
|
|
@ -121,18 +113,17 @@ Client → gRPC AgentRequest{query, session_id, editor_content*, selected_text*,
|
||||||
### 3.2 `AskAgentStream` (token streaming)
|
### 3.2 `AskAgentStream` (token streaming)
|
||||||
|
|
||||||
```
|
```
|
||||||
Client → gRPC AgentRequest{query, session_id, editor_content*, selected_text*, extra_context*, user_info*}
|
Client → gRPC AgentRequest{query, session_id}
|
||||||
│
|
│
|
||||||
├─ Decode Base64 fields
|
|
||||||
├─ Load history from session_store[session_id]
|
├─ Load history from session_store[session_id]
|
||||||
├─ Build initial_state
|
├─ Build initial_state
|
||||||
│
|
│
|
||||||
├─ prepare_graph.invoke(initial_state) ← Phase 1: no LLM generation
|
├─ prepare_graph.invoke(initial_state) ← Phase 1: no LLM generation
|
||||||
│ ├─ classify → query_type + use_editor_context
|
│ ├─ classify
|
||||||
│ ├─ reformulate
|
│ ├─ reformulate
|
||||||
│ └─ retrieve (or skip_retrieve if CONVERSATIONAL)
|
│ └─ retrieve (or skip_retrieve if CONVERSATIONAL)
|
||||||
│
|
│
|
||||||
├─ build_final_messages(prepared_state) ← Reconstruct prompt with editor context if flagged
|
├─ build_final_messages(prepared_state) ← Reconstruct prompt list
|
||||||
│
|
│
|
||||||
└─ for chunk in llm.stream(final_messages):
|
└─ for chunk in llm.stream(final_messages):
|
||||||
└─ yield AgentResponse{text=token, is_final=False}
|
└─ yield AgentResponse{text=token, is_final=False}
|
||||||
|
|
@ -141,20 +132,7 @@ Client → gRPC AgentRequest{query, session_id, editor_content*, selected_text*,
|
||||||
└─ yield AgentResponse{text="", is_final=True}
|
└─ yield AgentResponse{text="", is_final=True}
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3.3 HTTP Proxy → gRPC
|
### 3.3 `EvaluateRAG`
|
||||||
|
|
||||||
```
|
|
||||||
Client → POST /v1/chat/completions {messages, stream, session_id, user}
|
|
||||||
│
|
|
||||||
├─ Extract query from last user message in messages[]
|
|
||||||
├─ Read session_id from dedicated field (NOT from user)
|
|
||||||
├─ Parse user field as JSON → {editor_content, selected_text, extra_context, user_info}
|
|
||||||
│
|
|
||||||
├─ stream=false → _invoke_blocking() → AskAgent gRPC call
|
|
||||||
└─ stream=true → _iter_stream() → AskAgentStream gRPC call → SSE token stream
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3.4 `EvaluateRAG`
|
|
||||||
|
|
||||||
```
|
```
|
||||||
Client → gRPC EvalRequest{category?, limit?, index?}
|
Client → gRPC EvalRequest{category?, limit?, index?}
|
||||||
|
|
@ -166,8 +144,9 @@ Client → gRPC EvalRequest{category?, limit?, index?}
|
||||||
│ ├─ retrieve_context (hybrid BM25+kNN, same as production)
|
│ ├─ retrieve_context (hybrid BM25+kNN, same as production)
|
||||||
│ └─ generate_answer (Ollama LLM + GENERATE_PROMPT)
|
│ └─ generate_answer (Ollama LLM + GENERATE_PROMPT)
|
||||||
├─ Build RAGAS Dataset
|
├─ Build RAGAS Dataset
|
||||||
├─ Run RAGAS metrics with Claude as judge
|
├─ Run RAGAS metrics with Claude as judge:
|
||||||
└─ Compute global_score + verdict
|
│ faithfulness / answer_relevancy / context_recall / context_precision
|
||||||
|
└─ Compute global_score + verdict (EXCELLENT / ACCEPTABLE / INSUFFICIENT)
|
||||||
│
|
│
|
||||||
└─ return EvalResponse{scores, global_score, verdict, details[]}
|
└─ return EvalResponse{scores, global_score, verdict, details[]}
|
||||||
```
|
```
|
||||||
|
|
@ -176,28 +155,11 @@ Client → gRPC EvalRequest{category?, limit?, index?}
|
||||||
|
|
||||||
## 4. LangGraph Workflow
|
## 4. LangGraph Workflow
|
||||||
|
|
||||||
### 4.1 Agent State
|
### 4.1 Full Graph (`build_graph`)
|
||||||
|
|
||||||
```python
|
|
||||||
class AgentState(TypedDict):
|
|
||||||
messages: Annotated[list, add_messages] # conversation history
|
|
||||||
session_id: str
|
|
||||||
query_type: str # RETRIEVAL | CODE_GENERATION | CONVERSATIONAL
|
|
||||||
reformulated_query: str
|
|
||||||
context: str # formatted RAG context string
|
|
||||||
editor_content: str # decoded from Base64
|
|
||||||
selected_text: str # decoded from Base64
|
|
||||||
extra_context: str # decoded from Base64
|
|
||||||
user_info: str # JSON string: {"dev_id", "project_id", "org_id"}
|
|
||||||
use_editor_context: bool # set by classifier — True only if query explicitly refers to editor
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4.2 Full Graph (`build_graph`)
|
|
||||||
|
|
||||||
```
|
```
|
||||||
┌─────────────┐
|
┌─────────────┐
|
||||||
│ classify │ ← sees: query + history + selected_text (if present)
|
│ classify │
|
||||||
│ │ outputs: query_type + use_editor_context
|
|
||||||
└──────┬──────┘
|
└──────┬──────┘
|
||||||
│
|
│
|
||||||
┌────────────────┼──────────────────┐
|
┌────────────────┼──────────────────┐
|
||||||
|
|
@ -208,12 +170,8 @@ class AgentState(TypedDict):
|
||||||
▼ ▼
|
▼ ▼
|
||||||
┌──────────────┐ ┌────────────────────────┐
|
┌──────────────┐ ┌────────────────────────┐
|
||||||
│ reformulate │ │ respond_conversational │
|
│ reformulate │ │ respond_conversational │
|
||||||
│ │ └───────────┬────────────┘
|
└──────┬───────┘ └───────────┬────────────┘
|
||||||
│ if use_editor│ │
|
▼ │
|
||||||
│ anchor query │ │
|
|
||||||
│ to selected │ │
|
|
||||||
└──────┬───────┘ │
|
|
||||||
▼ │
|
|
||||||
┌──────────────┐ │
|
┌──────────────┐ │
|
||||||
│ retrieve │ │
|
│ retrieve │ │
|
||||||
└──────┬───────┘ │
|
└──────┬───────┘ │
|
||||||
|
|
@ -222,54 +180,24 @@ class AgentState(TypedDict):
|
||||||
▼ ▼ │
|
▼ ▼ │
|
||||||
┌──────────┐ ┌───────────────┐ │
|
┌──────────┐ ┌───────────────┐ │
|
||||||
│ generate │ │ generate_code │ │
|
│ generate │ │ generate_code │ │
|
||||||
│ │ │ │ │
|
└────┬─────┘ └───────┬───────┘ │
|
||||||
│ injects │ │ injects editor│ │
|
|
||||||
│ editor │ │ context only │ │
|
|
||||||
│ context │ │ if flag=True │ │
|
|
||||||
│ if flag │ └───────┬───────┘ │
|
|
||||||
└────┬─────┘ │ │
|
|
||||||
│ │ │
|
│ │ │
|
||||||
└────────────────────┴────────────────┘
|
└────────────────────┴────────────────┘
|
||||||
│
|
│
|
||||||
END
|
END
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.3 Prepare Graph (`build_prepare_graph`)
|
### 4.2 Prepare Graph (`build_prepare_graph`)
|
||||||
|
|
||||||
Identical routing for classify, but generation nodes are replaced by `END`. The `CONVERSATIONAL` branch uses `skip_retrieve` (returns empty context). The `use_editor_context` flag is set here and carried forward into `build_final_messages`.
|
Identical routing for classify, but generation nodes are replaced by `END`. The `CONVERSATIONAL` branch uses `skip_retrieve` (returns empty context without querying Elasticsearch).
|
||||||
|
|
||||||
### 4.4 Classifier — Two-Token Output
|
### 4.3 Query Type Routing
|
||||||
|
|
||||||
The classifier outputs exactly two tokens separated by a space:
|
| `query_type` | Triggers retrieve? | Generation prompt |
|
||||||
|
|---|---|---|
|
||||||
```
|
| `RETRIEVAL` | Yes | `GENERATE_PROMPT` (explanation-focused) |
|
||||||
<query_type> <editor_signal>
|
| `CODE_GENERATION` | Yes | `CODE_GENERATION_PROMPT` (code-focused, returns AVAP blocks) |
|
||||||
|
| `CONVERSATIONAL` | No | `CONVERSATIONAL_PROMPT` (reformulation of prior answer) |
|
||||||
Examples:
|
|
||||||
RETRIEVAL NO_EDITOR
|
|
||||||
CODE_GENERATION EDITOR
|
|
||||||
CONVERSATIONAL NO_EDITOR
|
|
||||||
```
|
|
||||||
|
|
||||||
`EDITOR` is set only when the user message explicitly refers to editor code using expressions like "this code", "este codigo", "fix this", "que hace esto", "explain this", etc. General AVAP questions, code generation requests, and conversational follow-ups always return `NO_EDITOR`.
|
|
||||||
|
|
||||||
### 4.5 Query Type Routing
|
|
||||||
|
|
||||||
| `query_type` | Triggers retrieve? | Generation prompt | Editor context injected? |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `RETRIEVAL` | Yes | `GENERATE_PROMPT` | Only if `use_editor_context=True` |
|
|
||||||
| `CODE_GENERATION` | Yes | `CODE_GENERATION_PROMPT` | Only if `use_editor_context=True` |
|
|
||||||
| `CONVERSATIONAL` | No | `CONVERSATIONAL_PROMPT` | Never |
|
|
||||||
|
|
||||||
### 4.6 Reformulator — Mode-Aware & Language-Preserving
|
|
||||||
|
|
||||||
The reformulator receives `[MODE: <query_type>]` prepended to the query:
|
|
||||||
|
|
||||||
- **MODE RETRIEVAL:** Compresses the query into compact keywords. Does NOT expand with AVAP commands. Preserves original language — Spanish queries stay in Spanish, English queries stay in English.
|
|
||||||
- **MODE CODE_GENERATION:** Applies the AVAP command expansion mapping (registerEndpoint, addParam, ormAccessSelect, etc.).
|
|
||||||
- **MODE CONVERSATIONAL:** Returns the query as-is.
|
|
||||||
|
|
||||||
Language preservation is critical for BM25 retrieval — the AVAP LRM is written in Spanish, so a Spanish query must reach the retriever in Spanish for lexical matching to work correctly.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -278,14 +206,12 @@ Language preservation is critical for BM25 retrieval — the AVAP LRM is written
|
||||||
The retrieval system (`hybrid_search_native`) fuses BM25 lexical search and kNN dense vector search using **Reciprocal Rank Fusion (RRF)**.
|
The retrieval system (`hybrid_search_native`) fuses BM25 lexical search and kNN dense vector search using **Reciprocal Rank Fusion (RRF)**.
|
||||||
|
|
||||||
```
|
```
|
||||||
User query (reformulated, language-preserved)
|
User query
|
||||||
│
|
│
|
||||||
├─ embeddings.embed_query(query) → query_vector [1024-dim]
|
├─ embeddings.embed_query(query) → query_vector [768-dim]
|
||||||
│
|
│
|
||||||
├─ ES bool query:
|
├─ ES multi_match (BM25) on fields [content^2, text^2]
|
||||||
│ ├─ must: multi_match (BM25) on [content^2, text^2]
|
│ └─ top-k BM25 hits
|
||||||
│ └─ should: boost spec/narrative doc_types (2.0x / 1.5x)
|
|
||||||
│ └─ top-k BM25 hits
|
|
||||||
│
|
│
|
||||||
└─ ES knn on field [embedding], num_candidates = k×5
|
└─ ES knn on field [embedding], num_candidates = k×5
|
||||||
└─ top-k kNN hits
|
└─ top-k kNN hits
|
||||||
|
|
@ -295,9 +221,7 @@ User query (reformulated, language-preserved)
|
||||||
└─ Top-8 documents → format_context() → context string
|
└─ Top-8 documents → format_context() → context string
|
||||||
```
|
```
|
||||||
|
|
||||||
**RRF constant:** `60` (standard value).
|
**RRF constant:** `60` (standard value; prevents high-rank documents from dominating while still rewarding consensus between both retrieval modes).
|
||||||
|
|
||||||
**doc_type boost:** `spec` and `narrative` chunks receive a score boost in the BM25 query to prioritize definitional and explanatory content over raw code examples when the query is about meaning or documentation.
|
|
||||||
|
|
||||||
**Chunk metadata** attached to each retrieved document:
|
**Chunk metadata** attached to each retrieved document:
|
||||||
|
|
||||||
|
|
@ -305,96 +229,36 @@ User query (reformulated, language-preserved)
|
||||||
|---|---|
|
|---|---|
|
||||||
| `chunk_id` | Unique identifier within the index |
|
| `chunk_id` | Unique identifier within the index |
|
||||||
| `source_file` | Origin document filename |
|
| `source_file` | Origin document filename |
|
||||||
| `doc_type` | `spec`, `code`, `code_example`, `bnf` |
|
| `doc_type` | `prose`, `code`, `code_example`, `bnf` |
|
||||||
| `block_type` | AVAP block type: `narrative`, `function`, `if`, `startLoop`, `try`, etc. |
|
| `block_type` | AVAP block type: `function`, `if`, `startLoop`, `try` |
|
||||||
| `section` | Document section/chapter heading |
|
| `section` | Document section/chapter heading |
|
||||||
|
|
||||||
---
|
Documents of type `code`, `code_example`, `bnf`, or block type `function / if / startLoop / try` are tagged as `[AVAP CODE]` in the formatted context, signaling the LLM to treat them as executable syntax rather than prose.
|
||||||
|
|
||||||
## 6. Editor Context Pipeline
|
|
||||||
|
|
||||||
The editor context pipeline (PRD-0002) allows the VS Code extension to send the user's active editor state alongside every query. The engine uses this context only when the user explicitly refers to their code.
|
|
||||||
|
|
||||||
### Transport
|
|
||||||
|
|
||||||
Editor context travels differently depending on the client protocol:
|
|
||||||
|
|
||||||
**Via gRPC directly (`AgentRequest` fields 3–6):**
|
|
||||||
- `editor_content` (field 3) — Base64-encoded full file content
|
|
||||||
- `selected_text` (field 4) — Base64-encoded selected text
|
|
||||||
- `extra_context` (field 5) — Base64-encoded free-form context
|
|
||||||
- `user_info` (field 6) — JSON string `{"dev_id":…,"project_id":…,"org_id":…}`
|
|
||||||
|
|
||||||
**Via HTTP proxy (OpenAI `/v1/chat/completions`):**
|
|
||||||
- Transported in the standard `user` field as a JSON string
|
|
||||||
- Same four keys, same encodings
|
|
||||||
- The proxy parses, extracts, and forwards to gRPC
|
|
||||||
|
|
||||||
### Pipeline
|
|
||||||
|
|
||||||
```
|
|
||||||
AgentRequest arrives
|
|
||||||
│
|
|
||||||
├─ server.py: Base64 decode editor_content, selected_text, extra_context
|
|
||||||
├─ user_info passed as-is (JSON string)
|
|
||||||
│
|
|
||||||
└─ initial_state populated with all four fields
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
classify node:
|
|
||||||
├─ If selected_text present → injected into classify prompt as <editor_selection>
|
|
||||||
├─ LLM outputs: RETRIEVAL EDITOR or RETRIEVAL NO_EDITOR (etc.)
|
|
||||||
└─ use_editor_context = True if second token == EDITOR
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
reformulate node:
|
|
||||||
├─ If use_editor_context=True AND selected_text present:
|
|
||||||
│ anchor = selected_text + "\n\nUser question: " + query
|
|
||||||
│ → LLM reformulates using selected code as primary signal
|
|
||||||
└─ Else: reformulate query as normal
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
retrieve node: (unchanged — uses reformulated_query)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
generate / generate_code node:
|
|
||||||
├─ If use_editor_context=True:
|
|
||||||
│ prompt = <selected_code> + <editor_file> + <extra_context> + RAG_prompt
|
|
||||||
│ Priority: selected_text > editor_content > RAG context > extra_context
|
|
||||||
└─ Else: standard RAG prompt — no editor content injected
|
|
||||||
```
|
|
||||||
|
|
||||||
### Intent detection examples
|
|
||||||
|
|
||||||
| User message | `use_editor_context` | Reason |
|
|
||||||
|---|---|---|
|
|
||||||
| "Que significa AVAP?" | `False` | General definition question |
|
|
||||||
| "dame un API de hello world" | `False` | Code generation, no editor reference |
|
|
||||||
| "que hace este codigo?" | `True` | Explicit reference to "this code" |
|
|
||||||
| "fix this" | `True` | Explicit reference to current selection |
|
|
||||||
| "como mejoro esto?" | `True` | Explicit reference to current context |
|
|
||||||
| "how does addVar work?" | `False` | Documentation question, no editor reference |
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Streaming Architecture (AskAgentStream)
|
## 6. Streaming Architecture (AskAgentStream)
|
||||||
|
|
||||||
The two-phase streaming design is critical to understand:
|
The two-phase streaming design is critical to understand:
|
||||||
|
|
||||||
**Why not stream through LangGraph?**
|
**Why not stream through LangGraph?**
|
||||||
LangGraph's `stream()` method yields full state snapshots per node, not individual tokens. To achieve true per-token streaming to the gRPC client, the generation step is deliberately extracted from the graph and called directly via `llm.stream()`.
|
LangGraph's `stream()` method yields full state snapshots per node, not individual tokens. To achieve true per-token streaming to the gRPC client, the generation step is deliberately extracted from the graph and called directly via `llm.stream()`.
|
||||||
|
|
||||||
**Phase 1 — Deterministic preparation (graph-managed):**
|
**Phase 1 — Deterministic preparation (graph-managed):**
|
||||||
Classification, query reformulation, and retrieval run through `prepare_graph.invoke()`. This phase runs synchronously and produces the complete context before any token is emitted to the client. Editor context classification also happens here — `use_editor_context` is set in the prepared state.
|
- Classification, query reformulation, and retrieval run through `prepare_graph.invoke()`.
|
||||||
|
- This phase runs synchronously and produces the complete context before any token is emitted to the client.
|
||||||
|
|
||||||
**Phase 2 — Token streaming (manual):**
|
**Phase 2 — Token streaming (manual):**
|
||||||
`build_final_messages()` reconstructs the exact prompt, injecting editor context if `use_editor_context` is `True`. `llm.stream(final_messages)` yields one `AIMessageChunk` per token from Ollama. Each token is immediately forwarded as `AgentResponse{text=token, is_final=False}`.
|
- `build_final_messages()` reconstructs the exact prompt that `generate` / `generate_code` / `respond_conversational` would have used.
|
||||||
|
- `llm.stream(final_messages)` yields one `AIMessageChunk` per token from Ollama.
|
||||||
|
- Each token is immediately forwarded to the gRPC client as `AgentResponse{text=token, is_final=False}`.
|
||||||
|
- After the stream ends, the full assembled text is persisted to `session_store`.
|
||||||
|
|
||||||
**Backpressure:** gRPC streaming is flow-controlled by the client. If the client stops reading, the Ollama token stream will block at the `yield` point.
|
**Backpressure:** gRPC streaming is flow-controlled by the client. If the client stops reading, the Ollama token stream will block at the `yield` point. No explicit buffer overflow protection is implemented (acceptable for the current single-client dev mode).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 8. Evaluation Pipeline (EvaluateRAG)
|
## 7. Evaluation Pipeline (EvaluateRAG)
|
||||||
|
|
||||||
The evaluation suite implements an **offline RAG evaluation** pattern using RAGAS metrics.
|
The evaluation suite implements an **offline RAG evaluation** pattern using RAGAS metrics.
|
||||||
|
|
||||||
|
|
@ -424,7 +288,7 @@ verdict:
|
||||||
|
|
||||||
### Golden dataset
|
### Golden dataset
|
||||||
|
|
||||||
Located at `Docker/src/golden_dataset.json`. Each entry:
|
Located at `Docker/src/golden_dataset.json`. Each entry follows this schema:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|
@ -435,11 +299,9 @@ Located at `Docker/src/golden_dataset.json`. Each entry:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
> **Note:** The golden dataset does not include editor-context queries. EvaluateRAG measures the RAG pipeline in isolation. A separate editor-context golden dataset is planned as future work once the VS Code extension is validated.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 9. Data Ingestion Pipeline
|
## 8. Data Ingestion Pipeline
|
||||||
|
|
||||||
Documents flow into the Elasticsearch index through two paths:
|
Documents flow into the Elasticsearch index through two paths:
|
||||||
|
|
||||||
|
|
@ -455,29 +317,31 @@ scripts/pipelines/flows/elasticsearch_ingestion.py
|
||||||
│
|
│
|
||||||
├─ Load markdown files
|
├─ Load markdown files
|
||||||
├─ Chunk using scripts/pipelines/tasks/chunk.py
|
├─ Chunk using scripts/pipelines/tasks/chunk.py
|
||||||
|
│ (semantic chunking via Chonkie library)
|
||||||
├─ Generate embeddings via scripts/pipelines/tasks/embeddings.py
|
├─ Generate embeddings via scripts/pipelines/tasks/embeddings.py
|
||||||
|
│ (Ollama or HuggingFace embedding model)
|
||||||
└─ Bulk index into Elasticsearch
|
└─ Bulk index into Elasticsearch
|
||||||
|
index: avap-docs-* (configurable via ELASTICSEARCH_INDEX)
|
||||||
|
mapping: {content, embedding, source_file, doc_type, section, ...}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Path B — AVAP native code chunker
|
### Path B — Synthetic AVAP code samples
|
||||||
|
|
||||||
```
|
```
|
||||||
docs/samples/*.avap
|
docs/samples/*.avap
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
scripts/pipelines/ingestion/avap_chunker.py
|
scripts/pipelines/flows/generate_mbap.py
|
||||||
│ (grammar: scripts/pipelines/ingestion/avap_config.json v2.0)
|
|
||||||
│
|
│
|
||||||
├─ Lexer strips comments and string contents
|
├─ Read AVAP LRM (docs/LRM/avap.md)
|
||||||
├─ Block detection (function, if, startLoop, try)
|
├─ Call Claude API to generate MBPP-style problems
|
||||||
├─ Statement classification (30 types + catch-all)
|
└─ Output synthetic_datasets/mbpp_avap.json
|
||||||
├─ Semantic tag assignment (18 boolean tags)
|
(used for fine-tuning and few-shot examples)
|
||||||
└─ Output: JSONL chunks → avap_ingestor.py → Elasticsearch
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Infrastructure Layout
|
## 9. Infrastructure Layout
|
||||||
|
|
||||||
### Devaron Cluster (Vultr Kubernetes)
|
### Devaron Cluster (Vultr Kubernetes)
|
||||||
|
|
||||||
|
|
@ -488,6 +352,22 @@ scripts/pipelines/ingestion/avap_chunker.py
|
||||||
| Observability DB | `brunix-postgres` | `5432` | PostgreSQL for Langfuse |
|
| Observability DB | `brunix-postgres` | `5432` | PostgreSQL for Langfuse |
|
||||||
| Langfuse UI | — | `80` | `http://45.77.119.180` |
|
| Langfuse UI | — | `80` | `http://45.77.119.180` |
|
||||||
|
|
||||||
|
### Kubernetes tunnel commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Terminal 1 — LLM
|
||||||
|
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 \
|
||||||
|
-n brunix --kubeconfig ./kubernetes/kubeconfig.yaml
|
||||||
|
|
||||||
|
# Terminal 2 — Elasticsearch
|
||||||
|
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 \
|
||||||
|
-n brunix --kubeconfig ./kubernetes/kubeconfig.yaml
|
||||||
|
|
||||||
|
# Terminal 3 — PostgreSQL (Langfuse)
|
||||||
|
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 \
|
||||||
|
-n brunix --kubeconfig ./kubernetes/kubeconfig.yaml
|
||||||
|
```
|
||||||
|
|
||||||
### Port map summary
|
### Port map summary
|
||||||
|
|
||||||
| Port | Protocol | Service | Scope |
|
| Port | Protocol | Service | Scope |
|
||||||
|
|
@ -501,7 +381,7 @@ scripts/pipelines/ingestion/avap_chunker.py
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 11. Session State & Conversation Memory
|
## 10. Session State & Conversation Memory
|
||||||
|
|
||||||
Conversation history is managed via an in-process dictionary:
|
Conversation history is managed via an in-process dictionary:
|
||||||
|
|
||||||
|
|
@ -515,63 +395,69 @@ session_store: dict[str, list] = defaultdict(list)
|
||||||
- **In-memory only.** History is lost on container restart.
|
- **In-memory only.** History is lost on container restart.
|
||||||
- **No TTL or eviction.** Sessions grow unbounded for the lifetime of the process.
|
- **No TTL or eviction.** Sessions grow unbounded for the lifetime of the process.
|
||||||
- **Thread safety:** Python's GIL provides basic safety for the `ThreadPoolExecutor(max_workers=10)` gRPC server, but concurrent writes to the same `session_id` from two simultaneous requests are not explicitly protected.
|
- **Thread safety:** Python's GIL provides basic safety for the `ThreadPoolExecutor(max_workers=10)` gRPC server, but concurrent writes to the same `session_id` from two simultaneous requests are not explicitly protected.
|
||||||
- **History window:** `format_history_for_classify()` uses only the last 6 messages for query classification.
|
- **History window:** `format_history_for_classify()` uses only the last 6 messages for query classification to keep the classify prompt short and deterministic.
|
||||||
|
|
||||||
> **Future work:** Replace `session_store` with a Redis-backed persistent store to survive restarts and support horizontal scaling.
|
> **Future work:** Replace `session_store` with a Redis-backed persistent store to survive restarts and support horizontal scaling.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 12. Observability Stack
|
## 11. Observability Stack
|
||||||
|
|
||||||
### Langfuse tracing
|
### Langfuse tracing
|
||||||
|
|
||||||
Every `AskAgent` / `AskAgentStream` request creates a trace capturing input query, session ID, each LangGraph node execution, LLM token counts, latency, and final response.
|
The server integrates Langfuse for end-to-end LLM tracing. Every `AskAgent` / `AskAgentStream` request creates a trace that captures:
|
||||||
|
- Input query and session ID
|
||||||
|
- Each LangGraph node execution (classify, reformulate, retrieve, generate)
|
||||||
|
- LLM token counts, latency, and cost
|
||||||
|
- Final response
|
||||||
|
|
||||||
**Access:** `http://45.77.119.180`
|
**Access:** `http://45.77.119.180` — requires a project API key configured via `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`.
|
||||||
|
|
||||||
### Logging
|
### Logging
|
||||||
|
|
||||||
|
Structured logging via Python's `logging` module, configured at `INFO` level. Log format:
|
||||||
|
|
||||||
|
```
|
||||||
|
[MODULE] context_info — key=value key=value
|
||||||
|
```
|
||||||
|
|
||||||
Key log markers:
|
Key log markers:
|
||||||
|
|
||||||
| Marker | Module | Meaning |
|
| Marker | Module | Meaning |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `[ESEARCH]` | `server.py` | Elasticsearch connection status |
|
| `[ESEARCH]` | `server.py` | Elasticsearch connection status |
|
||||||
| `[classify]` | `graph.py` | Query type + `use_editor_context` flag + raw LLM output |
|
| `[classify]` | `graph.py` | Query type decision + raw LLM output |
|
||||||
| `[reformulate]` | `graph.py` | Reformulated query string + whether selected_text was used as anchor |
|
| `[reformulate]` | `graph.py` | Reformulated query string |
|
||||||
| `[hybrid]` | `graph.py` | BM25 / kNN hit counts and RRF result count |
|
| `[hybrid]` | `graph.py` | BM25 / kNN hit counts and RRF result count |
|
||||||
| `[retrieve]` | `graph.py` | Number of docs retrieved and context length |
|
| `[retrieve]` | `graph.py` | Number of docs retrieved and context length |
|
||||||
| `[generate]` | `graph.py` | Response character count |
|
| `[generate]` | `graph.py` | Response character count |
|
||||||
| `[AskAgent]` | `server.py` | editor and selected flags, query preview |
|
|
||||||
| `[AskAgentStream]` | `server.py` | Token count and total chars per stream |
|
| `[AskAgentStream]` | `server.py` | Token count and total chars per stream |
|
||||||
| `[base64]` | `server.py` | Warning when a Base64 field fails to decode |
|
| `[eval]` | `evaluate.py` | Per-question retrieval and generation status |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 13. Security Boundaries
|
## 12. Security Boundaries
|
||||||
|
|
||||||
| Boundary | Current state | Risk |
|
| Boundary | Current state | Risk |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| gRPC transport | **Insecure** (`add_insecure_port`) | Network interception possible. Acceptable in dev/tunnel setup; requires mTLS for production. |
|
| gRPC transport | **Insecure** (`add_insecure_port`) | Network interception possible. Acceptable in dev/tunnel setup; requires mTLS for production. |
|
||||||
| Elasticsearch auth | Optional (user/pass or API key via env vars) | Index is accessible without auth if vars are unset. |
|
| Elasticsearch auth | Optional (user/pass or API key via env vars) | Index is accessible without auth if `ELASTICSEARCH_USER` and `ELASTICSEARCH_API_KEY` are unset. |
|
||||||
| Editor context | Transmitted in plaintext (Base64 is encoding, not encryption) | File contents visible to anyone intercepting gRPC traffic. Requires TLS for production. |
|
|
||||||
| Container user | Non-root (`python:3.11-slim` default) | Low risk. Do not override with `root`. |
|
| Container user | Non-root (`python:3.11-slim` default) | Low risk. Do not override with `root`. |
|
||||||
| Secrets in env | Via `.env` / `docker-compose` env injection | Never commit real values. |
|
| Secrets in env | Via `.env` / `docker-compose` env injection | Never commit real values. See [CONTRIBUTING.md](../CONTRIBUTING.md#6-environment-variables-policy). |
|
||||||
| Session store | In-memory, no auth | Any caller with gRPC access can read/write any session by guessing its ID. |
|
| Session store | In-memory, no auth | Any caller with access to the gRPC port can read/write any session by guessing its ID. |
|
||||||
| `user_info` | JSON string, no validation | `dev_id`, `project_id`, `org_id` are not authenticated — passed as metadata only. |
|
| Kubeconfig | `./kubernetes/kubeconfig.yaml` (local only) | Grants cluster access. Never commit. Listed in `.gitignore`. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 14. Known Limitations & Future Work
|
## 13. Known Limitations & Future Work
|
||||||
|
|
||||||
| Area | Limitation | Proposed solution |
|
| Area | Limitation | Proposed solution |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| Session persistence | In-memory, lost on restart | Redis-backed `session_store` |
|
| Session persistence | In-memory, lost on restart | Redis-backed `session_store` |
|
||||||
| Horizontal scaling | `session_store` is per-process | Sticky sessions or external session store |
|
| Horizontal scaling | `session_store` is per-process | Sticky sessions or external session store |
|
||||||
| gRPC security | Insecure port | Add TLS + optional mTLS |
|
| gRPC security | Insecure port | Add TLS + optional mTLS |
|
||||||
| Editor context security | Base64 is not encryption | TLS required before sending real file contents |
|
|
||||||
| `user_info` auth | Not validated or authenticated | JWT or API key validation on `user_info` fields |
|
|
||||||
| Elasticsearch auth | Not enforced if vars unset | Make auth required; fail-fast on startup |
|
| Elasticsearch auth | Not enforced if vars unset | Make auth required; fail-fast on startup |
|
||||||
| Context window | Full history passed to generate; no truncation | Sliding window or summarization for long sessions |
|
| Context window | Full history passed to generate; no truncation | Sliding window or summarization for long sessions |
|
||||||
| Evaluation | Golden dataset has no editor-context queries | Build dedicated editor-context golden dataset after VS Code validation |
|
| Evaluation | Golden dataset must be manually maintained | Automated golden dataset refresh pipeline |
|
||||||
| Rate limiting | None on gRPC server | Add interceptor-based rate limiter |
|
| Rate limiting | None on gRPC server | Add interceptor-based rate limiter |
|
||||||
| Health check | No gRPC health protocol | Implement `grpc.health.v1` |
|
| Health check | No gRPC health protocol | Implement `grpc.health.v1` |
|
||||||
|
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
nivel = 5
|
|
||||||
es_admin = nivel >= 10
|
|
||||||
addResult(es_admin)
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
subtotal = 150.50
|
|
||||||
iva = subtotal * 0.21
|
|
||||||
total = subtotal + iva
|
|
||||||
addResult(total)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
startLoop(i,1,10)
|
|
||||||
item = "item_%s" % i
|
|
||||||
AddvariableToJSON(item,'valor_generado',mi_json)
|
|
||||||
endLoop()
|
|
||||||
addResult(mi_json)
|
|
||||||
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
registros = ['1','2','3']
|
|
||||||
getListLen(registros, total)
|
|
||||||
contador = 0
|
|
||||||
startLoop(idx, 0, 2)
|
|
||||||
actual = registros[int(idx)]
|
|
||||||
endLoop()
|
|
||||||
addResult(actual)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
getDateTime("", 86400, "UTC", expira)
|
|
||||||
addResult(expira)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
addParam("client_id", id_interno)
|
|
||||||
addResult(id_interno)
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
addParam(emails,emails)
|
|
||||||
getQueryParamList(lista_correos)
|
|
||||||
addResult(lista_correos)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
addParam("lang", l)
|
|
||||||
addParam("lang2", l2)
|
|
||||||
if(l, "es", "=")
|
|
||||||
addVar(msg, "Hola")
|
|
||||||
end()
|
|
||||||
addResult(msg)
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
nombre = "Sistema"
|
|
||||||
log = "Evento registrado por: %s" % nombre
|
|
||||||
addResult(log)
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
datos_cliente = "datos"
|
|
||||||
addVar(clave, "cliente_vip")
|
|
||||||
AddvariableToJSON(clave, datos_cliente, mi_json_final)
|
|
||||||
addResult(mi_json_final)
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
addParam("data_list", mi_lista)
|
|
||||||
getListLen(mi_lista, cantidad)
|
|
||||||
addResult(cantidad)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
stampToDatetime(1708726162, "%d/%m/%Y", 0, fecha_human)
|
|
||||||
addResult(fecha_human)
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
addParam(sal_par,saldo)
|
|
||||||
if(saldo, 0, ">")
|
|
||||||
permitir = True
|
|
||||||
else()
|
|
||||||
permitir = False
|
|
||||||
end()
|
|
||||||
addResult(permitir)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
addParam(userrype, user_type)
|
|
||||||
addParam(sells, compras)
|
|
||||||
if(None, None, " user_type == 'VIP' or compras > 100")
|
|
||||||
addVar(descuento, 0.20)
|
|
||||||
end()
|
|
||||||
addResult(descuento)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
getDateTime("%Y-%m-%d %H:%M:%S", 0, "Europe/Madrid", sql_date)
|
|
||||||
addResult(sql_date)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
function suma(a, b){
|
|
||||||
total = a + b
|
|
||||||
return(total)
|
|
||||||
}
|
|
||||||
resultado = suma(10, 20)
|
|
||||||
addResult(resultado)
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
function es_valido(token){
|
|
||||||
response = False
|
|
||||||
if(token, "SECRET", "=")
|
|
||||||
response = True
|
|
||||||
end()
|
|
||||||
return(response)
|
|
||||||
}
|
|
||||||
autorizado = es_valido("SECRET")
|
|
||||||
addResult(autorizado)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
randomString("[A-Z]\d", 32, token_seguridad)
|
|
||||||
addResult(token_seguridad)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
encodeSHA256("payload_data", checksum)
|
|
||||||
addResult(checksum)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
addVar(mensaje, "Hola mundo desde AVAP")
|
|
||||||
addResult(mensaje)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
addParam(password,pass_nueva)
|
|
||||||
pass_antigua = "password"
|
|
||||||
if(pass_nueva, pass_antigua, "!=")
|
|
||||||
addVar(cambio, "Contraseña actualizada")
|
|
||||||
end()
|
|
||||||
addResult(cambio)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
replace("REF_1234_OLD","OLD", "NEW", ref_actualizada)
|
|
||||||
addResult(ref_actualizada)
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
try()
|
|
||||||
ormDirect("UPDATE table_inexistente SET a=1", res)
|
|
||||||
exception(e)
|
|
||||||
addVar(_status,500)
|
|
||||||
addResult("Error de base de datos")
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
getDateTime("", 0, "UTC", ahora)
|
|
||||||
addResult(ahora)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
ormCheckTable(tabla_pruebas,resultado_comprobacion)
|
|
||||||
if(resultado_comprobacion,False,'==')
|
|
||||||
ormCreateTable("username,age",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)
|
|
||||||
end()
|
|
||||||
addResult(resultado_comprobacion)
|
|
||||||
addResult(resultado_creacion)
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
addParam("page", p)
|
|
||||||
addParam("size", s)
|
|
||||||
registros = ["u1", "u2", "u3", "u4", "u5", "u6"]
|
|
||||||
offset = int(p) * int(s)
|
|
||||||
limite = offset + int(s)
|
|
||||||
contador = 0
|
|
||||||
addResult(offset)
|
|
||||||
addResult(limite)
|
|
||||||
startLoop(i, 2, limite)
|
|
||||||
actual = registros[int(i)]
|
|
||||||
titulo = "reg_%s" % i
|
|
||||||
AddvariableToJSON(titulo, actual, pagina_json)
|
|
||||||
endLoop()
|
|
||||||
addResult(pagina_json)
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
addVar(base, 1000)
|
|
||||||
addVar(copia, $base)
|
|
||||||
addResult(copia)
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
addParam("password_base", password_base)
|
|
||||||
replace(password_base, "a", "@", temp1)
|
|
||||||
replace(temp1, "e", "3", temp2)
|
|
||||||
replace(temp2, "o", "0", temp3)
|
|
||||||
replace(temp3, "i", "!", modified_password)
|
|
||||||
randomString("[a-zA-Z0-9]", 4, suffix)
|
|
||||||
addVar(final_password, modified_password)
|
|
||||||
final_password = final_password + suffix
|
|
||||||
addResult(final_password)
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
addVar(code, 200)
|
|
||||||
addVar(status, "Success")
|
|
||||||
addResult(code)
|
|
||||||
addResult(status)
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
encontrado = False
|
|
||||||
startLoop(i, 1, 10)
|
|
||||||
if(i, 5, "==")
|
|
||||||
encontrado = True
|
|
||||||
i = 11
|
|
||||||
end()
|
|
||||||
endLoop()
|
|
||||||
addResult(encontrado)
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
addParam("password", password)
|
|
||||||
encodeSHA256(password, hashed_password)
|
|
||||||
randomString("[a-zA-Z0-9]", 32, secure_token)
|
|
||||||
addResult(hashed_password)
|
|
||||||
addResult(secure_token)
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
try()
|
|
||||||
RequestGet("https://api.test.com/data", 0, 0, respuesta)
|
|
||||||
exception(e)
|
|
||||||
addVar(error_trace, "Fallo de conexión: %s" % e)
|
|
||||||
addResult(error_trace)
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
addParam("api_key", key)
|
|
||||||
if(key, None, "==")
|
|
||||||
addVar(_status, 403)
|
|
||||||
addVar(error, "Acceso denegado: falta API KEY")
|
|
||||||
addResult(error)
|
|
||||||
end()
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
stub(addResult(error), 5) => {}
|
|
||||||
assert(addResult(error), 5): {}
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
addParam("rol", r)
|
|
||||||
if(r, ["admin", "editor", "root"], "in")
|
|
||||||
acceso = True
|
|
||||||
end()
|
|
||||||
addResult(acceso)
|
|
||||||
|
|
@ -1,178 +0,0 @@
|
||||||
# RP-0001: Pre-Implementation Validation for ADR-0007 (MSVL)
|
|
||||||
|
|
||||||
**Date:** 2026-04-06
|
|
||||||
**Status:** Proposed
|
|
||||||
**Author:** Rafael Ruiz (CTO)
|
|
||||||
**Executed by:** AI Team (Pablo)
|
|
||||||
**Related ADR:** ADR-0007 (Mandatory Syntactic Validation Layer)
|
|
||||||
**Input data:** 6 evaluation runs from 2026-04-06 (`evaluation_*.json`)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
ADR-0007 defines four implementation decisions: parser gRPC integration, dynamic few-shot injection, N=5 Docker protocol, and syntactic confusion matrix. Before assigning engineering work to any of these, two questions must be answered empirically with data the team already has:
|
|
||||||
|
|
||||||
1. **Is the syntactic failure rate structurally predictable?** If failures concentrate in specific question categories or construct types, the few-shot pool and documentation effort can be targeted. If failures are random, the problem is model capability and few-shot injection may not be sufficient.
|
|
||||||
|
|
||||||
2. **Does few-shot injection reduce foreign syntax injection before we build the parser gate?** The few-shot change is cheap to test manually. If it eliminates the majority of violations, the urgency profile of the remaining ADR-0007 decisions changes.
|
|
||||||
|
|
||||||
These two experiments must be completed and their results reviewed before any ADR-0007 implementation work begins.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Experiment 1 — Syntactic Failure Taxonomy
|
|
||||||
|
|
||||||
### Hypothesis
|
|
||||||
|
|
||||||
Syntactic violations are not uniformly distributed across question categories. `CODE_GENERATION` questions will show significantly higher violation rates than `RETRIEVAL` or `CONVERSATIONAL` questions, because code generation requires the model to produce executable AVAP syntax rather than describe it. Within `CODE_GENERATION`, violations will concentrate in construct combinations not covered by the current few-shot pool (loops with JSON building, ORM with error handling, multi-function scripts).
|
|
||||||
|
|
||||||
### What we already know from the 2026-04-06 runs
|
|
||||||
|
|
||||||
The forensic analysis in ADR-0007 identified the following confirmed violations across 6 runs (300 entries total):
|
|
||||||
|
|
||||||
| Failure type | Confirmed cases | Dominant model |
|
|
||||||
|---|---|---|
|
|
||||||
| Foreign language injection (Go, Python, JS) | 20 | bge-m3, qwen3 |
|
|
||||||
| Hallucinated AVAP commands | 12 | qwen3, harrier |
|
|
||||||
| Structural foreign syntax (while, let, var, for) | 8 | bge-m3, qwen3 |
|
|
||||||
|
|
||||||
**Critical limitation:** `answer_preview` fields are truncated at ~300 characters. The full response bodies are not available in the JSON files. The counts above are lower bounds.
|
|
||||||
|
|
||||||
### Method
|
|
||||||
|
|
||||||
**Step 1 — Full response body access.** Re-run `EvaluateRAG` for the two best-performing configurations (harrier/avap-docs-test-v4 and qwen3/avap-docs-test-v4) with a modified `evaluate.py` that logs the complete `answer` field, not just `answer_preview`. This is a one-line change. Run once, not N=5 — this is exploratory, not a benchmarking run.
|
|
||||||
|
|
||||||
**Step 2 — Manual taxonomy on CODE_GENERATION entries.** For all 20 `GD-C-*` entries per run, classify each response into one of four categories:
|
|
||||||
|
|
||||||
| Category | Definition |
|
|
||||||
|---|---|
|
|
||||||
| `VALID_AVAP` | All constructs present are valid AVAP. May be incomplete but syntactically correct. |
|
|
||||||
| `FOREIGN_SYNTAX` | Contains identifiable syntax from Go, Python, or JavaScript. |
|
|
||||||
| `INVENTED_COMMAND` | Uses a command name that does not exist in the LRM. |
|
|
||||||
| `STRUCTURAL_ERROR` | Grammatically wrong AVAP (wrong argument count, missing `end()`, wrong block nesting). |
|
|
||||||
|
|
||||||
One reviewer, consistent criteria. The goal is not perfect precision — it is identifying whether `CODE_GENERATION` failures are concentrated in specific construct types.
|
|
||||||
|
|
||||||
**Step 3 — Construct-level breakdown.** For every `FOREIGN_SYNTAX` or `INVENTED_COMMAND` entry in `CODE_GENERATION`, record which AVAP construct the question required and which the model failed on. Use the question text to infer the target construct:
|
|
||||||
|
|
||||||
| Question | Target construct |
|
|
||||||
|---|---|
|
|
||||||
| GD-C-003 (loop + JSON) | `startLoop` + `AddVariableToJSON` |
|
|
||||||
| GD-C-005 (GET + error handling) | `RequestGet` + `try/exception` |
|
|
||||||
| GD-C-011 (ORM table check) | `ormCheckTable` + `ormCreateTable` |
|
|
||||||
| GD-C-014 (list length) | `getListLen` + `itemFromList` |
|
|
||||||
| ... | ... |
|
|
||||||
|
|
||||||
**Step 4 — Cross-model comparison.** Compare the taxonomy distributions between harrier and qwen3 on the same index. If one model shows a qualitatively different failure profile (e.g., harrier fails on ORM, qwen3 fails on loops), the few-shot pool composition matters more than the pool size.
|
|
||||||
|
|
||||||
### Success criteria
|
|
||||||
|
|
||||||
The experiment is conclusive if it produces one of these two findings:
|
|
||||||
|
|
||||||
**Finding A (concentrated failures):** ≥ 70% of `CODE_GENERATION` violations occur in ≤ 5 distinct construct combinations. This means the few-shot pool can be targeted and the ADR-0007 few-shot injection decision is high-leverage.
|
|
||||||
|
|
||||||
**Finding B (distributed failures):** Violations are spread across ≥ 10 distinct construct combinations with no clear concentration. This means the model lacks general AVAP grammar coverage and few-shot injection alone will be insufficient — the parser gate becomes the primary defence, not a secondary one.
|
|
||||||
|
|
||||||
### Output
|
|
||||||
|
|
||||||
A one-page table: construct × model × failure type × count. This table becomes the first version of the syntactic confusion matrix specified in ADR-0007 Section 7, produced without any infrastructure changes.
|
|
||||||
|
|
||||||
### Estimated effort
|
|
||||||
|
|
||||||
2–3 hours. One `evaluate.py` modification (log full answer), two evaluation runs (no N=5, no seeds required — exploratory), one manual taxonomy pass on ~40 entries.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Experiment 2 — Few-Shot Injection A/B
|
|
||||||
|
|
||||||
### Hypothesis
|
|
||||||
|
|
||||||
Injecting 5 semantically similar AVAP examples from the 190-example pool into the generation prompt will reduce foreign syntax injection in `CODE_GENERATION` entries by ≥ 60% compared to the current baseline (no few-shot context). The reduction will be measurable manually without a parser gate, because the most severe violations (complete Go programs, Python `for` loops) are visually identifiable.
|
|
||||||
|
|
||||||
### Dependency
|
|
||||||
|
|
||||||
Experiment 2 should be run **after** Experiment 1. The construct-level breakdown from Experiment 1 informs which few-shot examples to select: if GD-C-003 (loop + JSON) fails consistently, the few-shot examples injected for that query should include `bucle_1_10.avap` and `construccion_dinamica_de_objeto.avap` from the LRM pool, not generic examples.
|
|
||||||
|
|
||||||
### Method
|
|
||||||
|
|
||||||
**Step 1 — Build the few-shot retrieval function.** Using `src/utils/emb_factory.py` (already exists), embed the 190 examples from `docs/LRM/*.avap`. For each query in the golden dataset, retrieve the top-5 most similar examples by cosine similarity. Log which examples are selected per query.
|
|
||||||
|
|
||||||
**Step 2 — Modify the generation prompt.** Add a few-shot block before the user query in `prompts.py`. Format:
|
|
||||||
|
|
||||||
```
|
|
||||||
The following are valid AVAP code examples. Use them as syntactic reference.
|
|
||||||
|
|
||||||
--- Example 1 ---
|
|
||||||
{few_shot_example_1}
|
|
||||||
|
|
||||||
--- Example 2 ---
|
|
||||||
{few_shot_example_2}
|
|
||||||
|
|
||||||
[...up to 5]
|
|
||||||
|
|
||||||
Now answer the following question using only valid AVAP syntax:
|
|
||||||
{query}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Step 3 — Run one evaluation pass** on the same two configurations as Experiment 1 (harrier/avap-docs-test-v4 and qwen3/avap-docs-test-v4). Log full response bodies. This is still exploratory — no N=5, no seeds.
|
|
||||||
|
|
||||||
**Step 4 — Manual comparison.** Apply the same taxonomy from Experiment 1 to the new responses. Count `FOREIGN_SYNTAX` and `INVENTED_COMMAND` entries before and after few-shot injection.
|
|
||||||
|
|
||||||
**Step 5 — RAGAS delta.** Compare global scores between baseline and few-shot runs. A few-shot injection that reduces syntactic violations but also reduces RAGAS scores significantly would indicate that the few-shot context is consuming context window at the expense of retrieval quality — this informs the K parameter decision in ADR-0007.
|
|
||||||
|
|
||||||
### Success criteria
|
|
||||||
|
|
||||||
| Result | Interpretation | Implication for ADR-0007 |
|
|
||||||
|---|---|---|
|
|
||||||
| Foreign syntax violations drop ≥ 60% | Few-shot injection is high-leverage | Prioritise few-shot implementation before parser gate |
|
|
||||||
| Foreign syntax violations drop 20–60% | Few-shot helps but is insufficient alone | Implement both in parallel |
|
|
||||||
| Foreign syntax violations drop < 20% | Model lacks AVAP grammar at a fundamental level | Parser gate is the primary defence; few-shot pool needs expansion or the base model needs replacement |
|
|
||||||
| RAGAS global score drops > 0.05 | Context window competition is real | Reduce K or implement dynamic context window management |
|
|
||||||
|
|
||||||
### Output
|
|
||||||
|
|
||||||
A 2×2 results table — model × condition (baseline / few-shot) — reporting the violation count and the RAGAS global score in each cell. Plus the few-shot retrieval log showing which examples were selected for which queries — this is the raw input for pool quality analysis.
|
|
||||||
|
|
||||||
### Estimated effort
|
|
||||||
|
|
||||||
4–6 hours. Embedding the 190 examples + retrieval function (~2h), prompt modification (~30min), two evaluation runs (~2h), manual taxonomy pass (~1h).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Decision gate
|
|
||||||
|
|
||||||
Both experiments feed a **go/no-go decision** for ADR-0007 implementation:
|
|
||||||
|
|
||||||
| Scenario | Decision |
|
|
||||||
|---|---|
|
|
||||||
| Exp 1: concentrated failures + Exp 2: ≥ 60% reduction | Implement few-shot first, parser gate second. The few-shot pool composition (informed by the confusion matrix) is the highest-leverage action. |
|
|
||||||
| Exp 1: concentrated failures + Exp 2: < 60% reduction | Implement parser gate and few-shot in parallel. The concentrated failure profile informs pool expansion. |
|
|
||||||
| Exp 1: distributed failures + Exp 2: any result | Parser gate is the primary defence. Few-shot injection is a secondary measure. The base model may need re-evaluation. |
|
|
||||||
| Both experiments inconclusive | Run Experiment 1 with full response bodies and a second annotator before proceeding. |
|
|
||||||
|
|
||||||
This decision gate replaces the need for an architectural meeting to assign priorities — the data makes the priority order self-evident.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## What this protocol does not answer
|
|
||||||
|
|
||||||
- Whether the AVAP Parser gRPC service can handle the throughput of N=5 evaluation runs (50 queries × 5 runs = 250 parser calls). That requires a load test on the parser service, not an evaluation run.
|
|
||||||
- Whether 190 examples are sufficient to cover the confusion matrix tail. That requires the confusion matrix from Experiment 1 to exist first.
|
|
||||||
- The minimum `syntactic_validity` threshold for production readiness. That requires at least one MSVL-validated run with known-good and known-bad models to calibrate.
|
|
||||||
|
|
||||||
These three questions are explicitly deferred to the post-implementation phase of ADR-0007.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Timeline
|
|
||||||
|
|
||||||
| Step | Owner | Estimated duration |
|
|
||||||
|---|---|---|
|
|
||||||
| Experiment 1: `evaluate.py` modification + 2 runs | Pablo (AI Team) | 1 day |
|
|
||||||
| Experiment 1: manual taxonomy + confusion matrix draft | Pablo (AI Team) | 1 day |
|
|
||||||
| Experiment 2: few-shot retrieval function + prompt modification | Pablo (AI Team) | 1 day |
|
|
||||||
| Experiment 2: 2 runs + manual comparison | Pablo (AI Team) | 1 day |
|
|
||||||
| Results review and go/no-go decision | Rafael Ruiz (CTO) + Pablo | 1 meeting |
|
|
||||||
|
|
||||||
**Total: 4 working days before any infrastructure change from ADR-0007 is scheduled.**
|
|
||||||
|
|
@ -1,89 +0,0 @@
|
||||||
# PRD-0001: OpenAI-Compatible HTTP Proxy
|
|
||||||
|
|
||||||
**Date:** 2026-03-18
|
|
||||||
**Status:** Implemented
|
|
||||||
**Requested by:** Rafael Ruiz (CTO)
|
|
||||||
**Implemented in:** PR #58
|
|
||||||
**Related ADR:** ADR-0001 (gRPC as primary interface)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
|
|
||||||
The Brunix Assistance Engine exposes a gRPC interface as its primary API. gRPC is the right choice for performance and type safety in server-to-server communication, but it creates a significant adoption barrier for two categories of consumers:
|
|
||||||
|
|
||||||
**Existing OpenAI integrations.** Any tool or client already configured to call the OpenAI API — VS Code extensions using `continue.dev`, LiteLLM routers, Open WebUI instances, internal tooling at 101OBEX, Corp — requires code changes to switch to gRPC. The switching cost is non-trivial and creates friction that slows adoption.
|
|
||||||
|
|
||||||
**Model replacement use case.** The core strategic value of the Brunix RAG is that it can replace direct OpenAI API consumption with a locally-hosted, domain-specific assistant that has no per-token cost and no data privacy concerns. This value proposition is only actionable if the replacement is transparent — i.e., the client does not need to change to consume the Brunix RAG instead of OpenAI.
|
|
||||||
|
|
||||||
Without a compatibility layer, the Brunix engine cannot serve as a drop-in replacement for OpenAI models. Every potential adopter faces an integration project instead of a configuration change.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Solution
|
|
||||||
|
|
||||||
Implement an HTTP server running alongside the gRPC server that exposes:
|
|
||||||
|
|
||||||
- The OpenAI Chat Completions API (`/v1/chat/completions`) — both streaming and non-streaming
|
|
||||||
- The OpenAI Completions API (`/v1/completions`) — legacy support
|
|
||||||
- The OpenAI Models API (`/v1/models`) — for compatibility with clients that enumerate available models
|
|
||||||
- The Ollama Chat API (`/api/chat`) — NDJSON streaming format
|
|
||||||
- The Ollama Generate API (`/api/generate`) — for Ollama-native clients
|
|
||||||
- The Ollama Tags API (`/api/tags`) — for clients that list available models
|
|
||||||
- A health endpoint (`/health`)
|
|
||||||
|
|
||||||
The proxy bridges HTTP → gRPC internally: `stream: false` routes to `AskAgent`, `stream: true` routes to `AskAgentStream`. The gRPC interface remains the primary interface and is not modified.
|
|
||||||
|
|
||||||
Any client that currently points to `https://api.openai.com` can be reconfigured to point to `http://localhost:8000` (or the server's address) with `model: brunix` and will work without any other change.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Scope
|
|
||||||
|
|
||||||
**In scope:**
|
|
||||||
- OpenAI-compatible endpoints as listed above
|
|
||||||
- Ollama-compatible endpoints as listed above
|
|
||||||
- Routing `stream: false` to `AskAgent` and `stream: true` to `AskAgentStream`
|
|
||||||
- Session ID propagation via the `session_id` extension field in `ChatCompletionRequest`
|
|
||||||
- Health endpoint
|
|
||||||
|
|
||||||
**Out of scope:**
|
|
||||||
- OpenAI function calling / tool use
|
|
||||||
- OpenAI embeddings API (`/v1/embeddings`)
|
|
||||||
- OpenAI fine-tuning or moderation APIs
|
|
||||||
- Authentication / API key validation (handled at infrastructure level)
|
|
||||||
- Multi-turn conversation reconstruction from the message array (the proxy extracts only the last user message as the query)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Technical implementation
|
|
||||||
|
|
||||||
**Stack:** FastAPI + uvicorn, running on port 8000 inside the same container as the gRPC server.
|
|
||||||
|
|
||||||
**Concurrency:** An asyncio event loop bridges FastAPI's async context with the synchronous gRPC calls via a dedicated `ThreadPoolExecutor` (configurable via `PROXY_THREAD_WORKERS`, default 20). This prevents gRPC blocking calls from stalling the async HTTP server.
|
|
||||||
|
|
||||||
**Streaming:** An `asyncio.Queue` connects the gRPC token stream (produced in a thread) with the FastAPI `StreamingResponse` (consumed in the async event loop). Tokens are forwarded as SSE events (OpenAI format) or NDJSON (Ollama format) as they arrive from `AskAgentStream`.
|
|
||||||
|
|
||||||
**Entry point:** `entrypoint.sh` starts both the gRPC server and the HTTP proxy as parallel processes. If either crashes, the other is terminated — the container fails cleanly rather than entering a partially active state.
|
|
||||||
|
|
||||||
**Environment variables:**
|
|
||||||
|
|
||||||
| Variable | Default | Description |
|
|
||||||
|---|---|---|
|
|
||||||
| `BRUNIX_GRPC_TARGET` | `localhost:50051` | gRPC server address |
|
|
||||||
| `PROXY_MODEL_ID` | `brunix` | Model name returned by `/v1/models` and `/api/tags` |
|
|
||||||
| `PROXY_THREAD_WORKERS` | `20` | ThreadPoolExecutor size for gRPC calls |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Validation
|
|
||||||
|
|
||||||
**Functional:** Any OpenAI-compatible client (continue.dev, LiteLLM, Open WebUI) can be pointed at `http://localhost:8000` with `model: brunix` and successfully send queries to the Brunix RAG without code changes.
|
|
||||||
|
|
||||||
**Strategic:** The VS Code extension and any 101OBEX, Corp internal tooling currently consuming OpenAI can switch to the Brunix RAG by changing one endpoint URL and one model name. No other changes required.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Impact on existing interfaces
|
|
||||||
|
|
||||||
The gRPC interface (`AskAgent`, `AskAgentStream`, `EvaluateRAG`) is unchanged. Existing gRPC clients are not affected. The proxy is additive — it does not replace the gRPC interface, it complements it.
|
|
||||||
|
|
@ -1,199 +0,0 @@
|
||||||
# PRD-0002: Editor Context Injection for VS Code Extension
|
|
||||||
|
|
||||||
**Date:** 2026-03-19
|
|
||||||
**Status:** Implemented
|
|
||||||
**Requested by:** Rafael Ruiz (CTO)
|
|
||||||
**Purpose:** Validate the VS Code extension with real users
|
|
||||||
**Related ADR:** ADR-0001 (gRPC interface), ADR-0002 (two-phase streaming)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
|
|
||||||
The Brunix Assistance Engine previously received only two inputs from the client: a `query` (the user's question) and a `session_id` (for conversation continuity). It had no awareness of what the user was looking at in their editor when they asked the question.
|
|
||||||
|
|
||||||
This created a fundamental limitation for a coding assistant: the user asking "how do I handle the error here?" or "what does this function return?" could not be answered correctly without knowing what "here" and "this function" referred to. The assistant was forced to treat every question as a general AVAP documentation query, even when the user's intent was clearly anchored to specific code in their editor.
|
|
||||||
|
|
||||||
For the VS Code extension validation, the CEO needed to demonstrate that the assistant behaves as a genuine coding assistant — one that understands the user's current context — not just a documentation search tool.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Solution
|
|
||||||
|
|
||||||
The gRPC contract has been extended to allow the VS Code extension to send four optional context fields alongside every query. These fields are transported in the standard OpenAI `user` field as a JSON string when using the HTTP proxy, and as dedicated proto fields when calling gRPC directly.
|
|
||||||
|
|
||||||
**Transport format via HTTP proxy (`/v1/chat/completions`):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"model": "brunix",
|
|
||||||
"messages": [{"role": "user", "content": "que hace este código?"}],
|
|
||||||
"stream": true,
|
|
||||||
"session_id": "uuid",
|
|
||||||
"user": "{\"editor_content\":\"<base64>\",\"selected_text\":\"<base64>\",\"extra_context\":\"<base64>\",\"user_info\":{\"dev_id\":1,\"project_id\":2,\"org_id\":3}}"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Fields:**
|
|
||||||
- **`editor_content`** (base64) — full content of the active file open in the editor. Gives the assistant awareness of the complete code the user is working on.
|
|
||||||
- **`selected_text`** (base64) — text currently selected in the editor, if any. The most precise signal of user intent — if the user has selected a block of code before asking a question, that block is almost certainly what the question is about.
|
|
||||||
- **`extra_context`** (base64) — free-form additional context (e.g., file path, language identifier, cursor position, open diagnostic errors). Extensible without requiring proto changes.
|
|
||||||
- **`user_info`** (JSON object) — client identity metadata: `dev_id`, `project_id`, `org_id`. Not base64 — sent as a JSON object nested within the `user` JSON string.
|
|
||||||
|
|
||||||
All four fields are optional. If none are provided, the assistant behaves exactly as it does today — full backward compatibility.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## User experience
|
|
||||||
|
|
||||||
**Scenario 1 — Question about selected code:**
|
|
||||||
The user selects a `try() / exception() / end()` block in their editor and asks "why is this not catching my error?". The assistant detects via the classifier that the question refers explicitly to the selected code, injects `selected_text` into the generation prompt, and answers specifically about that block — not about error handling in general.
|
|
||||||
|
|
||||||
**Scenario 2 — Question about the open file:**
|
|
||||||
The user has a full AVAP function open and asks "what HTTP status codes can this return?". The classifier detects the question refers to editor content, injects `editor_content` into the generation prompt, and reasons about the `_status` assignments in the function.
|
|
||||||
|
|
||||||
**Scenario 3 — General question (unchanged behaviour):**
|
|
||||||
The user asks "how does addVar work?" without selecting anything or referring to the editor. The classifier sets `use_editor_context: False`. The assistant behaves exactly as before — retrieval-augmented response from the AVAP knowledge base, no editor content injected.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Scope
|
|
||||||
|
|
||||||
**In scope:**
|
|
||||||
- Add `editor_content`, `selected_text`, `extra_context`, `user_info` fields to `AgentRequest` in `brunix.proto`
|
|
||||||
- Decode base64 fields (`editor_content`, `selected_text`, `extra_context`) in `server.py` before propagating to graph state
|
|
||||||
- Parse `user_info` as opaque JSON string — available in state for future use, not yet consumed by the graph
|
|
||||||
- Parse the `user` field in `openai_proxy.py` as a JSON object containing all four context fields
|
|
||||||
- Propagate all fields through the server into the graph state (`AgentState`)
|
|
||||||
- Extend the classifier (`CLASSIFY_PROMPT_TEMPLATE`) to output two tokens: query type and editor context signal (`EDITOR` / `NO_EDITOR`)
|
|
||||||
- Set `use_editor_context: bool` in `AgentState` based on classifier output
|
|
||||||
- Use `selected_text` as the primary anchor for query reformulation only when `use_editor_context` is `True`
|
|
||||||
- Inject `selected_text` and `editor_content` into the generation prompt only when `use_editor_context` is `True`
|
|
||||||
- Fix reformulator language — queries must be rewritten in the original language, never translated
|
|
||||||
|
|
||||||
**Out of scope:**
|
|
||||||
- Changes to `EvaluateRAG` — the golden dataset does not include editor-context queries; this feature does not affect embedding or retrieval evaluation
|
|
||||||
- Consuming `user_info` fields (`dev_id`, `project_id`, `org_id`) in the graph — available in state for future routing or personalisation
|
|
||||||
- Evaluation of the feature impact via EvaluateRAG — a dedicated golden dataset with editor-context queries is required for that measurement; it is future work
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Technical design
|
|
||||||
|
|
||||||
### Proto changes (`brunix.proto`)
|
|
||||||
|
|
||||||
```protobuf
|
|
||||||
message AgentRequest {
|
|
||||||
string query = 1; // unchanged
|
|
||||||
string session_id = 2; // unchanged
|
|
||||||
string editor_content = 3; // base64-encoded full editor file content
|
|
||||||
string selected_text = 4; // base64-encoded currently selected text
|
|
||||||
string extra_context = 5; // base64-encoded free-form additional context
|
|
||||||
string user_info = 6; // JSON string: {"dev_id":…,"project_id":…,"org_id":…}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Fields 1 and 2 are unchanged. Fields 3–6 are optional — absent fields default to empty string in proto3. All existing clients remain compatible without modification.
|
|
||||||
|
|
||||||
### AgentState changes (`state.py`)
|
|
||||||
|
|
||||||
```python
|
|
||||||
class AgentState(TypedDict):
|
|
||||||
# Core fields
|
|
||||||
messages: Annotated[list, add_messages]
|
|
||||||
session_id: str
|
|
||||||
query_type: str
|
|
||||||
reformulated_query: str
|
|
||||||
context: str
|
|
||||||
# Editor context fields (PRD-0002)
|
|
||||||
editor_content: str # decoded from base64
|
|
||||||
selected_text: str # decoded from base64
|
|
||||||
extra_context: str # decoded from base64
|
|
||||||
user_info: str # JSON string — {"dev_id":…,"project_id":…,"org_id":…}
|
|
||||||
# Set by classifier — True only when user explicitly refers to editor code
|
|
||||||
use_editor_context: bool
|
|
||||||
```
|
|
||||||
|
|
||||||
### Server changes (`server.py`)
|
|
||||||
|
|
||||||
Base64 decoding applied to `editor_content`, `selected_text` and `extra_context` before propagation. `user_info` passed as-is (plain JSON string). Helper function:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _decode_b64(value: str) -> str:
|
|
||||||
try:
|
|
||||||
return base64.b64decode(value).decode("utf-8") if value else ""
|
|
||||||
except Exception:
|
|
||||||
logger.warning("[base64] decode failed")
|
|
||||||
return ""
|
|
||||||
```
|
|
||||||
|
|
||||||
### Proxy changes (`openai_proxy.py`)
|
|
||||||
|
|
||||||
The `user` field is parsed as a JSON object. `_parse_editor_context` extracts all four fields:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _parse_editor_context(user: Optional[str]) -> tuple[str, str, str, str]:
|
|
||||||
if not user:
|
|
||||||
return "", "", "", ""
|
|
||||||
try:
|
|
||||||
ctx = json.loads(user)
|
|
||||||
if isinstance(ctx, dict):
|
|
||||||
return (
|
|
||||||
ctx.get("editor_content", "") or "",
|
|
||||||
ctx.get("selected_text", "") or "",
|
|
||||||
ctx.get("extra_context", "") or "",
|
|
||||||
json.dumps(ctx.get("user_info", {})),
|
|
||||||
)
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
pass
|
|
||||||
return "", "", "", ""
|
|
||||||
```
|
|
||||||
|
|
||||||
`session_id` is now read exclusively from the dedicated `session_id` field — no longer falls back to `user`.
|
|
||||||
|
|
||||||
### Classifier changes (`prompts.py` + `graph.py`)
|
|
||||||
|
|
||||||
`CLASSIFY_PROMPT_TEMPLATE` now outputs two tokens separated by a space:
|
|
||||||
- First token: `RETRIEVAL`, `CODE_GENERATION`, or `CONVERSATIONAL`
|
|
||||||
- Second token: `EDITOR` or `NO_EDITOR`
|
|
||||||
|
|
||||||
`EDITOR` is set only when the user message explicitly refers to the editor code or selected text using expressions like "this code", "este codigo", "fix this", "que hace esto", "explain this", etc.
|
|
||||||
|
|
||||||
`_parse_query_type` returns `tuple[str, bool]`. Both `classify` nodes (in `build_graph` and `build_prepare_graph`) set `use_editor_context` in the state.
|
|
||||||
|
|
||||||
### Reformulator changes (`prompts.py` + `graph.py`)
|
|
||||||
|
|
||||||
Two fixes applied:
|
|
||||||
|
|
||||||
**Mode-aware reformulation:** The reformulator receives `[MODE: X]` prepended to the query. In `RETRIEVAL` mode it compresses the query without expanding AVAP commands. In `CODE_GENERATION` mode it applies the command mapping. In `CONVERSATIONAL` mode it returns the query as-is.
|
|
||||||
|
|
||||||
**Language preservation:** The reformulator never translates. Queries in Spanish are rewritten in Spanish. Queries in English are rewritten in English. This fix was required because the BM25 retrieval is lexical — a Spanish chunk ("AVAP es un DSL...") cannot be found by an English query ("AVAP stand for").
|
|
||||||
|
|
||||||
### Generator changes (`graph.py`)
|
|
||||||
|
|
||||||
`_build_generation_prompt` injects `editor_content` and `selected_text` into the prompt only when `use_editor_context` is `True`. Priority hierarchy when injected:
|
|
||||||
1. `selected_text` — highest priority, most specific signal
|
|
||||||
2. `editor_content` — file-level context
|
|
||||||
3. RAG-retrieved chunks — knowledge base context
|
|
||||||
4. `extra_context` — free-form additional context
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Validation
|
|
||||||
|
|
||||||
**Acceptance criteria:**
|
|
||||||
- A query explicitly referring to selected code (`selected_text` non-empty, classifier returns `EDITOR`) produces a response grounded in that specific code.
|
|
||||||
- A general query (`use_editor_context: False`) produces a response identical in quality to the pre-PRD-0002 system — no editor content injected, no regression.
|
|
||||||
- A query in Spanish retrieves Spanish chunks correctly — the reformulator preserves the language.
|
|
||||||
- Existing gRPC clients that do not send the new fields work without modification.
|
|
||||||
- The `user` field in the HTTP proxy can be a plain string or absent — no error raised.
|
|
||||||
|
|
||||||
**Future measurement:**
|
|
||||||
Once the extension is validated and the embedding model is selected (ADR-0005), a dedicated golden dataset of editor-context queries should be built and added to `EvaluateRAG` to measure the quantitative impact of this feature.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Impact on parallel workstreams
|
|
||||||
|
|
||||||
**Embedding evaluation (ADR-0005 / MrHouston):** No impact. The BEIR benchmarks and EvaluateRAG runs for embedding model selection use the existing golden dataset, which contains no editor-context queries. The two workstreams are independent.
|
|
||||||
|
|
||||||
**RAG architecture evolution:** This feature is additive. It does not change the retrieval infrastructure, the Elasticsearch index, or the embedding pipeline. It extends the graph with additional input signals that improve response quality for editor-anchored queries.
|
|
||||||
|
|
@ -41,5 +41,3 @@ dev = [
|
||||||
"selenium>=4.41.0",
|
"selenium>=4.41.0",
|
||||||
"tree-sitter-language-pack>=0.13.0",
|
"tree-sitter-language-pack>=0.13.0",
|
||||||
]
|
]
|
||||||
[tool.pytest.ini_options]
|
|
||||||
testpaths = ["Docker/tests"]
|
|
||||||
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1,501 +0,0 @@
|
||||||
query-id corpus-id score
|
|
||||||
q20105 d20105 1
|
|
||||||
q20106 d20106 1
|
|
||||||
q20107 d20107 1
|
|
||||||
q20108 d20108 1
|
|
||||||
q20109 d20109 1
|
|
||||||
q20110 d20110 1
|
|
||||||
q20111 d20111 1
|
|
||||||
q20112 d20112 1
|
|
||||||
q20113 d20113 1
|
|
||||||
q20114 d20114 1
|
|
||||||
q20115 d20115 1
|
|
||||||
q20116 d20116 1
|
|
||||||
q20117 d20117 1
|
|
||||||
q20118 d20118 1
|
|
||||||
q20119 d20119 1
|
|
||||||
q20120 d20120 1
|
|
||||||
q20121 d20121 1
|
|
||||||
q20122 d20122 1
|
|
||||||
q20123 d20123 1
|
|
||||||
q20124 d20124 1
|
|
||||||
q20125 d20125 1
|
|
||||||
q20126 d20126 1
|
|
||||||
q20127 d20127 1
|
|
||||||
q20128 d20128 1
|
|
||||||
q20129 d20129 1
|
|
||||||
q20130 d20130 1
|
|
||||||
q20131 d20131 1
|
|
||||||
q20132 d20132 1
|
|
||||||
q20133 d20133 1
|
|
||||||
q20134 d20134 1
|
|
||||||
q20135 d20135 1
|
|
||||||
q20136 d20136 1
|
|
||||||
q20137 d20137 1
|
|
||||||
q20138 d20138 1
|
|
||||||
q20139 d20139 1
|
|
||||||
q20140 d20140 1
|
|
||||||
q20141 d20141 1
|
|
||||||
q20142 d20142 1
|
|
||||||
q20143 d20143 1
|
|
||||||
q20144 d20144 1
|
|
||||||
q20145 d20145 1
|
|
||||||
q20146 d20146 1
|
|
||||||
q20147 d20147 1
|
|
||||||
q20148 d20148 1
|
|
||||||
q20149 d20149 1
|
|
||||||
q20150 d20150 1
|
|
||||||
q20151 d20151 1
|
|
||||||
q20152 d20152 1
|
|
||||||
q20153 d20153 1
|
|
||||||
q20154 d20154 1
|
|
||||||
q20155 d20155 1
|
|
||||||
q20156 d20156 1
|
|
||||||
q20157 d20157 1
|
|
||||||
q20158 d20158 1
|
|
||||||
q20159 d20159 1
|
|
||||||
q20160 d20160 1
|
|
||||||
q20161 d20161 1
|
|
||||||
q20162 d20162 1
|
|
||||||
q20163 d20163 1
|
|
||||||
q20164 d20164 1
|
|
||||||
q20165 d20165 1
|
|
||||||
q20166 d20166 1
|
|
||||||
q20167 d20167 1
|
|
||||||
q20168 d20168 1
|
|
||||||
q20169 d20169 1
|
|
||||||
q20170 d20170 1
|
|
||||||
q20171 d20171 1
|
|
||||||
q20172 d20172 1
|
|
||||||
q20173 d20173 1
|
|
||||||
q20174 d20174 1
|
|
||||||
q20175 d20175 1
|
|
||||||
q20176 d20176 1
|
|
||||||
q20177 d20177 1
|
|
||||||
q20178 d20178 1
|
|
||||||
q20179 d20179 1
|
|
||||||
q20180 d20180 1
|
|
||||||
q20181 d20181 1
|
|
||||||
q20182 d20182 1
|
|
||||||
q20183 d20183 1
|
|
||||||
q20184 d20184 1
|
|
||||||
q20185 d20185 1
|
|
||||||
q20186 d20186 1
|
|
||||||
q20187 d20187 1
|
|
||||||
q20188 d20188 1
|
|
||||||
q20189 d20189 1
|
|
||||||
q20190 d20190 1
|
|
||||||
q20191 d20191 1
|
|
||||||
q20192 d20192 1
|
|
||||||
q20193 d20193 1
|
|
||||||
q20194 d20194 1
|
|
||||||
q20195 d20195 1
|
|
||||||
q20196 d20196 1
|
|
||||||
q20197 d20197 1
|
|
||||||
q20198 d20198 1
|
|
||||||
q20199 d20199 1
|
|
||||||
q20200 d20200 1
|
|
||||||
q20201 d20201 1
|
|
||||||
q20202 d20202 1
|
|
||||||
q20203 d20203 1
|
|
||||||
q20204 d20204 1
|
|
||||||
q20205 d20205 1
|
|
||||||
q20206 d20206 1
|
|
||||||
q20207 d20207 1
|
|
||||||
q20208 d20208 1
|
|
||||||
q20209 d20209 1
|
|
||||||
q20210 d20210 1
|
|
||||||
q20211 d20211 1
|
|
||||||
q20212 d20212 1
|
|
||||||
q20213 d20213 1
|
|
||||||
q20214 d20214 1
|
|
||||||
q20215 d20215 1
|
|
||||||
q20216 d20216 1
|
|
||||||
q20217 d20217 1
|
|
||||||
q20218 d20218 1
|
|
||||||
q20219 d20219 1
|
|
||||||
q20220 d20220 1
|
|
||||||
q20221 d20221 1
|
|
||||||
q20222 d20222 1
|
|
||||||
q20223 d20223 1
|
|
||||||
q20224 d20224 1
|
|
||||||
q20225 d20225 1
|
|
||||||
q20226 d20226 1
|
|
||||||
q20227 d20227 1
|
|
||||||
q20228 d20228 1
|
|
||||||
q20229 d20229 1
|
|
||||||
q20230 d20230 1
|
|
||||||
q20231 d20231 1
|
|
||||||
q20232 d20232 1
|
|
||||||
q20233 d20233 1
|
|
||||||
q20234 d20234 1
|
|
||||||
q20235 d20235 1
|
|
||||||
q20236 d20236 1
|
|
||||||
q20237 d20237 1
|
|
||||||
q20238 d20238 1
|
|
||||||
q20239 d20239 1
|
|
||||||
q20240 d20240 1
|
|
||||||
q20241 d20241 1
|
|
||||||
q20242 d20242 1
|
|
||||||
q20243 d20243 1
|
|
||||||
q20244 d20244 1
|
|
||||||
q20245 d20245 1
|
|
||||||
q20246 d20246 1
|
|
||||||
q20247 d20247 1
|
|
||||||
q20248 d20248 1
|
|
||||||
q20249 d20249 1
|
|
||||||
q20250 d20250 1
|
|
||||||
q20251 d20251 1
|
|
||||||
q20252 d20252 1
|
|
||||||
q20253 d20253 1
|
|
||||||
q20254 d20254 1
|
|
||||||
q20255 d20255 1
|
|
||||||
q20256 d20256 1
|
|
||||||
q20257 d20257 1
|
|
||||||
q20258 d20258 1
|
|
||||||
q20259 d20259 1
|
|
||||||
q20260 d20260 1
|
|
||||||
q20261 d20261 1
|
|
||||||
q20262 d20262 1
|
|
||||||
q20263 d20263 1
|
|
||||||
q20264 d20264 1
|
|
||||||
q20265 d20265 1
|
|
||||||
q20266 d20266 1
|
|
||||||
q20267 d20267 1
|
|
||||||
q20268 d20268 1
|
|
||||||
q20269 d20269 1
|
|
||||||
q20270 d20270 1
|
|
||||||
q20271 d20271 1
|
|
||||||
q20272 d20272 1
|
|
||||||
q20273 d20273 1
|
|
||||||
q20274 d20274 1
|
|
||||||
q20275 d20275 1
|
|
||||||
q20276 d20276 1
|
|
||||||
q20277 d20277 1
|
|
||||||
q20278 d20278 1
|
|
||||||
q20279 d20279 1
|
|
||||||
q20280 d20280 1
|
|
||||||
q20281 d20281 1
|
|
||||||
q20282 d20282 1
|
|
||||||
q20283 d20283 1
|
|
||||||
q20284 d20284 1
|
|
||||||
q20285 d20285 1
|
|
||||||
q20286 d20286 1
|
|
||||||
q20287 d20287 1
|
|
||||||
q20288 d20288 1
|
|
||||||
q20289 d20289 1
|
|
||||||
q20290 d20290 1
|
|
||||||
q20291 d20291 1
|
|
||||||
q20292 d20292 1
|
|
||||||
q20293 d20293 1
|
|
||||||
q20294 d20294 1
|
|
||||||
q20295 d20295 1
|
|
||||||
q20296 d20296 1
|
|
||||||
q20297 d20297 1
|
|
||||||
q20298 d20298 1
|
|
||||||
q20299 d20299 1
|
|
||||||
q20300 d20300 1
|
|
||||||
q20301 d20301 1
|
|
||||||
q20302 d20302 1
|
|
||||||
q20303 d20303 1
|
|
||||||
q20304 d20304 1
|
|
||||||
q20305 d20305 1
|
|
||||||
q20306 d20306 1
|
|
||||||
q20307 d20307 1
|
|
||||||
q20308 d20308 1
|
|
||||||
q20309 d20309 1
|
|
||||||
q20310 d20310 1
|
|
||||||
q20311 d20311 1
|
|
||||||
q20312 d20312 1
|
|
||||||
q20313 d20313 1
|
|
||||||
q20314 d20314 1
|
|
||||||
q20315 d20315 1
|
|
||||||
q20316 d20316 1
|
|
||||||
q20317 d20317 1
|
|
||||||
q20318 d20318 1
|
|
||||||
q20319 d20319 1
|
|
||||||
q20320 d20320 1
|
|
||||||
q20321 d20321 1
|
|
||||||
q20322 d20322 1
|
|
||||||
q20323 d20323 1
|
|
||||||
q20324 d20324 1
|
|
||||||
q20325 d20325 1
|
|
||||||
q20326 d20326 1
|
|
||||||
q20327 d20327 1
|
|
||||||
q20328 d20328 1
|
|
||||||
q20329 d20329 1
|
|
||||||
q20330 d20330 1
|
|
||||||
q20331 d20331 1
|
|
||||||
q20332 d20332 1
|
|
||||||
q20333 d20333 1
|
|
||||||
q20334 d20334 1
|
|
||||||
q20335 d20335 1
|
|
||||||
q20336 d20336 1
|
|
||||||
q20337 d20337 1
|
|
||||||
q20338 d20338 1
|
|
||||||
q20339 d20339 1
|
|
||||||
q20340 d20340 1
|
|
||||||
q20341 d20341 1
|
|
||||||
q20342 d20342 1
|
|
||||||
q20343 d20343 1
|
|
||||||
q20344 d20344 1
|
|
||||||
q20345 d20345 1
|
|
||||||
q20346 d20346 1
|
|
||||||
q20347 d20347 1
|
|
||||||
q20348 d20348 1
|
|
||||||
q20349 d20349 1
|
|
||||||
q20350 d20350 1
|
|
||||||
q20351 d20351 1
|
|
||||||
q20352 d20352 1
|
|
||||||
q20353 d20353 1
|
|
||||||
q20354 d20354 1
|
|
||||||
q20355 d20355 1
|
|
||||||
q20356 d20356 1
|
|
||||||
q20357 d20357 1
|
|
||||||
q20358 d20358 1
|
|
||||||
q20359 d20359 1
|
|
||||||
q20360 d20360 1
|
|
||||||
q20361 d20361 1
|
|
||||||
q20362 d20362 1
|
|
||||||
q20363 d20363 1
|
|
||||||
q20364 d20364 1
|
|
||||||
q20365 d20365 1
|
|
||||||
q20366 d20366 1
|
|
||||||
q20367 d20367 1
|
|
||||||
q20368 d20368 1
|
|
||||||
q20369 d20369 1
|
|
||||||
q20370 d20370 1
|
|
||||||
q20371 d20371 1
|
|
||||||
q20372 d20372 1
|
|
||||||
q20373 d20373 1
|
|
||||||
q20374 d20374 1
|
|
||||||
q20375 d20375 1
|
|
||||||
q20376 d20376 1
|
|
||||||
q20377 d20377 1
|
|
||||||
q20378 d20378 1
|
|
||||||
q20379 d20379 1
|
|
||||||
q20380 d20380 1
|
|
||||||
q20381 d20381 1
|
|
||||||
q20382 d20382 1
|
|
||||||
q20383 d20383 1
|
|
||||||
q20384 d20384 1
|
|
||||||
q20385 d20385 1
|
|
||||||
q20386 d20386 1
|
|
||||||
q20387 d20387 1
|
|
||||||
q20388 d20388 1
|
|
||||||
q20389 d20389 1
|
|
||||||
q20390 d20390 1
|
|
||||||
q20391 d20391 1
|
|
||||||
q20392 d20392 1
|
|
||||||
q20393 d20393 1
|
|
||||||
q20394 d20394 1
|
|
||||||
q20395 d20395 1
|
|
||||||
q20396 d20396 1
|
|
||||||
q20397 d20397 1
|
|
||||||
q20398 d20398 1
|
|
||||||
q20399 d20399 1
|
|
||||||
q20400 d20400 1
|
|
||||||
q20401 d20401 1
|
|
||||||
q20402 d20402 1
|
|
||||||
q20403 d20403 1
|
|
||||||
q20404 d20404 1
|
|
||||||
q20405 d20405 1
|
|
||||||
q20406 d20406 1
|
|
||||||
q20407 d20407 1
|
|
||||||
q20408 d20408 1
|
|
||||||
q20409 d20409 1
|
|
||||||
q20410 d20410 1
|
|
||||||
q20411 d20411 1
|
|
||||||
q20412 d20412 1
|
|
||||||
q20413 d20413 1
|
|
||||||
q20414 d20414 1
|
|
||||||
q20415 d20415 1
|
|
||||||
q20416 d20416 1
|
|
||||||
q20417 d20417 1
|
|
||||||
q20418 d20418 1
|
|
||||||
q20419 d20419 1
|
|
||||||
q20420 d20420 1
|
|
||||||
q20421 d20421 1
|
|
||||||
q20422 d20422 1
|
|
||||||
q20423 d20423 1
|
|
||||||
q20424 d20424 1
|
|
||||||
q20425 d20425 1
|
|
||||||
q20426 d20426 1
|
|
||||||
q20427 d20427 1
|
|
||||||
q20428 d20428 1
|
|
||||||
q20429 d20429 1
|
|
||||||
q20430 d20430 1
|
|
||||||
q20431 d20431 1
|
|
||||||
q20432 d20432 1
|
|
||||||
q20433 d20433 1
|
|
||||||
q20434 d20434 1
|
|
||||||
q20435 d20435 1
|
|
||||||
q20436 d20436 1
|
|
||||||
q20437 d20437 1
|
|
||||||
q20438 d20438 1
|
|
||||||
q20439 d20439 1
|
|
||||||
q20440 d20440 1
|
|
||||||
q20441 d20441 1
|
|
||||||
q20442 d20442 1
|
|
||||||
q20443 d20443 1
|
|
||||||
q20444 d20444 1
|
|
||||||
q20445 d20445 1
|
|
||||||
q20446 d20446 1
|
|
||||||
q20447 d20447 1
|
|
||||||
q20448 d20448 1
|
|
||||||
q20449 d20449 1
|
|
||||||
q20450 d20450 1
|
|
||||||
q20451 d20451 1
|
|
||||||
q20452 d20452 1
|
|
||||||
q20453 d20453 1
|
|
||||||
q20454 d20454 1
|
|
||||||
q20455 d20455 1
|
|
||||||
q20456 d20456 1
|
|
||||||
q20457 d20457 1
|
|
||||||
q20458 d20458 1
|
|
||||||
q20459 d20459 1
|
|
||||||
q20460 d20460 1
|
|
||||||
q20461 d20461 1
|
|
||||||
q20462 d20462 1
|
|
||||||
q20463 d20463 1
|
|
||||||
q20464 d20464 1
|
|
||||||
q20465 d20465 1
|
|
||||||
q20466 d20466 1
|
|
||||||
q20467 d20467 1
|
|
||||||
q20468 d20468 1
|
|
||||||
q20469 d20469 1
|
|
||||||
q20470 d20470 1
|
|
||||||
q20471 d20471 1
|
|
||||||
q20472 d20472 1
|
|
||||||
q20473 d20473 1
|
|
||||||
q20474 d20474 1
|
|
||||||
q20475 d20475 1
|
|
||||||
q20476 d20476 1
|
|
||||||
q20477 d20477 1
|
|
||||||
q20478 d20478 1
|
|
||||||
q20479 d20479 1
|
|
||||||
q20480 d20480 1
|
|
||||||
q20481 d20481 1
|
|
||||||
q20482 d20482 1
|
|
||||||
q20483 d20483 1
|
|
||||||
q20484 d20484 1
|
|
||||||
q20485 d20485 1
|
|
||||||
q20486 d20486 1
|
|
||||||
q20487 d20487 1
|
|
||||||
q20488 d20488 1
|
|
||||||
q20489 d20489 1
|
|
||||||
q20490 d20490 1
|
|
||||||
q20491 d20491 1
|
|
||||||
q20492 d20492 1
|
|
||||||
q20493 d20493 1
|
|
||||||
q20494 d20494 1
|
|
||||||
q20495 d20495 1
|
|
||||||
q20496 d20496 1
|
|
||||||
q20497 d20497 1
|
|
||||||
q20498 d20498 1
|
|
||||||
q20499 d20499 1
|
|
||||||
q20500 d20500 1
|
|
||||||
q20501 d20501 1
|
|
||||||
q20502 d20502 1
|
|
||||||
q20503 d20503 1
|
|
||||||
q20504 d20504 1
|
|
||||||
q20505 d20505 1
|
|
||||||
q20506 d20506 1
|
|
||||||
q20507 d20507 1
|
|
||||||
q20508 d20508 1
|
|
||||||
q20509 d20509 1
|
|
||||||
q20510 d20510 1
|
|
||||||
q20511 d20511 1
|
|
||||||
q20512 d20512 1
|
|
||||||
q20513 d20513 1
|
|
||||||
q20514 d20514 1
|
|
||||||
q20515 d20515 1
|
|
||||||
q20516 d20516 1
|
|
||||||
q20517 d20517 1
|
|
||||||
q20518 d20518 1
|
|
||||||
q20519 d20519 1
|
|
||||||
q20520 d20520 1
|
|
||||||
q20521 d20521 1
|
|
||||||
q20522 d20522 1
|
|
||||||
q20523 d20523 1
|
|
||||||
q20524 d20524 1
|
|
||||||
q20525 d20525 1
|
|
||||||
q20526 d20526 1
|
|
||||||
q20527 d20527 1
|
|
||||||
q20528 d20528 1
|
|
||||||
q20529 d20529 1
|
|
||||||
q20530 d20530 1
|
|
||||||
q20531 d20531 1
|
|
||||||
q20532 d20532 1
|
|
||||||
q20533 d20533 1
|
|
||||||
q20534 d20534 1
|
|
||||||
q20535 d20535 1
|
|
||||||
q20536 d20536 1
|
|
||||||
q20537 d20537 1
|
|
||||||
q20538 d20538 1
|
|
||||||
q20539 d20539 1
|
|
||||||
q20540 d20540 1
|
|
||||||
q20541 d20541 1
|
|
||||||
q20542 d20542 1
|
|
||||||
q20543 d20543 1
|
|
||||||
q20544 d20544 1
|
|
||||||
q20545 d20545 1
|
|
||||||
q20546 d20546 1
|
|
||||||
q20547 d20547 1
|
|
||||||
q20548 d20548 1
|
|
||||||
q20549 d20549 1
|
|
||||||
q20550 d20550 1
|
|
||||||
q20551 d20551 1
|
|
||||||
q20552 d20552 1
|
|
||||||
q20553 d20553 1
|
|
||||||
q20554 d20554 1
|
|
||||||
q20555 d20555 1
|
|
||||||
q20556 d20556 1
|
|
||||||
q20557 d20557 1
|
|
||||||
q20558 d20558 1
|
|
||||||
q20559 d20559 1
|
|
||||||
q20560 d20560 1
|
|
||||||
q20561 d20561 1
|
|
||||||
q20562 d20562 1
|
|
||||||
q20563 d20563 1
|
|
||||||
q20564 d20564 1
|
|
||||||
q20565 d20565 1
|
|
||||||
q20566 d20566 1
|
|
||||||
q20567 d20567 1
|
|
||||||
q20568 d20568 1
|
|
||||||
q20569 d20569 1
|
|
||||||
q20570 d20570 1
|
|
||||||
q20571 d20571 1
|
|
||||||
q20572 d20572 1
|
|
||||||
q20573 d20573 1
|
|
||||||
q20574 d20574 1
|
|
||||||
q20575 d20575 1
|
|
||||||
q20576 d20576 1
|
|
||||||
q20577 d20577 1
|
|
||||||
q20578 d20578 1
|
|
||||||
q20579 d20579 1
|
|
||||||
q20580 d20580 1
|
|
||||||
q20581 d20581 1
|
|
||||||
q20582 d20582 1
|
|
||||||
q20583 d20583 1
|
|
||||||
q20584 d20584 1
|
|
||||||
q20585 d20585 1
|
|
||||||
q20586 d20586 1
|
|
||||||
q20587 d20587 1
|
|
||||||
q20588 d20588 1
|
|
||||||
q20589 d20589 1
|
|
||||||
q20590 d20590 1
|
|
||||||
q20591 d20591 1
|
|
||||||
q20592 d20592 1
|
|
||||||
q20593 d20593 1
|
|
||||||
q20594 d20594 1
|
|
||||||
q20595 d20595 1
|
|
||||||
q20596 d20596 1
|
|
||||||
q20597 d20597 1
|
|
||||||
q20598 d20598 1
|
|
||||||
q20599 d20599 1
|
|
||||||
q20600 d20600 1
|
|
||||||
q20601 d20601 1
|
|
||||||
q20602 d20602 1
|
|
||||||
q20603 d20603 1
|
|
||||||
q20604 d20604 1
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because one or more lines are too long
|
|
@ -1,340 +0,0 @@
|
||||||
query-id corpus-id score
|
|
||||||
1 31715818 1
|
|
||||||
3 14717500 1
|
|
||||||
5 13734012 1
|
|
||||||
13 1606628 1
|
|
||||||
36 5152028 1
|
|
||||||
36 11705328 1
|
|
||||||
42 18174210 1
|
|
||||||
48 13734012 1
|
|
||||||
49 5953485 1
|
|
||||||
50 12580014 1
|
|
||||||
51 45638119 1
|
|
||||||
53 45638119 1
|
|
||||||
54 49556906 1
|
|
||||||
56 4709641 1
|
|
||||||
57 4709641 1
|
|
||||||
70 5956380 1
|
|
||||||
70 4414547 1
|
|
||||||
72 6076903 1
|
|
||||||
75 4387784 1
|
|
||||||
94 1215116 1
|
|
||||||
99 18810195 1
|
|
||||||
100 4381486 1
|
|
||||||
113 6157837 1
|
|
||||||
115 33872649 1
|
|
||||||
118 6372244 1
|
|
||||||
124 4883040 1
|
|
||||||
127 21598000 1
|
|
||||||
128 8290953 1
|
|
||||||
129 27768226 1
|
|
||||||
130 27768226 1
|
|
||||||
132 7975937 1
|
|
||||||
133 38485364 1
|
|
||||||
133 6969753 1
|
|
||||||
133 17934082 1
|
|
||||||
133 16280642 1
|
|
||||||
133 12640810 1
|
|
||||||
137 26016929 1
|
|
||||||
141 6955746 1
|
|
||||||
141 14437255 1
|
|
||||||
142 10582939 1
|
|
||||||
143 10582939 1
|
|
||||||
146 10582939 1
|
|
||||||
148 1084345 1
|
|
||||||
163 18872233 1
|
|
||||||
171 12670680 1
|
|
||||||
179 16322674 1
|
|
||||||
179 27123743 1
|
|
||||||
179 23557241 1
|
|
||||||
179 17450673 1
|
|
||||||
180 16966326 1
|
|
||||||
183 12827098 1
|
|
||||||
185 18340282 1
|
|
||||||
198 2177022 1
|
|
||||||
208 13519661 1
|
|
||||||
212 22038539 1
|
|
||||||
213 13625993 1
|
|
||||||
216 21366394 1
|
|
||||||
217 21366394 1
|
|
||||||
218 21366394 1
|
|
||||||
219 21366394 1
|
|
||||||
230 3067015 1
|
|
||||||
232 10536636 1
|
|
||||||
233 4388470 1
|
|
||||||
236 4388470 1
|
|
||||||
237 4942718 1
|
|
||||||
238 2251426 1
|
|
||||||
239 14079881 1
|
|
||||||
248 1568684 1
|
|
||||||
249 1568684 1
|
|
||||||
261 1122279 1
|
|
||||||
261 10697096 1
|
|
||||||
268 970012 1
|
|
||||||
269 970012 1
|
|
||||||
274 11614737 1
|
|
||||||
275 4961038 1
|
|
||||||
275 14241418 1
|
|
||||||
275 14819804 1
|
|
||||||
279 14376683 1
|
|
||||||
294 10874408 1
|
|
||||||
295 20310709 1
|
|
||||||
298 39381118 1
|
|
||||||
300 3553087 1
|
|
||||||
303 4388470 1
|
|
||||||
312 6173523 1
|
|
||||||
314 4347374 1
|
|
||||||
324 2014909 1
|
|
||||||
327 17997584 1
|
|
||||||
338 23349986 1
|
|
||||||
343 7873737 1
|
|
||||||
343 5884524 1
|
|
||||||
350 16927286 1
|
|
||||||
354 8774475 1
|
|
||||||
362 38587347 1
|
|
||||||
380 19005293 1
|
|
||||||
384 13770184 1
|
|
||||||
385 9955779 1
|
|
||||||
385 9767444 1
|
|
||||||
386 16495649 1
|
|
||||||
388 1148122 1
|
|
||||||
399 791050 1
|
|
||||||
410 14924526 1
|
|
||||||
411 14924526 1
|
|
||||||
415 6309659 1
|
|
||||||
421 11172205 1
|
|
||||||
431 28937856 1
|
|
||||||
436 14637235 1
|
|
||||||
437 18399038 1
|
|
||||||
439 4423559 1
|
|
||||||
440 4423559 1
|
|
||||||
443 10165258 1
|
|
||||||
452 12804937 1
|
|
||||||
452 464511 1
|
|
||||||
475 18678095 1
|
|
||||||
478 14767844 1
|
|
||||||
491 56893404 1
|
|
||||||
501 17930286 1
|
|
||||||
502 13071728 1
|
|
||||||
507 30774694 1
|
|
||||||
508 13980338 1
|
|
||||||
513 13230773 1
|
|
||||||
514 16256507 1
|
|
||||||
516 29564505 1
|
|
||||||
517 15663829 1
|
|
||||||
521 34873974 1
|
|
||||||
525 13639330 1
|
|
||||||
527 3863543 1
|
|
||||||
528 5476778 1
|
|
||||||
532 12991445 1
|
|
||||||
533 12991445 1
|
|
||||||
535 39368721 1
|
|
||||||
536 16056514 1
|
|
||||||
539 13282296 1
|
|
||||||
540 11886686 1
|
|
||||||
540 25007443 1
|
|
||||||
544 24221369 1
|
|
||||||
549 9433958 1
|
|
||||||
551 33499189 1
|
|
||||||
552 1471041 1
|
|
||||||
554 1049501 1
|
|
||||||
560 40096222 1
|
|
||||||
569 23460562 1
|
|
||||||
575 10300888 1
|
|
||||||
577 5289038 1
|
|
||||||
578 8764879 1
|
|
||||||
587 16999023 1
|
|
||||||
589 10984005 1
|
|
||||||
593 19675911 1
|
|
||||||
597 12779444 1
|
|
||||||
597 36355784 1
|
|
||||||
597 25742130 1
|
|
||||||
598 25742130 1
|
|
||||||
613 9638032 1
|
|
||||||
619 20888849 1
|
|
||||||
619 2565138 1
|
|
||||||
623 17000834 1
|
|
||||||
628 24512064 1
|
|
||||||
636 24294572 1
|
|
||||||
637 25649714 1
|
|
||||||
641 5912283 1
|
|
||||||
641 31554917 1
|
|
||||||
644 13619127 1
|
|
||||||
649 12789595 1
|
|
||||||
659 1215116 1
|
|
||||||
660 1215116 1
|
|
||||||
674 2095573 1
|
|
||||||
684 4942718 1
|
|
||||||
690 18750453 1
|
|
||||||
691 10991183 1
|
|
||||||
692 24088502 1
|
|
||||||
693 24088502 1
|
|
||||||
700 4350400 1
|
|
||||||
702 4350400 1
|
|
||||||
715 18421962 1
|
|
||||||
716 18421962 1
|
|
||||||
718 17587795 1
|
|
||||||
721 1834762 1
|
|
||||||
723 5531479 1
|
|
||||||
727 7521113 1
|
|
||||||
728 7521113 1
|
|
||||||
728 36444198 1
|
|
||||||
729 26851674 1
|
|
||||||
742 32159283 1
|
|
||||||
743 32159283 1
|
|
||||||
744 8460275 1
|
|
||||||
756 2831620 1
|
|
||||||
759 1805641 1
|
|
||||||
768 6421792 1
|
|
||||||
770 15476777 1
|
|
||||||
775 32275758 1
|
|
||||||
781 24338780 1
|
|
||||||
783 40632104 1
|
|
||||||
784 2356950 1
|
|
||||||
785 12471115 1
|
|
||||||
793 8551160 1
|
|
||||||
800 22543403 1
|
|
||||||
805 22180793 1
|
|
||||||
808 36606083 1
|
|
||||||
811 19799455 1
|
|
||||||
814 33387953 1
|
|
||||||
820 8646760 1
|
|
||||||
821 8646760 1
|
|
||||||
823 15319019 1
|
|
||||||
830 1897324 1
|
|
||||||
831 1897324 1
|
|
||||||
832 30303335 1
|
|
||||||
834 5483793 1
|
|
||||||
837 15928989 1
|
|
||||||
839 1469751 1
|
|
||||||
845 17741440 1
|
|
||||||
847 16787954 1
|
|
||||||
852 13843341 1
|
|
||||||
859 1982286 1
|
|
||||||
870 195689316 1
|
|
||||||
873 1180972 1
|
|
||||||
873 19307912 1
|
|
||||||
873 27393799 1
|
|
||||||
873 29025270 1
|
|
||||||
873 3315558 1
|
|
||||||
879 8426046 1
|
|
||||||
880 8426046 1
|
|
||||||
882 14803797 1
|
|
||||||
887 18855191 1
|
|
||||||
903 10648422 1
|
|
||||||
904 7370282 1
|
|
||||||
907 6923961 1
|
|
||||||
911 11254556 1
|
|
||||||
913 3203590 1
|
|
||||||
914 3203590 1
|
|
||||||
921 1642727 1
|
|
||||||
922 17077004 1
|
|
||||||
936 5483793 1
|
|
||||||
956 12956194 1
|
|
||||||
957 123859 1
|
|
||||||
960 8780599 1
|
|
||||||
967 2119889 1
|
|
||||||
967 8997410 1
|
|
||||||
971 46695481 1
|
|
||||||
971 27873158 1
|
|
||||||
971 28617573 1
|
|
||||||
971 9764256 1
|
|
||||||
975 5304891 1
|
|
||||||
982 2988714 1
|
|
||||||
985 6828370 1
|
|
||||||
993 16472469 1
|
|
||||||
1012 9745001 1
|
|
||||||
1014 6277638 1
|
|
||||||
1019 11603066 1
|
|
||||||
1020 9433958 1
|
|
||||||
1021 9433958 1
|
|
||||||
1024 5373138 1
|
|
||||||
1029 13923140 1
|
|
||||||
1029 13940200 1
|
|
||||||
1029 11899391 1
|
|
||||||
1041 25254425 1
|
|
||||||
1041 16626264 1
|
|
||||||
1049 12486491 1
|
|
||||||
1062 20381484 1
|
|
||||||
1086 39281140 1
|
|
||||||
1088 37549932 1
|
|
||||||
1089 17628888 1
|
|
||||||
1099 7662206 1
|
|
||||||
1100 7662206 1
|
|
||||||
1104 3898784 1
|
|
||||||
1107 20532591 1
|
|
||||||
1110 13770184 1
|
|
||||||
1121 4456756 1
|
|
||||||
1130 17997584 1
|
|
||||||
1132 33499189 1
|
|
||||||
1132 9283422 1
|
|
||||||
1137 33370 1
|
|
||||||
1140 12009265 1
|
|
||||||
1144 10071552 1
|
|
||||||
1146 13906581 1
|
|
||||||
1150 11369420 1
|
|
||||||
1163 15305881 1
|
|
||||||
1175 31272411 1
|
|
||||||
1179 31272411 1
|
|
||||||
1180 31272411 1
|
|
||||||
1185 16737210 1
|
|
||||||
1187 52873726 1
|
|
||||||
1191 30655442 1
|
|
||||||
1194 11419230 1
|
|
||||||
1196 25649714 1
|
|
||||||
1197 25649714 1
|
|
||||||
1199 16760369 1
|
|
||||||
1200 3441524 1
|
|
||||||
1202 3475317 1
|
|
||||||
1204 31141365 1
|
|
||||||
1207 18909530 1
|
|
||||||
1213 14407673 1
|
|
||||||
1216 24142891 1
|
|
||||||
1221 19736671 1
|
|
||||||
1225 9650982 1
|
|
||||||
1226 13777138 1
|
|
||||||
1232 13905670 1
|
|
||||||
1241 4427392 1
|
|
||||||
1245 7662395 1
|
|
||||||
1259 24341590 1
|
|
||||||
1262 44172171 1
|
|
||||||
1266 37480103 1
|
|
||||||
1270 13900610 1
|
|
||||||
1271 13768432 1
|
|
||||||
1272 17081238 1
|
|
||||||
1273 11041152 1
|
|
||||||
1274 12428814 1
|
|
||||||
1274 27731651 1
|
|
||||||
1274 4406819 1
|
|
||||||
1278 11335781 1
|
|
||||||
1279 11335781 1
|
|
||||||
1280 4387784 1
|
|
||||||
1281 4387784 1
|
|
||||||
1282 23649163 1
|
|
||||||
1290 4687948 1
|
|
||||||
1292 56893404 1
|
|
||||||
1298 11718220 1
|
|
||||||
1303 12631697 1
|
|
||||||
1316 27910499 1
|
|
||||||
1319 16284655 1
|
|
||||||
1320 16284655 1
|
|
||||||
1332 5304891 1
|
|
||||||
1335 27910499 1
|
|
||||||
1336 27910499 1
|
|
||||||
1337 20231138 1
|
|
||||||
1339 15482274 1
|
|
||||||
1344 9559146 1
|
|
||||||
1352 12885341 1
|
|
||||||
1359 11614737 1
|
|
||||||
1362 8290953 1
|
|
||||||
1363 8290953 1
|
|
||||||
1368 2425364 1
|
|
||||||
1370 2425364 1
|
|
||||||
1379 16322674 1
|
|
||||||
1379 27123743 1
|
|
||||||
1379 23557241 1
|
|
||||||
1379 17450673 1
|
|
||||||
1382 17755060 1
|
|
||||||
1385 306006 1
|
|
||||||
1389 23895668 1
|
|
||||||
1395 17717391 1
|
|
||||||
|
|
|
@ -1,920 +0,0 @@
|
||||||
query-id corpus-id score
|
|
||||||
0 31715818 1
|
|
||||||
2 13734012 1
|
|
||||||
4 22942787 1
|
|
||||||
6 2613775 1
|
|
||||||
9 44265107 1
|
|
||||||
10 32587939 1
|
|
||||||
11 32587939 1
|
|
||||||
12 33409100 1
|
|
||||||
14 641786 1
|
|
||||||
15 22080671 1
|
|
||||||
17 1606628 1
|
|
||||||
18 22942787 1
|
|
||||||
19 3202143 1
|
|
||||||
20 3202143 1
|
|
||||||
21 41493639 1
|
|
||||||
22 6490571 1
|
|
||||||
24 3471191 1
|
|
||||||
25 2613775 1
|
|
||||||
26 32390525 1
|
|
||||||
27 32390525 1
|
|
||||||
28 12670680 1
|
|
||||||
30 24341590 1
|
|
||||||
32 12428497 1
|
|
||||||
34 11705328 1
|
|
||||||
35 5152028 1
|
|
||||||
35 11705328 1
|
|
||||||
37 5152028 1
|
|
||||||
37 11705328 1
|
|
||||||
39 13497630 1
|
|
||||||
40 13497630 1
|
|
||||||
41 18174210 1
|
|
||||||
43 7224723 1
|
|
||||||
44 56893404 1
|
|
||||||
45 56893404 1
|
|
||||||
46 380526 1
|
|
||||||
47 3512154 1
|
|
||||||
47 26996935 1
|
|
||||||
52 45638119 1
|
|
||||||
55 49556906 1
|
|
||||||
58 4709641 1
|
|
||||||
60 13899137 1
|
|
||||||
60 13901073 1
|
|
||||||
61 13899137 1
|
|
||||||
61 13901073 1
|
|
||||||
62 32587939 1
|
|
||||||
63 40349336 1
|
|
||||||
64 40349336 1
|
|
||||||
66 14806256 1
|
|
||||||
67 21295300 1
|
|
||||||
68 21295300 1
|
|
||||||
69 5956380 1
|
|
||||||
69 4414547 1
|
|
||||||
71 1127562 1
|
|
||||||
73 6076903 1
|
|
||||||
74 4387784 1
|
|
||||||
76 5531479 1
|
|
||||||
77 5531479 1
|
|
||||||
78 5099266 1
|
|
||||||
79 5099266 1
|
|
||||||
80 4920376 1
|
|
||||||
81 1797622 1
|
|
||||||
82 3619372 1
|
|
||||||
85 7521113 1
|
|
||||||
85 22406695 1
|
|
||||||
86 7521113 1
|
|
||||||
86 22406695 1
|
|
||||||
88 7521113 1
|
|
||||||
88 22406695 1
|
|
||||||
89 7521113 1
|
|
||||||
89 22406695 1
|
|
||||||
90 22406695 1
|
|
||||||
91 1084345 1
|
|
||||||
92 1084345 1
|
|
||||||
93 2692522 1
|
|
||||||
95 1215116 1
|
|
||||||
96 14500725 1
|
|
||||||
98 6540064 1
|
|
||||||
104 40164383 1
|
|
||||||
105 36606083 1
|
|
||||||
106 25515907 1
|
|
||||||
106 5151024 1
|
|
||||||
108 6191684 1
|
|
||||||
108 22995579 1
|
|
||||||
108 23865182 1
|
|
||||||
109 4319174 1
|
|
||||||
111 13513790 1
|
|
||||||
112 6157837 1
|
|
||||||
114 33872649 1
|
|
||||||
116 33872649 1
|
|
||||||
119 14606752 1
|
|
||||||
120 14606752 1
|
|
||||||
121 31460499 1
|
|
||||||
122 31460499 1
|
|
||||||
123 4883040 1
|
|
||||||
126 24512064 1
|
|
||||||
134 4695046 1
|
|
||||||
138 26016929 1
|
|
||||||
139 22080671 1
|
|
||||||
144 10582939 1
|
|
||||||
149 6227220 1
|
|
||||||
152 15488881 1
|
|
||||||
153 4702639 1
|
|
||||||
154 4702639 1
|
|
||||||
155 37549932 1
|
|
||||||
156 37549932 1
|
|
||||||
157 13439128 1
|
|
||||||
159 9394119 1
|
|
||||||
160 52874170 1
|
|
||||||
161 6903077 1
|
|
||||||
164 5824985 1
|
|
||||||
165 5824985 1
|
|
||||||
166 18872233 1
|
|
||||||
167 18872233 1
|
|
||||||
168 5824985 1
|
|
||||||
169 5824985 1
|
|
||||||
172 12670680 1
|
|
||||||
173 8126244 1
|
|
||||||
174 1710116 1
|
|
||||||
175 1710116 1
|
|
||||||
176 32587939 1
|
|
||||||
177 9669099 1
|
|
||||||
178 16322674 1
|
|
||||||
178 27123743 1
|
|
||||||
178 23557241 1
|
|
||||||
178 17450673 1
|
|
||||||
181 16966326 1
|
|
||||||
182 11369420 1
|
|
||||||
184 12827098 1
|
|
||||||
186 16855829 1
|
|
||||||
187 16855829 1
|
|
||||||
189 4421578 1
|
|
||||||
196 19313533 1
|
|
||||||
197 2177022 1
|
|
||||||
199 2177022 1
|
|
||||||
200 18231807 1
|
|
||||||
201 2462673 1
|
|
||||||
203 9558539 1
|
|
||||||
204 7898952 1
|
|
||||||
205 7898952 1
|
|
||||||
205 470625 1
|
|
||||||
209 32587939 1
|
|
||||||
210 13794374 1
|
|
||||||
211 13794374 1
|
|
||||||
214 13625993 1
|
|
||||||
220 19205437 1
|
|
||||||
221 19205437 1
|
|
||||||
222 19205437 1
|
|
||||||
223 2014909 1
|
|
||||||
224 6944800 1
|
|
||||||
225 6944800 1
|
|
||||||
226 6944800 1
|
|
||||||
227 26973393 1
|
|
||||||
228 4928057 1
|
|
||||||
229 56893404 1
|
|
||||||
235 4388470 1
|
|
||||||
241 2212067 1
|
|
||||||
241 10608822 1
|
|
||||||
242 2212067 1
|
|
||||||
242 10608822 1
|
|
||||||
243 8148122 1
|
|
||||||
244 21498497 1
|
|
||||||
245 8447873 1
|
|
||||||
245 3430789 1
|
|
||||||
246 8447873 1
|
|
||||||
246 3430789 1
|
|
||||||
247 13578199 1
|
|
||||||
250 1568684 1
|
|
||||||
251 1568684 1
|
|
||||||
253 37424881 1
|
|
||||||
254 37424881 1
|
|
||||||
255 5850219 1
|
|
||||||
256 5850219 1
|
|
||||||
258 22080671 1
|
|
||||||
259 8883846 1
|
|
||||||
262 14610165 1
|
|
||||||
263 11328820 1
|
|
||||||
263 30041340 1
|
|
||||||
263 14853989 1
|
|
||||||
264 11328820 1
|
|
||||||
265 2033917 1
|
|
||||||
266 22405338 1
|
|
||||||
267 5912283 1
|
|
||||||
267 31554917 1
|
|
||||||
272 11614737 1
|
|
||||||
277 14376683 1
|
|
||||||
278 14376683 1
|
|
||||||
280 25001628 1
|
|
||||||
281 4632921 1
|
|
||||||
283 1974176 1
|
|
||||||
285 5548081 1
|
|
||||||
286 4709641 1
|
|
||||||
287 4709641 1
|
|
||||||
290 15048300 1
|
|
||||||
292 15048300 1
|
|
||||||
293 10874408 1
|
|
||||||
296 4398832 1
|
|
||||||
299 39381118 1
|
|
||||||
301 3553087 1
|
|
||||||
304 14797520 1
|
|
||||||
305 14797520 1
|
|
||||||
306 7821634 1
|
|
||||||
308 7821634 1
|
|
||||||
309 7821634 1
|
|
||||||
310 6173523 1
|
|
||||||
313 6173523 1
|
|
||||||
315 3701541 1
|
|
||||||
316 712078 1
|
|
||||||
317 4506414 1
|
|
||||||
323 2014909 1
|
|
||||||
325 40349336 1
|
|
||||||
326 40349336 1
|
|
||||||
330 9505448 1
|
|
||||||
331 9505448 1
|
|
||||||
332 29023309 1
|
|
||||||
333 29023309 1
|
|
||||||
334 25079962 1
|
|
||||||
335 1780819 1
|
|
||||||
336 2097256 1
|
|
||||||
337 2097256 1
|
|
||||||
339 23349986 1
|
|
||||||
340 7098463 1
|
|
||||||
341 7098463 1
|
|
||||||
342 7873737 1
|
|
||||||
342 5884524 1
|
|
||||||
345 4394817 1
|
|
||||||
346 11902109 1
|
|
||||||
347 11902109 1
|
|
||||||
349 13497630 1
|
|
||||||
351 14658685 1
|
|
||||||
352 14658685 1
|
|
||||||
355 12800122 1
|
|
||||||
355 38380061 1
|
|
||||||
356 6144337 1
|
|
||||||
357 18111172 1
|
|
||||||
358 18111172 1
|
|
||||||
361 38587347 1
|
|
||||||
363 5386514 1
|
|
||||||
364 1550937 1
|
|
||||||
365 600437 1
|
|
||||||
366 13956305 1
|
|
||||||
367 27099731 1
|
|
||||||
368 27099731 1
|
|
||||||
369 6826100 1
|
|
||||||
370 1550937 1
|
|
||||||
371 1550937 1
|
|
||||||
372 24922825 1
|
|
||||||
375 1522647 1
|
|
||||||
376 22401061 1
|
|
||||||
377 18810195 1
|
|
||||||
378 45154987 1
|
|
||||||
378 10534299 1
|
|
||||||
378 11886686 1
|
|
||||||
378 25007443 1
|
|
||||||
378 17150648 1
|
|
||||||
379 19005293 1
|
|
||||||
381 18340282 1
|
|
||||||
382 11659421 1
|
|
||||||
383 13770184 1
|
|
||||||
389 1148122 1
|
|
||||||
390 1148122 1
|
|
||||||
391 1148122 1
|
|
||||||
392 1148122 1
|
|
||||||
393 1148122 1
|
|
||||||
394 11360768 1
|
|
||||||
396 1456068 1
|
|
||||||
397 1456068 1
|
|
||||||
398 8883846 1
|
|
||||||
400 791050 1
|
|
||||||
401 5633876 1
|
|
||||||
403 1921218 1
|
|
||||||
404 1921218 1
|
|
||||||
406 6796297 1
|
|
||||||
407 9889151 1
|
|
||||||
413 6309659 1
|
|
||||||
414 6309659 1
|
|
||||||
416 6309659 1
|
|
||||||
417 6309659 1
|
|
||||||
418 16660256 1
|
|
||||||
420 9315213 1
|
|
||||||
422 11172205 1
|
|
||||||
423 8595678 1
|
|
||||||
425 33257464 1
|
|
||||||
426 16728949 1
|
|
||||||
428 16728949 1
|
|
||||||
429 36540079 1
|
|
||||||
430 28937856 1
|
|
||||||
432 8002887 1
|
|
||||||
434 9500590 1
|
|
||||||
435 9500590 1
|
|
||||||
441 2014909 1
|
|
||||||
444 10165258 1
|
|
||||||
445 10165258 1
|
|
||||||
447 2052720 1
|
|
||||||
448 2052720 1
|
|
||||||
449 12209494 1
|
|
||||||
449 3430789 1
|
|
||||||
453 4200695 1
|
|
||||||
454 4200695 1
|
|
||||||
455 12643937 1
|
|
||||||
456 30507607 1
|
|
||||||
458 597790 1
|
|
||||||
461 40096222 1
|
|
||||||
463 19736671 1
|
|
||||||
466 22544171 1
|
|
||||||
469 1410197 1
|
|
||||||
470 12685434 1
|
|
||||||
472 7185591 1
|
|
||||||
472 26330861 1
|
|
||||||
472 4414481 1
|
|
||||||
473 4373433 1
|
|
||||||
474 4373433 1
|
|
||||||
479 6325527 1
|
|
||||||
480 6325527 1
|
|
||||||
481 14706752 1
|
|
||||||
482 10991183 1
|
|
||||||
483 22703082 1
|
|
||||||
484 14637235 1
|
|
||||||
485 14637235 1
|
|
||||||
486 14637235 1
|
|
||||||
487 14637235 1
|
|
||||||
488 1780819 1
|
|
||||||
489 6625693 1
|
|
||||||
490 56893404 1
|
|
||||||
492 19583924 1
|
|
||||||
493 19583924 1
|
|
||||||
494 34873974 1
|
|
||||||
495 17077004 1
|
|
||||||
498 17077004 1
|
|
||||||
499 26064662 1
|
|
||||||
500 17930286 1
|
|
||||||
504 10883736 1
|
|
||||||
505 22703082 1
|
|
||||||
506 7433668 1
|
|
||||||
509 13980338 1
|
|
||||||
515 29564505 1
|
|
||||||
523 14803797 1
|
|
||||||
524 14803797 1
|
|
||||||
526 3863543 1
|
|
||||||
529 10546779 1
|
|
||||||
529 25413327 1
|
|
||||||
529 36651210 1
|
|
||||||
530 10546779 1
|
|
||||||
530 25413327 1
|
|
||||||
530 36651210 1
|
|
||||||
530 87610599 1
|
|
||||||
531 10546779 1
|
|
||||||
531 25413327 1
|
|
||||||
531 36651210 1
|
|
||||||
537 16056514 1
|
|
||||||
541 45154987 1
|
|
||||||
541 11886686 1
|
|
||||||
541 25007443 1
|
|
||||||
542 19688024 1
|
|
||||||
545 24221369 1
|
|
||||||
547 10648422 1
|
|
||||||
548 18199839 1
|
|
||||||
550 33499189 1
|
|
||||||
553 1471041 1
|
|
||||||
555 1049501 1
|
|
||||||
557 1049501 1
|
|
||||||
559 3475317 1
|
|
||||||
562 20101846 1
|
|
||||||
563 2867345 1
|
|
||||||
564 2867345 1
|
|
||||||
565 16120395 1
|
|
||||||
566 16120395 1
|
|
||||||
568 23418635 1
|
|
||||||
570 20333864 1
|
|
||||||
571 20333864 1
|
|
||||||
572 4447055 1
|
|
||||||
573 10300888 1
|
|
||||||
574 10300888 1
|
|
||||||
576 4468861 1
|
|
||||||
579 34139429 1
|
|
||||||
580 23460562 1
|
|
||||||
582 14260013 1
|
|
||||||
584 14260013 1
|
|
||||||
585 42291761 1
|
|
||||||
588 16999023 1
|
|
||||||
590 10984005 1
|
|
||||||
591 14682243 1
|
|
||||||
592 14682243 1
|
|
||||||
594 19675911 1
|
|
||||||
595 4824840 1
|
|
||||||
600 12258338 1
|
|
||||||
601 12258338 1
|
|
||||||
602 3701541 1
|
|
||||||
603 6540064 1
|
|
||||||
606 712078 1
|
|
||||||
607 4506414 1
|
|
||||||
609 40096222 1
|
|
||||||
610 40096222 1
|
|
||||||
611 32408470 1
|
|
||||||
612 9638032 1
|
|
||||||
614 9638032 1
|
|
||||||
615 9638032 1
|
|
||||||
616 18670 1
|
|
||||||
617 18670 1
|
|
||||||
618 6836086 1
|
|
||||||
620 2565138 1
|
|
||||||
621 1642727 1
|
|
||||||
622 17000834 1
|
|
||||||
624 20033112 1
|
|
||||||
625 20033112 1
|
|
||||||
626 16355392 1
|
|
||||||
631 5468807 1
|
|
||||||
632 5172048 1
|
|
||||||
633 5172048 1
|
|
||||||
635 1686997 1
|
|
||||||
638 25649714 1
|
|
||||||
640 6503185 1
|
|
||||||
642 13619127 1
|
|
||||||
643 15535511 1
|
|
||||||
645 12810152 1
|
|
||||||
646 12810152 1
|
|
||||||
647 15041758 1
|
|
||||||
648 15041758 1
|
|
||||||
650 12789595 1
|
|
||||||
651 9433958 1
|
|
||||||
652 9433958 1
|
|
||||||
653 24384587 1
|
|
||||||
654 57574395 1
|
|
||||||
655 57574395 1
|
|
||||||
657 8533245 1
|
|
||||||
658 5293024 1
|
|
||||||
661 37204802 1
|
|
||||||
662 37204802 1
|
|
||||||
663 22080671 1
|
|
||||||
665 12580014 1
|
|
||||||
666 4469125 1
|
|
||||||
667 6493422 1
|
|
||||||
668 6493422 1
|
|
||||||
668 25148216 1
|
|
||||||
669 6493422 1
|
|
||||||
669 25148216 1
|
|
||||||
670 5573975 1
|
|
||||||
671 5573975 1
|
|
||||||
672 15635366 1
|
|
||||||
673 2095573 1
|
|
||||||
676 857189 1
|
|
||||||
677 857189 1
|
|
||||||
679 13639330 1
|
|
||||||
680 9315213 1
|
|
||||||
681 9315213 1
|
|
||||||
682 9315213 1
|
|
||||||
683 9315213 1
|
|
||||||
685 4452659 1
|
|
||||||
686 4452659 1
|
|
||||||
687 4452659 1
|
|
||||||
688 4452659 1
|
|
||||||
689 22080671 1
|
|
||||||
694 1071991 1
|
|
||||||
696 16355392 1
|
|
||||||
698 22544171 1
|
|
||||||
703 4350400 1
|
|
||||||
704 14658685 1
|
|
||||||
705 22442133 1
|
|
||||||
709 22442133 1
|
|
||||||
710 22442133 1
|
|
||||||
713 18421962 1
|
|
||||||
714 18421962 1
|
|
||||||
717 17587795 1
|
|
||||||
724 5531479 1
|
|
||||||
726 7521113 1
|
|
||||||
726 36444198 1
|
|
||||||
730 13400643 1
|
|
||||||
732 34469966 1
|
|
||||||
733 34469966 1
|
|
||||||
734 4961038 1
|
|
||||||
736 5389095 1
|
|
||||||
737 16562534 1
|
|
||||||
737 6609935 1
|
|
||||||
738 16562534 1
|
|
||||||
738 6609935 1
|
|
||||||
738 33912020 1
|
|
||||||
739 4446814 1
|
|
||||||
740 23078022 1
|
|
||||||
745 11291348 1
|
|
||||||
746 11291348 1
|
|
||||||
747 11291348 1
|
|
||||||
748 11291348 1
|
|
||||||
749 13868795 1
|
|
||||||
751 19800147 1
|
|
||||||
752 19800147 1
|
|
||||||
753 1173667 1
|
|
||||||
755 17844478 1
|
|
||||||
757 17123657 1
|
|
||||||
758 14195528 1
|
|
||||||
760 1805641 1
|
|
||||||
761 10009203 1
|
|
||||||
762 4695046 1
|
|
||||||
764 7552215 1
|
|
||||||
765 7552215 1
|
|
||||||
766 7552215 1
|
|
||||||
767 2488880 1
|
|
||||||
771 15476777 1
|
|
||||||
772 24922825 1
|
|
||||||
774 32275758 1
|
|
||||||
776 32275758 1
|
|
||||||
777 32275758 1
|
|
||||||
778 13001323 1
|
|
||||||
779 13001323 1
|
|
||||||
780 8246922 1
|
|
||||||
780 24338780 1
|
|
||||||
782 8246922 1
|
|
||||||
787 4740447 1
|
|
||||||
788 4740447 1
|
|
||||||
789 15493354 1
|
|
||||||
790 15493354 1
|
|
||||||
791 15984735 1
|
|
||||||
792 3610080 1
|
|
||||||
795 8551160 1
|
|
||||||
797 8551160 1
|
|
||||||
798 8551160 1
|
|
||||||
799 5293024 1
|
|
||||||
801 22180793 1
|
|
||||||
802 22180793 1
|
|
||||||
803 22180793 1
|
|
||||||
804 22180793 1
|
|
||||||
807 36606083 1
|
|
||||||
810 13513790 1
|
|
||||||
812 19799455 1
|
|
||||||
813 33387953 1
|
|
||||||
815 8148304 1
|
|
||||||
816 8148304 1
|
|
||||||
817 17814815 1
|
|
||||||
818 17814815 1
|
|
||||||
822 15319019 1
|
|
||||||
825 15319019 1
|
|
||||||
826 4678846 1
|
|
||||||
828 4678846 1
|
|
||||||
835 15928989 1
|
|
||||||
838 15928989 1
|
|
||||||
840 15663829 1
|
|
||||||
841 15663829 1
|
|
||||||
844 17741440 1
|
|
||||||
846 22696649 1
|
|
||||||
848 14500725 1
|
|
||||||
853 24922825 1
|
|
||||||
854 12206390 1
|
|
||||||
855 8190282 1
|
|
||||||
856 43334921 1
|
|
||||||
857 43334921 1
|
|
||||||
858 1982286 1
|
|
||||||
860 16066726 1
|
|
||||||
861 16066726 1
|
|
||||||
863 20568364 1
|
|
||||||
863 16361581 1
|
|
||||||
866 37822406 1
|
|
||||||
867 14340571 1
|
|
||||||
871 195689316 1
|
|
||||||
876 195689316 1
|
|
||||||
877 313394 1
|
|
||||||
881 14803797 1
|
|
||||||
883 14803797 1
|
|
||||||
884 14803797 1
|
|
||||||
885 6477536 1
|
|
||||||
886 6477536 1
|
|
||||||
890 2097256 1
|
|
||||||
891 2097256 1
|
|
||||||
893 13509809 1
|
|
||||||
894 14724693 1
|
|
||||||
895 18750453 1
|
|
||||||
896 14338915 1
|
|
||||||
897 14338915 1
|
|
||||||
898 13106686 1
|
|
||||||
898 5572127 1
|
|
||||||
899 13106686 1
|
|
||||||
899 5572127 1
|
|
||||||
900 18678095 1
|
|
||||||
901 6540064 1
|
|
||||||
902 10648422 1
|
|
||||||
908 6923961 1
|
|
||||||
909 11254556 1
|
|
||||||
910 11254556 1
|
|
||||||
912 11254556 1
|
|
||||||
916 18037805 1
|
|
||||||
917 34071621 1
|
|
||||||
919 16422880 1
|
|
||||||
923 17077004 1
|
|
||||||
925 17077004 1
|
|
||||||
926 16390264 1
|
|
||||||
927 16390264 1
|
|
||||||
928 18174210 1
|
|
||||||
929 18174210 1
|
|
||||||
930 16056514 1
|
|
||||||
933 14711483 1
|
|
||||||
934 8563659 1
|
|
||||||
935 5483793 1
|
|
||||||
938 26231129 1
|
|
||||||
939 26231129 1
|
|
||||||
940 12258338 1
|
|
||||||
941 12258338 1
|
|
||||||
942 11527199 1
|
|
||||||
944 1642727 1
|
|
||||||
945 8428935 1
|
|
||||||
945 26112696 1
|
|
||||||
945 4463588 1
|
|
||||||
945 13083189 1
|
|
||||||
946 8428935 1
|
|
||||||
946 26112696 1
|
|
||||||
946 4463588 1
|
|
||||||
946 13083189 1
|
|
||||||
949 13578199 1
|
|
||||||
951 21414718 1
|
|
||||||
952 3355397 1
|
|
||||||
953 3355397 1
|
|
||||||
954 3355397 1
|
|
||||||
955 2078658 1
|
|
||||||
955 30507607 1
|
|
||||||
959 8780599 1
|
|
||||||
962 13931771 1
|
|
||||||
962 935538 1
|
|
||||||
962 4306711 1
|
|
||||||
963 4162857 1
|
|
||||||
963 29828242 1
|
|
||||||
964 4162857 1
|
|
||||||
964 29828242 1
|
|
||||||
965 40817021 1
|
|
||||||
969 19356271 1
|
|
||||||
969 17368516 1
|
|
||||||
970 19356271 1
|
|
||||||
970 17368516 1
|
|
||||||
972 46695481 1
|
|
||||||
972 27873158 1
|
|
||||||
972 28617573 1
|
|
||||||
972 9764256 1
|
|
||||||
973 27446873 1
|
|
||||||
973 27873158 1
|
|
||||||
973 28617573 1
|
|
||||||
973 9764256 1
|
|
||||||
976 5304891 1
|
|
||||||
977 14075252 1
|
|
||||||
977 39264456 1
|
|
||||||
978 14075252 1
|
|
||||||
979 11659421 1
|
|
||||||
980 20128547 1
|
|
||||||
984 6828370 1
|
|
||||||
988 3033830 1
|
|
||||||
989 9988425 1
|
|
||||||
990 16472469 1
|
|
||||||
992 16472469 1
|
|
||||||
994 16472469 1
|
|
||||||
996 16472469 1
|
|
||||||
997 16472469 1
|
|
||||||
998 16472469 1
|
|
||||||
999 16472469 1
|
|
||||||
1000 16472469 1
|
|
||||||
1001 5702790 1
|
|
||||||
1002 13639330 1
|
|
||||||
1003 14332945 1
|
|
||||||
1003 4319844 1
|
|
||||||
1003 4899981 1
|
|
||||||
1004 301838 1
|
|
||||||
1004 2734421 1
|
|
||||||
1004 3952288 1
|
|
||||||
1005 301838 1
|
|
||||||
1005 2734421 1
|
|
||||||
1005 3952288 1
|
|
||||||
1006 4926049 1
|
|
||||||
1008 2547636 1
|
|
||||||
1009 1982286 1
|
|
||||||
1011 9745001 1
|
|
||||||
1015 6277638 1
|
|
||||||
1016 6277638 1
|
|
||||||
1018 11603066 1
|
|
||||||
1023 16927286 1
|
|
||||||
1025 32408470 1
|
|
||||||
1026 3113630 1
|
|
||||||
1027 3113630 1
|
|
||||||
1028 13923140 1
|
|
||||||
1028 11899391 1
|
|
||||||
1030 6441369 1
|
|
||||||
1031 12486491 1
|
|
||||||
1032 6836086 1
|
|
||||||
1033 6836086 1
|
|
||||||
1034 4547102 1
|
|
||||||
1035 4547102 1
|
|
||||||
1036 4547102 1
|
|
||||||
1037 16287725 1
|
|
||||||
1038 16287725 1
|
|
||||||
1040 25254425 1
|
|
||||||
1040 16626264 1
|
|
||||||
1042 17421851 1
|
|
||||||
1043 17671145 1
|
|
||||||
1044 22500262 1
|
|
||||||
1045 22500262 1
|
|
||||||
1046 418246 1
|
|
||||||
1046 4324278 1
|
|
||||||
1046 16712164 1
|
|
||||||
1047 14706752 1
|
|
||||||
1048 12486491 1
|
|
||||||
1050 19878070 1
|
|
||||||
1052 18816720 1
|
|
||||||
1053 18816720 1
|
|
||||||
1054 10072941 1
|
|
||||||
1055 13906581 1
|
|
||||||
1056 4200695 1
|
|
||||||
1058 13027590 1
|
|
||||||
1065 20418809 1
|
|
||||||
1067 4429668 1
|
|
||||||
1068 4429668 1
|
|
||||||
1069 4200695 1
|
|
||||||
1070 25649714 1
|
|
||||||
1072 4824840 1
|
|
||||||
1073 4824840 1
|
|
||||||
1074 14658685 1
|
|
||||||
1075 14658685 1
|
|
||||||
1081 5691302 1
|
|
||||||
1084 5691302 1
|
|
||||||
1085 5691302 1
|
|
||||||
1087 39281140 1
|
|
||||||
1090 17628888 1
|
|
||||||
1091 2603304 1
|
|
||||||
1096 29638116 1
|
|
||||||
1097 26851674 1
|
|
||||||
1098 13552682 1
|
|
||||||
1101 3874000 1
|
|
||||||
1102 3874000 1
|
|
||||||
1103 3898784 1
|
|
||||||
1105 6710713 1
|
|
||||||
1106 6710713 1
|
|
||||||
1109 13770184 1
|
|
||||||
1109 8582337 1
|
|
||||||
1111 1686881 1
|
|
||||||
1112 1686881 1
|
|
||||||
1114 12824568 1
|
|
||||||
1115 44048701 1
|
|
||||||
1118 23351136 1
|
|
||||||
1119 5323845 1
|
|
||||||
1119 18997216 1
|
|
||||||
1119 13907928 1
|
|
||||||
1120 5323845 1
|
|
||||||
1120 18997216 1
|
|
||||||
1120 13907928 1
|
|
||||||
1125 21009874 1
|
|
||||||
1126 21009874 1
|
|
||||||
1127 27466734 1
|
|
||||||
1128 33499189 1
|
|
||||||
1128 9283422 1
|
|
||||||
1133 24142891 1
|
|
||||||
1134 33370 1
|
|
||||||
1135 33370 1
|
|
||||||
1136 33370 1
|
|
||||||
1138 6796297 1
|
|
||||||
1139 12009265 1
|
|
||||||
1141 12009265 1
|
|
||||||
1142 5260382 1
|
|
||||||
1145 10071552 1
|
|
||||||
1148 4828631 1
|
|
||||||
1153 7370282 1
|
|
||||||
1156 12584053 1
|
|
||||||
1157 12584053 1
|
|
||||||
1158 12584053 1
|
|
||||||
1159 12584053 1
|
|
||||||
1161 13048272 1
|
|
||||||
1162 15305881 1
|
|
||||||
1164 4455466 1
|
|
||||||
1165 4455466 1
|
|
||||||
1166 9889151 1
|
|
||||||
1168 8563659 1
|
|
||||||
1169 4319174 1
|
|
||||||
1170 18956141 1
|
|
||||||
1171 18956141 1
|
|
||||||
1173 7370282 1
|
|
||||||
1174 31272411 1
|
|
||||||
1176 13910150 1
|
|
||||||
1177 13910150 1
|
|
||||||
1178 31272411 1
|
|
||||||
1181 301838 1
|
|
||||||
1181 2734421 1
|
|
||||||
1181 39128592 1
|
|
||||||
1181 3952288 1
|
|
||||||
1182 14541844 1
|
|
||||||
1183 1967017 1
|
|
||||||
1184 16737210 1
|
|
||||||
1186 7485455 1
|
|
||||||
1188 4394817 1
|
|
||||||
1190 30655442 1
|
|
||||||
1193 20532591 1
|
|
||||||
1195 26283293 1
|
|
||||||
1205 5558754 1
|
|
||||||
1206 18909530 1
|
|
||||||
1208 10284593 1
|
|
||||||
1209 4347374 1
|
|
||||||
1210 4928282 1
|
|
||||||
1211 4928282 1
|
|
||||||
1212 6493422 1
|
|
||||||
1212 44724517 1
|
|
||||||
1214 6493422 1
|
|
||||||
1214 14407673 1
|
|
||||||
1215 16355392 1
|
|
||||||
1218 15635366 1
|
|
||||||
1219 9393969 1
|
|
||||||
1219 14864285 1
|
|
||||||
1220 13023410 1
|
|
||||||
1223 5289038 1
|
|
||||||
1224 21932050 1
|
|
||||||
1224 34016987 1
|
|
||||||
1227 25641414 1
|
|
||||||
1228 25641414 1
|
|
||||||
1229 1676568 1
|
|
||||||
1230 13905670 1
|
|
||||||
1231 13905670 1
|
|
||||||
1234 13905670 1
|
|
||||||
1235 17973161 1
|
|
||||||
1236 17973161 1
|
|
||||||
1237 3654468 1
|
|
||||||
1238 3654468 1
|
|
||||||
1239 21387297 1
|
|
||||||
1239 4427392 1
|
|
||||||
1244 18949516 1
|
|
||||||
1246 7662395 1
|
|
||||||
1247 5114282 1
|
|
||||||
1248 7209559 1
|
|
||||||
1249 7209559 1
|
|
||||||
1253 3321943 1
|
|
||||||
1254 16939583 1
|
|
||||||
1255 16939583 1
|
|
||||||
1257 581832 1
|
|
||||||
1258 12040627 1
|
|
||||||
1260 24341590 1
|
|
||||||
1261 13023410 1
|
|
||||||
1263 3981729 1
|
|
||||||
1265 37480103 1
|
|
||||||
1268 52072815 1
|
|
||||||
1269 13900610 1
|
|
||||||
1275 27731651 1
|
|
||||||
1276 3475317 1
|
|
||||||
1284 3578380 1
|
|
||||||
1288 4687948 1
|
|
||||||
1289 21239672 1
|
|
||||||
1291 56893404 1
|
|
||||||
1293 43329366 1
|
|
||||||
1294 2078658 1
|
|
||||||
1294 30507607 1
|
|
||||||
1295 21239672 1
|
|
||||||
1297 9167230 1
|
|
||||||
1300 6421792 1
|
|
||||||
1302 12631697 1
|
|
||||||
1304 12631697 1
|
|
||||||
1305 12631697 1
|
|
||||||
1306 6000423 1
|
|
||||||
1306 5836 1
|
|
||||||
1307 18231807 1
|
|
||||||
1308 18231807 1
|
|
||||||
1309 18231807 1
|
|
||||||
1310 8042158 1
|
|
||||||
1311 13763195 1
|
|
||||||
1312 24177706 1
|
|
||||||
1314 13072112 1
|
|
||||||
1314 16237005 1
|
|
||||||
1315 13072112 1
|
|
||||||
1315 16237005 1
|
|
||||||
1322 16284655 1
|
|
||||||
1323 19912367 1
|
|
||||||
1324 19912367 1
|
|
||||||
1325 40476126 1
|
|
||||||
1327 24241932 1
|
|
||||||
1327 22194407 1
|
|
||||||
1328 3475317 1
|
|
||||||
1330 14075252 1
|
|
||||||
1331 14075252 1
|
|
||||||
1333 1649738 1
|
|
||||||
1334 13923140 1
|
|
||||||
1334 13940200 1
|
|
||||||
1334 11899391 1
|
|
||||||
1340 15482274 1
|
|
||||||
1341 15482274 1
|
|
||||||
1342 8148122 1
|
|
||||||
1345 9559146 1
|
|
||||||
1346 9505402 1
|
|
||||||
1347 19005293 1
|
|
||||||
1348 19005293 1
|
|
||||||
1349 5377642 1
|
|
||||||
1350 5377642 1
|
|
||||||
1351 28369117 1
|
|
||||||
1353 18816720 1
|
|
||||||
1355 5256564 1
|
|
||||||
1356 13764090 1
|
|
||||||
1360 11614737 1
|
|
||||||
1361 15488881 1
|
|
||||||
1361 15058155 1
|
|
||||||
1364 8290953 1
|
|
||||||
1366 4406819 1
|
|
||||||
1367 2425364 1
|
|
||||||
1371 16256507 1
|
|
||||||
1372 21003930 1
|
|
||||||
1373 21003930 1
|
|
||||||
1374 21993510 1
|
|
||||||
1375 21993510 1
|
|
||||||
1376 3944632 1
|
|
||||||
1378 2488880 1
|
|
||||||
1380 16322674 1
|
|
||||||
1380 23557241 1
|
|
||||||
1380 17450673 1
|
|
||||||
1381 13481880 1
|
|
||||||
1383 17755060 1
|
|
||||||
1386 306006 1
|
|
||||||
1387 9669099 1
|
|
||||||
1390 2890952 1
|
|
||||||
1391 6766459 1
|
|
||||||
1392 6766459 1
|
|
||||||
1393 2000038 1
|
|
||||||
1393 12440953 1
|
|
||||||
1394 2251426 1
|
|
||||||
1397 17717391 1
|
|
||||||
1398 17717391 1
|
|
||||||
1400 14706752 1
|
|
||||||
1401 5185871 1
|
|
||||||
1402 8126244 1
|
|
||||||
1403 33370 1
|
|
||||||
1403 38355793 1
|
|
||||||
1404 33370 1
|
|
||||||
1404 38355793 1
|
|
||||||
1405 10504681 1
|
|
||||||
1406 2617858 1
|
|
||||||
1407 8087082 1
|
|
||||||
1407 29863668 1
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
|
|
@ -1,54 +0,0 @@
|
||||||
{
|
|
||||||
"qwen3-0.6B-emb:latest": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.94971,
|
|
||||||
"NDCG@3": 0.96956,
|
|
||||||
"NDCG@5": 0.97166,
|
|
||||||
"NDCG@10": 0.97342
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.94971,
|
|
||||||
"MAP@3": 0.96504,
|
|
||||||
"MAP@5": 0.9662,
|
|
||||||
"MAP@10": 0.96694
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.94971,
|
|
||||||
"Recall@3": 0.98251,
|
|
||||||
"Recall@5": 0.98761,
|
|
||||||
"Recall@10": 0.99297
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.94971,
|
|
||||||
"P@3": 0.3275,
|
|
||||||
"P@5": 0.19752,
|
|
||||||
"P@10": 0.0993
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"qwen2.5:1.5b": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.00031,
|
|
||||||
"NDCG@3": 0.00061,
|
|
||||||
"NDCG@5": 0.00086,
|
|
||||||
"NDCG@10": 0.00118
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.00031,
|
|
||||||
"MAP@3": 0.00051,
|
|
||||||
"MAP@5": 0.00065,
|
|
||||||
"MAP@10": 0.00078
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.00031,
|
|
||||||
"Recall@3": 0.00088,
|
|
||||||
"Recall@5": 0.00151,
|
|
||||||
"Recall@10": 0.0025
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.00031,
|
|
||||||
"P@3": 0.00029,
|
|
||||||
"P@5": 0.0003,
|
|
||||||
"P@10": 0.00025
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,62 +0,0 @@
|
||||||
{
|
|
||||||
"qwen3-0.6B-emb:latest": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.56333,
|
|
||||||
"NDCG@3": 0.64367,
|
|
||||||
"NDCG@5": 0.66577,
|
|
||||||
"NDCG@10": 0.68551,
|
|
||||||
"NDCG@100": 0.71285
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.52994,
|
|
||||||
"MAP@3": 0.6117,
|
|
||||||
"MAP@5": 0.62815,
|
|
||||||
"MAP@10": 0.6383,
|
|
||||||
"MAP@100": 0.64466
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.52994,
|
|
||||||
"Recall@3": 0.7035,
|
|
||||||
"Recall@5": 0.75967,
|
|
||||||
"Recall@10": 0.81611,
|
|
||||||
"Recall@100": 0.94
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.56333,
|
|
||||||
"P@3": 0.25889,
|
|
||||||
"P@5": 0.17067,
|
|
||||||
"P@10": 0.093,
|
|
||||||
"P@100": 0.0107
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"qwen2.5:1.5b": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.02333,
|
|
||||||
"NDCG@3": 0.03498,
|
|
||||||
"NDCG@5": 0.0404,
|
|
||||||
"NDCG@10": 0.04619,
|
|
||||||
"NDCG@100": 0.07768
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.02083,
|
|
||||||
"MAP@3": 0.03083,
|
|
||||||
"MAP@5": 0.03375,
|
|
||||||
"MAP@10": 0.03632,
|
|
||||||
"MAP@100": 0.04123
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.02083,
|
|
||||||
"Recall@3": 0.04417,
|
|
||||||
"Recall@5": 0.0575,
|
|
||||||
"Recall@10": 0.07417,
|
|
||||||
"Recall@100": 0.23144
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.02333,
|
|
||||||
"P@3": 0.01556,
|
|
||||||
"P@5": 0.01267,
|
|
||||||
"P@10": 0.00833,
|
|
||||||
"P@100": 0.00277
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,62 +0,0 @@
|
||||||
{
|
|
||||||
"qwen3-0.6B-emb:latest": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.174,
|
|
||||||
"NDCG@3": 0.27374,
|
|
||||||
"NDCG@5": 0.33509,
|
|
||||||
"NDCG@10": 0.39086,
|
|
||||||
"NDCG@100": 0.45099
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.174,
|
|
||||||
"MAP@3": 0.247,
|
|
||||||
"MAP@5": 0.2808,
|
|
||||||
"MAP@10": 0.30466,
|
|
||||||
"MAP@100": 0.31702
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.174,
|
|
||||||
"Recall@3": 0.352,
|
|
||||||
"Recall@5": 0.502,
|
|
||||||
"Recall@10": 0.67,
|
|
||||||
"Recall@100": 0.952
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.174,
|
|
||||||
"P@3": 0.11733,
|
|
||||||
"P@5": 0.1004,
|
|
||||||
"P@10": 0.067,
|
|
||||||
"P@100": 0.00952
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"qwen2.5:1.5b": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.0,
|
|
||||||
"NDCG@3": 0.0,
|
|
||||||
"NDCG@5": 0.0,
|
|
||||||
"NDCG@10": 0.0,
|
|
||||||
"NDCG@100": 0.0021
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.0,
|
|
||||||
"MAP@3": 0.0,
|
|
||||||
"MAP@5": 0.0,
|
|
||||||
"MAP@10": 0.0,
|
|
||||||
"MAP@100": 0.00043
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.0,
|
|
||||||
"Recall@3": 0.0,
|
|
||||||
"Recall@5": 0.0,
|
|
||||||
"Recall@10": 0.0,
|
|
||||||
"Recall@100": 0.01
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.0,
|
|
||||||
"P@3": 0.0,
|
|
||||||
"P@5": 0.0,
|
|
||||||
"P@10": 0.0,
|
|
||||||
"P@100": 0.0001
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,162 +0,0 @@
|
||||||
{
|
|
||||||
"bge-m3:latest": {
|
|
||||||
"scifact": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.51,
|
|
||||||
"NDCG@5": 0.61904,
|
|
||||||
"NDCG@10": 0.64312,
|
|
||||||
"NDCG@100": 0.6705
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.48178,
|
|
||||||
"MAP@5": 0.58023,
|
|
||||||
"MAP@10": 0.59181,
|
|
||||||
"MAP@100": 0.59849
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.48178,
|
|
||||||
"Recall@5": 0.71489,
|
|
||||||
"Recall@10": 0.78344,
|
|
||||||
"Recall@100": 0.90367
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.51,
|
|
||||||
"P@5": 0.15667,
|
|
||||||
"P@10": 0.088,
|
|
||||||
"P@100": 0.01027
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cosqa": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.116,
|
|
||||||
"NDCG@5": 0.23831,
|
|
||||||
"NDCG@10": 0.28783,
|
|
||||||
"NDCG@100": 0.36311
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.116,
|
|
||||||
"MAP@5": 0.19687,
|
|
||||||
"MAP@10": 0.21791,
|
|
||||||
"MAP@100": 0.23272
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.116,
|
|
||||||
"Recall@5": 0.366,
|
|
||||||
"Recall@10": 0.516,
|
|
||||||
"Recall@100": 0.874
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.116,
|
|
||||||
"P@5": 0.0732,
|
|
||||||
"P@10": 0.0516,
|
|
||||||
"P@100": 0.00874
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"codexglue": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.952,
|
|
||||||
"NDCG@5": 0.97379,
|
|
||||||
"NDCG@10": 0.97494,
|
|
||||||
"NDCG@100": 0.97629
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.952,
|
|
||||||
"MAP@5": 0.96849,
|
|
||||||
"MAP@10": 0.96897,
|
|
||||||
"MAP@100": 0.96926
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.952,
|
|
||||||
"Recall@5": 0.98922,
|
|
||||||
"Recall@10": 0.99276,
|
|
||||||
"Recall@100": 0.99885
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.952,
|
|
||||||
"P@5": 0.19784,
|
|
||||||
"P@10": 0.09928,
|
|
||||||
"P@100": 0.00999
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"qwen3-0.6B-emb:latest": {
|
|
||||||
"scifact": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.55333,
|
|
||||||
"NDCG@5": 0.65926,
|
|
||||||
"NDCG@10": 0.67848,
|
|
||||||
"NDCG@100": 0.70557
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.52428,
|
|
||||||
"MAP@5": 0.62128,
|
|
||||||
"MAP@10": 0.63094,
|
|
||||||
"MAP@100": 0.63723
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.52428,
|
|
||||||
"Recall@5": 0.75867,
|
|
||||||
"Recall@10": 0.81444,
|
|
||||||
"Recall@100": 0.93667
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.55333,
|
|
||||||
"P@5": 0.17067,
|
|
||||||
"P@10": 0.093,
|
|
||||||
"P@100": 0.01067
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cosqa": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.174,
|
|
||||||
"NDCG@5": 0.33509,
|
|
||||||
"NDCG@10": 0.39086,
|
|
||||||
"NDCG@100": 0.45099
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.174,
|
|
||||||
"MAP@5": 0.2808,
|
|
||||||
"MAP@10": 0.30466,
|
|
||||||
"MAP@100": 0.31702
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.174,
|
|
||||||
"Recall@5": 0.502,
|
|
||||||
"Recall@10": 0.67,
|
|
||||||
"Recall@100": 0.952
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.174,
|
|
||||||
"P@5": 0.1004,
|
|
||||||
"P@10": 0.067,
|
|
||||||
"P@100": 0.00952
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"codexglue": {
|
|
||||||
"NDCG": {
|
|
||||||
"NDCG@1": 0.94971,
|
|
||||||
"NDCG@5": 0.97166,
|
|
||||||
"NDCG@10": 0.97342,
|
|
||||||
"NDCG@100": 0.97453
|
|
||||||
},
|
|
||||||
"MAP": {
|
|
||||||
"MAP@1": 0.94971,
|
|
||||||
"MAP@5": 0.9662,
|
|
||||||
"MAP@10": 0.96694,
|
|
||||||
"MAP@100": 0.96718
|
|
||||||
},
|
|
||||||
"Recall": {
|
|
||||||
"Recall@1": 0.94971,
|
|
||||||
"Recall@5": 0.98761,
|
|
||||||
"Recall@10": 0.99297,
|
|
||||||
"Recall@100": 0.99807
|
|
||||||
},
|
|
||||||
"Precision": {
|
|
||||||
"P@1": 0.94971,
|
|
||||||
"P@5": 0.19752,
|
|
||||||
"P@10": 0.0993,
|
|
||||||
"P@100": 0.00998
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,333 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "66cbbaf8",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Libraries"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "c01c19dc",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from typing import Dict, List, Union\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from langchain_ollama import OllamaEmbeddings\n",
|
|
||||||
"from beir.datasets.data_loader import GenericDataLoader\n",
|
|
||||||
"from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
|
|
||||||
"from beir.retrieval.evaluation import EvaluateRetrieval\n",
|
|
||||||
"from beir import util\n",
|
|
||||||
"import json\n",
|
|
||||||
"from datasets import load_dataset"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "ac011c1c",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Utils"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "b83e7900",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"class BEIROllamaEmbeddings:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" def __init__(\n",
|
|
||||||
" self,\n",
|
|
||||||
" base_url: str,\n",
|
|
||||||
" model: str,\n",
|
|
||||||
" batch_size: int = 64,\n",
|
|
||||||
" ) -> None:\n",
|
|
||||||
" self.batch_size = batch_size\n",
|
|
||||||
" self.embeddings = OllamaEmbeddings(\n",
|
|
||||||
" base_url=base_url,\n",
|
|
||||||
" model=model,\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" def _batch_embed(self, texts: List[str]) -> np.ndarray:\n",
|
|
||||||
" vectors = []\n",
|
|
||||||
"\n",
|
|
||||||
" for i in range(0, len(texts), self.batch_size):\n",
|
|
||||||
" batch = texts[i : i + self.batch_size]\n",
|
|
||||||
" batch_vectors = self.embeddings.embed_documents(batch)\n",
|
|
||||||
" vectors.extend(batch_vectors)\n",
|
|
||||||
"\n",
|
|
||||||
" return np.asarray(vectors, dtype=np.float32)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR query encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" return self._batch_embed(queries)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_corpus(\n",
|
|
||||||
" self,\n",
|
|
||||||
" corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],\n",
|
|
||||||
" **kwargs,\n",
|
|
||||||
" ) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR corpus encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" if isinstance(corpus, dict):\n",
|
|
||||||
" corpus = list(corpus.values())\n",
|
|
||||||
"\n",
|
|
||||||
" texts = []\n",
|
|
||||||
" for doc in corpus:\n",
|
|
||||||
" title = (doc.get(\"title\") or \"\").strip()\n",
|
|
||||||
" text = (doc.get(\"text\") or \"\").strip()\n",
|
|
||||||
"\n",
|
|
||||||
" if title:\n",
|
|
||||||
" texts.append(f\"{title}\\n{text}\")\n",
|
|
||||||
" else:\n",
|
|
||||||
" texts.append(text)\n",
|
|
||||||
"\n",
|
|
||||||
" return self._batch_embed(texts)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "af3eb66d",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def convert_hf_to_beir(hf_dataset):\n",
|
|
||||||
" corpus, queries, qrels = {}, {}, {}\n",
|
|
||||||
" \n",
|
|
||||||
" for i, data in enumerate(hf_dataset):\n",
|
|
||||||
" docid = f\"doc_{i}\"\n",
|
|
||||||
" queryid = f\"q_{i}\"\n",
|
|
||||||
" \n",
|
|
||||||
" # El código es el documento (lo que el agente debe recuperar)\n",
|
|
||||||
" corpus[docid] = {\"title\": data.get(\"func_name\", \"\"), \"text\": data['code']}\n",
|
|
||||||
" \n",
|
|
||||||
" # El docstring es la consulta (lo que el usuario pide)\n",
|
|
||||||
" queries[queryid] = data['docstring']\n",
|
|
||||||
" \n",
|
|
||||||
" # Relación 1 a 1: la query i busca el código i\n",
|
|
||||||
" qrels[queryid] = {docid: 1}\n",
|
|
||||||
" \n",
|
|
||||||
" return corpus, queries, qrels"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c9528fb6",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "230aae25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"raw_dataset = load_dataset(\"google/code_x_glue_tc_nl_code_search_adv\", split=\"test\")\n",
|
|
||||||
"corpus, queries, qrels = convert_hf_to_beir(raw_dataset)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "13050d31",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen3-0.6B-emb:latest"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "514540af",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"model = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen3-0.6B-emb:latest\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"# Inicializar buscador y evaluador\n",
|
|
||||||
"retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
|
|
||||||
"evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Ejecutar recuperación\n",
|
|
||||||
"results = evaluator.retrieve(corpus, queries)\n",
|
|
||||||
"\n",
|
|
||||||
"# Evaluar métricas (NDCG, MAP, Recall, Precision)\n",
|
|
||||||
"ndcg, _map, recall, precision = evaluator.evaluate(\n",
|
|
||||||
" qrels, results, [1, 3, 5, 10]\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "5c0f9845",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Resultados para CodeXGLUE:\n",
|
|
||||||
"NDCG: {'NDCG@1': 0.94971, 'NDCG@3': 0.96956, 'NDCG@5': 0.97166, 'NDCG@10': 0.97342}\n",
|
|
||||||
"MAP: {'MAP@1': 0.94971, 'MAP@3': 0.96504, 'MAP@5': 0.9662, 'MAP@10': 0.96694}\n",
|
|
||||||
"Recall: {'Recall@1': 0.94971, 'Recall@3': 0.98251, 'Recall@5': 0.98761, 'Recall@10': 0.99297}\n",
|
|
||||||
"Precision: {'P@1': 0.94971, 'P@3': 0.3275, 'P@5': 0.19752, 'P@10': 0.0993}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(f\"Resultados para CodeXGLUE:\")\n",
|
|
||||||
"print(\"NDCG:\", ndcg)\n",
|
|
||||||
"print(\"MAP:\", _map)\n",
|
|
||||||
"print(\"Recall:\", recall)\n",
|
|
||||||
"print(\"Precision:\", precision)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c4e643ca",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen2.5:1.5b"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "5ced1c25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"model_q2 = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen2.5:1.5b\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"# Inicializar buscador y evaluador\n",
|
|
||||||
"retriever_q2 = DenseRetrievalExactSearch(model_q2, batch_size=64)\n",
|
|
||||||
"evaluator_q2 = EvaluateRetrieval(retriever_q2, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Ejecutar recuperación\n",
|
|
||||||
"results_q2 = evaluator_q2.retrieve(corpus, queries)\n",
|
|
||||||
"\n",
|
|
||||||
"# Evaluar métricas (NDCG, MAP, Recall, Precision)\n",
|
|
||||||
"ndcg_qwen_2, _map_qwen_2, recall_qwen_2, precision_qwen_2 = evaluator_q2.evaluate(\n",
|
|
||||||
" qrels, results_q2, [1, 3, 5, 10]\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"id": "6a95189e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Resultados para CodeXGLUE:\n",
|
|
||||||
"NDCG: {'NDCG@1': 0.00031, 'NDCG@3': 0.00061, 'NDCG@5': 0.00086, 'NDCG@10': 0.00118}\n",
|
|
||||||
"MAP: {'MAP@1': 0.00031, 'MAP@3': 0.00051, 'MAP@5': 0.00065, 'MAP@10': 0.00078}\n",
|
|
||||||
"Recall: {'Recall@1': 0.00031, 'Recall@3': 0.00088, 'Recall@5': 0.00151, 'Recall@10': 0.0025}\n",
|
|
||||||
"Precision: {'P@1': 0.00031, 'P@3': 0.00029, 'P@5': 0.0003, 'P@10': 0.00025}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(f\"Resultados para CodeXGLUE:\")\n",
|
|
||||||
"print(\"NDCG:\", ndcg_qwen_2)\n",
|
|
||||||
"print(\"MAP:\", _map_qwen_2)\n",
|
|
||||||
"print(\"Recall:\", recall_qwen_2)\n",
|
|
||||||
"print(\"Precision:\", precision_qwen_2)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "3dad9811",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Save data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"id": "f875dd8d",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Resultados guardados en /home/pseco/VsCodeProjects/assistance-engine/data/interim/beir_CodeXGlue_results.json\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"results_data = {\n",
|
|
||||||
" \"qwen3-0.6B-emb:latest\": {\n",
|
|
||||||
" \"NDCG\": ndcg,\n",
|
|
||||||
" \"MAP\": _map,\n",
|
|
||||||
" \"Recall\": recall,\n",
|
|
||||||
" \"Precision\": precision,\n",
|
|
||||||
" },\n",
|
|
||||||
" \"qwen2.5:1.5b\": {\n",
|
|
||||||
" \"NDCG\": ndcg_qwen_2,\n",
|
|
||||||
" \"MAP\": _map_qwen_2,\n",
|
|
||||||
" \"Recall\": recall_qwen_2,\n",
|
|
||||||
" \"Precision\": precision_qwen_2,\n",
|
|
||||||
" }\n",
|
|
||||||
"}\n",
|
|
||||||
"\n",
|
|
||||||
"output_file = \"/home/pseco/VsCodeProjects/assistance-engine/data/interim/beir_CodeXGlue_results.json\"\n",
|
|
||||||
"with open(output_file, \"w\") as f:\n",
|
|
||||||
" json.dump(results_data, f, indent=2)\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Resultados guardados en {output_file}\")"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "assistance-engine",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.12.11"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
|
|
@ -1,323 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "66cbbaf8",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Libraries"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 15,
|
|
||||||
"id": "c01c19dc",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from typing import Dict, List, Union\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from langchain_ollama import OllamaEmbeddings\n",
|
|
||||||
"from beir.datasets.data_loader import GenericDataLoader\n",
|
|
||||||
"from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
|
|
||||||
"from beir.retrieval.evaluation import EvaluateRetrieval\n",
|
|
||||||
"from beir import util\n",
|
|
||||||
"import json"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "ac011c1c",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Utils"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"id": "b83e7900",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"class BEIROllamaEmbeddings:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" def __init__(\n",
|
|
||||||
" self,\n",
|
|
||||||
" base_url: str,\n",
|
|
||||||
" model: str,\n",
|
|
||||||
" batch_size: int = 64,\n",
|
|
||||||
" ) -> None:\n",
|
|
||||||
" self.batch_size = batch_size\n",
|
|
||||||
" self.embeddings = OllamaEmbeddings(\n",
|
|
||||||
" base_url=base_url,\n",
|
|
||||||
" model=model,\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" def _batch_embed(self, texts: List[str]) -> np.ndarray:\n",
|
|
||||||
" vectors = []\n",
|
|
||||||
"\n",
|
|
||||||
" for i in range(0, len(texts), self.batch_size):\n",
|
|
||||||
" batch = texts[i : i + self.batch_size]\n",
|
|
||||||
" batch_vectors = self.embeddings.embed_documents(batch)\n",
|
|
||||||
" vectors.extend(batch_vectors)\n",
|
|
||||||
"\n",
|
|
||||||
" return np.asarray(vectors, dtype=np.float32)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR query encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" return self._batch_embed(queries)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_corpus(\n",
|
|
||||||
" self,\n",
|
|
||||||
" corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],\n",
|
|
||||||
" **kwargs,\n",
|
|
||||||
" ) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR corpus encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" if isinstance(corpus, dict):\n",
|
|
||||||
" corpus = list(corpus.values())\n",
|
|
||||||
"\n",
|
|
||||||
" texts = []\n",
|
|
||||||
" for doc in corpus:\n",
|
|
||||||
" title = (doc.get(\"title\") or \"\").strip()\n",
|
|
||||||
" text = (doc.get(\"text\") or \"\").strip()\n",
|
|
||||||
"\n",
|
|
||||||
" if title:\n",
|
|
||||||
" texts.append(f\"{title}\\n{text}\")\n",
|
|
||||||
" else:\n",
|
|
||||||
" texts.append(text)\n",
|
|
||||||
"\n",
|
|
||||||
" return self._batch_embed(texts)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "af3eb66d",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def convert_codexglue_to_beir(input_file):\n",
|
|
||||||
" corpus, queries, qrels = {}, {}, {}\n",
|
|
||||||
" with open(input_file, 'r') as f:\n",
|
|
||||||
" for i, line in enumerate(f):\n",
|
|
||||||
" data = json.loads(line)\n",
|
|
||||||
" docid = f\"doc_{i}\"\n",
|
|
||||||
" queryid = f\"q_{i}\"\n",
|
|
||||||
" \n",
|
|
||||||
" # El código es nuestro documento (Corpus)\n",
|
|
||||||
" corpus[docid] = {\"title\": \"\", \"text\": data['code']}\n",
|
|
||||||
" # El docstring es nuestra consulta (Query)\n",
|
|
||||||
" queries[queryid] = data['docstring']\n",
|
|
||||||
" # En CodeXGLUE, la consulta i corresponde al código i\n",
|
|
||||||
" qrels[queryid] = {docid: 1}\n",
|
|
||||||
" \n",
|
|
||||||
" return corpus, queries, qrels\n",
|
|
||||||
"\n",
|
|
||||||
"# Carga tus datos (ejemplo con el set de test de AdvTest)\n",
|
|
||||||
"corpus, queries, qrels = convert_codexglue_to_beir(\"test.jsonl\")\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c9528fb6",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"id": "230aae25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "1915c67ec20f4806b30b48eff9a132e2",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/5183 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"dataset=\"scifact\"\n",
|
|
||||||
"url=f\"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip\"\n",
|
|
||||||
"data_path=util.download_and_unzip(url, out_dir=\"datasets\")\n",
|
|
||||||
"corpus, queries, qrels=GenericDataLoader(data_path).load(split=\"test\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "13050d31",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen3-0.6B-emb:latest"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "514540af",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"NDCG: {'NDCG@1': 0.56333, 'NDCG@3': 0.64367, 'NDCG@5': 0.66577, 'NDCG@10': 0.68551, 'NDCG@100': 0.71285}\n",
|
|
||||||
"MAP: {'MAP@1': 0.52994, 'MAP@3': 0.6117, 'MAP@5': 0.62815, 'MAP@10': 0.6383, 'MAP@100': 0.64466}\n",
|
|
||||||
"Recall: {'Recall@1': 0.52994, 'Recall@3': 0.7035, 'Recall@5': 0.75967, 'Recall@10': 0.81611, 'Recall@100': 0.94}\n",
|
|
||||||
"Precision: {'P@1': 0.56333, 'P@3': 0.25889, 'P@5': 0.17067, 'P@10': 0.093, 'P@100': 0.0107}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"model = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen3-0.6B-emb:latest\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
|
|
||||||
"evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"results = evaluator.retrieve(corpus, queries)\n",
|
|
||||||
"ndcg, _map, recall, precision = evaluator.evaluate(\n",
|
|
||||||
" qrels, results, [1, 3, 5, 10, 100]\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"NDCG:\", ndcg)\n",
|
|
||||||
"print(\"MAP:\", _map)\n",
|
|
||||||
"print(\"Recall:\", recall)\n",
|
|
||||||
"print(\"Precision:\", precision)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c4e643ca",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen2.5:1.5b"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"id": "5ced1c25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"NDCG: {'NDCG@1': 0.02333, 'NDCG@3': 0.03498, 'NDCG@5': 0.0404, 'NDCG@10': 0.04619, 'NDCG@100': 0.07768}\n",
|
|
||||||
"MAP: {'MAP@1': 0.02083, 'MAP@3': 0.03083, 'MAP@5': 0.03375, 'MAP@10': 0.03632, 'MAP@100': 0.04123}\n",
|
|
||||||
"Recall: {'Recall@1': 0.02083, 'Recall@3': 0.04417, 'Recall@5': 0.0575, 'Recall@10': 0.07417, 'Recall@100': 0.23144}\n",
|
|
||||||
"Precision: {'P@1': 0.02333, 'P@3': 0.01556, 'P@5': 0.01267, 'P@10': 0.00833, 'P@100': 0.00277}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"model_qwen2 = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen2.5:1.5b\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"retriever_qwen_2 = DenseRetrievalExactSearch(model_qwen2, batch_size=64)\n",
|
|
||||||
"evaluator_qwen_2 = EvaluateRetrieval(retriever_qwen_2, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"results_qwen_2 = evaluator_qwen_2.retrieve(corpus, queries)\n",
|
|
||||||
"ndcg_qwen_2, _map_qwen_2, recall_qwen_2, precision_qwen_2 = evaluator_qwen_2.evaluate(\n",
|
|
||||||
" qrels, results_qwen_2, [1, 3, 5, 10, 100]\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"NDCG:\", ndcg_qwen_2)\n",
|
|
||||||
"print(\"MAP:\", _map_qwen_2)\n",
|
|
||||||
"print(\"Recall:\", recall_qwen_2)\n",
|
|
||||||
"print(\"Precision:\", precision_qwen_2)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "b9402837",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Save Data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 16,
|
|
||||||
"id": "c281d5e1",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Resultados guardados en /home/pseco/VsCodeProjects/assistance-engine/data/interim/beir_Scifact_results.json\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"results_data = {\n",
|
|
||||||
" \"qwen3-0.6B-emb:latest\": {\n",
|
|
||||||
" \"NDCG\": ndcg,\n",
|
|
||||||
" \"MAP\": _map,\n",
|
|
||||||
" \"Recall\": recall,\n",
|
|
||||||
" \"Precision\": precision,\n",
|
|
||||||
" },\n",
|
|
||||||
" \"qwen2.5:1.5b\": {\n",
|
|
||||||
" \"NDCG\": ndcg_qwen_2,\n",
|
|
||||||
" \"MAP\": _map_qwen_2,\n",
|
|
||||||
" \"Recall\": recall_qwen_2,\n",
|
|
||||||
" \"Precision\": precision_qwen_2,\n",
|
|
||||||
" }\n",
|
|
||||||
"}\n",
|
|
||||||
"\n",
|
|
||||||
"output_file = \"/home/pseco/VsCodeProjects/assistance-engine/data/interim/beir_Scifact_results.json\"\n",
|
|
||||||
"with open(output_file, \"w\") as f:\n",
|
|
||||||
" json.dump(results_data, f, indent=2)\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"Resultados guardados en {output_file}\")"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "assistance-engine",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.12.11"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
|
|
@ -1,332 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "66cbbaf8",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Libraries"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "c01c19dc",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import json\n",
|
|
||||||
"from typing import Dict, List, Union\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from datasets import load_dataset\n",
|
|
||||||
"from langchain_ollama import OllamaEmbeddings\n",
|
|
||||||
"from beir.datasets.data_loader import GenericDataLoader\n",
|
|
||||||
"from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
|
|
||||||
"from beir.retrieval.evaluation import EvaluateRetrieval\n",
|
|
||||||
"from beir import util"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "ac011c1c",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Utils"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "b83e7900",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"class BEIROllamaEmbeddings:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" def __init__(\n",
|
|
||||||
" self,\n",
|
|
||||||
" base_url: str,\n",
|
|
||||||
" model: str,\n",
|
|
||||||
" batch_size: int = 64,\n",
|
|
||||||
" ) -> None:\n",
|
|
||||||
" self.batch_size = batch_size\n",
|
|
||||||
" self.embeddings = OllamaEmbeddings(\n",
|
|
||||||
" base_url=base_url,\n",
|
|
||||||
" model=model,\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" def _batch_embed(self, texts: List[str]) -> np.ndarray:\n",
|
|
||||||
" vectors = []\n",
|
|
||||||
"\n",
|
|
||||||
" for i in range(0, len(texts), self.batch_size):\n",
|
|
||||||
" batch = texts[i : i + self.batch_size]\n",
|
|
||||||
" batch_vectors = self.embeddings.embed_documents(batch)\n",
|
|
||||||
" vectors.extend(batch_vectors)\n",
|
|
||||||
"\n",
|
|
||||||
" return np.asarray(vectors, dtype=np.float32)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR query encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" return self._batch_embed(queries)\n",
|
|
||||||
"\n",
|
|
||||||
" def encode_corpus(\n",
|
|
||||||
" self,\n",
|
|
||||||
" corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],\n",
|
|
||||||
" **kwargs,\n",
|
|
||||||
" ) -> np.ndarray:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" BEIR corpus encoder\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" if isinstance(corpus, dict):\n",
|
|
||||||
" corpus = list(corpus.values())\n",
|
|
||||||
"\n",
|
|
||||||
" texts = []\n",
|
|
||||||
" for doc in corpus:\n",
|
|
||||||
" title = (doc.get(\"title\") or \"\").strip()\n",
|
|
||||||
" text = (doc.get(\"text\") or \"\").strip()\n",
|
|
||||||
"\n",
|
|
||||||
" if title:\n",
|
|
||||||
" texts.append(f\"{title}\\n{text}\")\n",
|
|
||||||
" else:\n",
|
|
||||||
" texts.append(text)\n",
|
|
||||||
"\n",
|
|
||||||
" return self._batch_embed(texts)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c9528fb6",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Data"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "230aae25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Descargando datos de Hugging Face...\n",
|
|
||||||
"Cargando con BEIR GenericDataLoader...\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "0e67479e959248f598db3415efbb13ae",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/20604 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"dataset_name = \"cosqa\"\n",
|
|
||||||
"data_path = f\"/home/acano/PycharmProjects/assistance-engine/data/external/{dataset_name}\"\n",
|
|
||||||
"\n",
|
|
||||||
"os.makedirs(f\"{data_path}/qrels\", exist_ok=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# 1. Cargar desde Hugging Face con los nombres de configuración correctos\n",
|
|
||||||
"print(\"Descargando datos de Hugging Face...\")\n",
|
|
||||||
"hf_corpus = load_dataset(\"CoIR-Retrieval/cosqa\", \"corpus\", split=\"corpus\")\n",
|
|
||||||
"hf_queries = load_dataset(\"CoIR-Retrieval/cosqa\", \"queries\", split=\"queries\")\n",
|
|
||||||
"# Los qrels están en la config 'default'\n",
|
|
||||||
"hf_qrels = load_dataset(\"CoIR-Retrieval/cosqa\", \"default\", split=\"test\")\n",
|
|
||||||
"\n",
|
|
||||||
"# 2. Guardar Corpus\n",
|
|
||||||
"with open(f\"{data_path}/corpus.jsonl\", \"w\") as f:\n",
|
|
||||||
" for item in hf_corpus:\n",
|
|
||||||
" f.write(json.dumps({\"_id\": str(item[\"_id\"]), \"text\": item[\"text\"], \"title\": \"\"}) + \"\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"# 3. Guardar Queries\n",
|
|
||||||
"with open(f\"{data_path}/queries.jsonl\", \"w\") as f:\n",
|
|
||||||
" for item in hf_queries:\n",
|
|
||||||
" f.write(json.dumps({\"_id\": str(item[\"_id\"]), \"text\": item[\"text\"]}) + \"\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"# 4. Guardar Qrels (Formato TSV para BEIR)\n",
|
|
||||||
"with open(f\"{data_path}/qrels/test.tsv\", \"w\") as f:\n",
|
|
||||||
" f.write(\"query-id\\tcorpus-id\\tscore\\n\")\n",
|
|
||||||
" for item in hf_qrels:\n",
|
|
||||||
" # En la config 'default', los campos suelen ser 'query-id' y 'corpus-id'\n",
|
|
||||||
" f.write(f\"{item['query-id']}\\t{item['corpus-id']}\\t{item['score']}\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"Cargando con BEIR GenericDataLoader...\")\n",
|
|
||||||
"corpus, queries, qrels = GenericDataLoader(data_path).load(split=\"test\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "13050d31",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen3-0.6B-emb:latest"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "514540af",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"NDCG: {'NDCG@1': 0.174, 'NDCG@3': 0.27374, 'NDCG@5': 0.33509, 'NDCG@10': 0.39086, 'NDCG@100': 0.45099}\n",
|
|
||||||
"MAP: {'MAP@1': 0.174, 'MAP@3': 0.247, 'MAP@5': 0.2808, 'MAP@10': 0.30466, 'MAP@100': 0.31702}\n",
|
|
||||||
"Recall: {'Recall@1': 0.174, 'Recall@3': 0.352, 'Recall@5': 0.502, 'Recall@10': 0.67, 'Recall@100': 0.952}\n",
|
|
||||||
"Precision: {'P@1': 0.174, 'P@3': 0.11733, 'P@5': 0.1004, 'P@10': 0.067, 'P@100': 0.00952}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"model = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen3-0.6B-emb:latest\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
|
|
||||||
"evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"results = evaluator.retrieve(corpus, queries)\n",
|
|
||||||
"ndcg, _map, recall, precision = evaluator.evaluate(\n",
|
|
||||||
" qrels, results, [1, 3, 5, 10, 100]\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"NDCG:\", ndcg)\n",
|
|
||||||
"print(\"MAP:\", _map)\n",
|
|
||||||
"print(\"Recall:\", recall)\n",
|
|
||||||
"print(\"Precision:\", precision)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c4e643ca",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Test qwen2.5:1.5b"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"id": "5ced1c25",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"NDCG: {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0021}\n",
|
|
||||||
"MAP: {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.00043}\n",
|
|
||||||
"Recall: {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.01}\n",
|
|
||||||
"Precision: {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0001}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"model_qwen2 = BEIROllamaEmbeddings(\n",
|
|
||||||
" base_url=\"http://localhost:11434\",\n",
|
|
||||||
" model=\"qwen2.5:1.5b\",\n",
|
|
||||||
" batch_size=64,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"retriever_qwen_2 = DenseRetrievalExactSearch(model_qwen2, batch_size=64)\n",
|
|
||||||
"evaluator_qwen_2 = EvaluateRetrieval(retriever_qwen_2, score_function=\"cos_sim\")\n",
|
|
||||||
"\n",
|
|
||||||
"results_qwen_2 = evaluator_qwen_2.retrieve(corpus, queries)\n",
|
|
||||||
"ndcg_qwen_2, _map_qwen_2, recall_qwen_2, precision_qwen_2 = evaluator_qwen_2.evaluate(\n",
|
|
||||||
" qrels, results_qwen_2, [1, 3, 5, 10, 100]\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"NDCG:\", ndcg_qwen_2)\n",
|
|
||||||
"print(\"MAP:\", _map_qwen_2)\n",
|
|
||||||
"print(\"Recall:\", recall_qwen_2)\n",
|
|
||||||
"print(\"Precision:\", precision_qwen_2)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 14,
|
|
||||||
"id": "1db7d110",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Resultados guardados en /home/acano/PycharmProjects/assistance-engine/data/interim/beir_cosqa_results.json\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"results_data = {\n",
|
|
||||||
" \"qwen3-0.6B-emb:latest\": {\n",
|
|
||||||
" \"NDCG\": ndcg,\n",
|
|
||||||
" \"MAP\": _map,\n",
|
|
||||||
" \"Recall\": recall,\n",
|
|
||||||
" \"Precision\": precision,\n",
|
|
||||||
" },\n",
|
|
||||||
" \"qwen2.5:1.5b\": {\n",
|
|
||||||
" \"NDCG\": ndcg_qwen_2,\n",
|
|
||||||
" \"MAP\": _map_qwen_2,\n",
|
|
||||||
" \"Recall\": recall_qwen_2,\n",
|
|
||||||
" \"Precision\": precision_qwen_2,\n",
|
|
||||||
" }\n",
|
|
||||||
"}\n",
|
|
||||||
" \n",
|
|
||||||
"output_file = \"/home/acano/PycharmProjects/assistance-engine/data/interim/beir_cosqa_results.json\"\n",
|
|
||||||
"with open(output_file, \"w\") as f:\n",
|
|
||||||
" json.dump(results_data, f, indent=2)\n",
|
|
||||||
" \n",
|
|
||||||
"print(f\"Resultados guardados en {output_file}\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e4f8d78b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "assistance-engine",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
|
|
@ -1,289 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "096e6224",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Libraries"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "4b0853e9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/tmp/ipykernel_2931729/1845255288.py:4: DeprecationWarning: Importing SemanticSimilarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import SemanticSimilarity\n",
|
|
||||||
" from ragas.metrics import SemanticSimilarity\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# ...existing code...\n",
|
|
||||||
"from datasets import load_dataset\n",
|
|
||||||
"from ragas import EvaluationDataset, evaluate\n",
|
|
||||||
"from ragas.metrics import SemanticSimilarity\n",
|
|
||||||
"from langchain_community.embeddings import OllamaEmbeddings\n",
|
|
||||||
"import asyncio\n",
|
|
||||||
"from typing import Sequence\n",
|
|
||||||
"from ragas.embeddings.base import BaseRagasEmbedding\n",
|
|
||||||
"import os\n",
|
|
||||||
"from transformers import AutoConfig\n",
|
|
||||||
"import nltk\n",
|
|
||||||
"# ...existing code..."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "6bfe1ca0",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"nltk.download(\"punkt\", quiet=True)\n",
|
|
||||||
"\n",
|
|
||||||
"ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
|
|
||||||
"ES_INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
|
|
||||||
"HF_EMBEDDING_MODEL_NAME = os.getenv(\"HF_EMBEDDING_MODEL_NAME\")\n",
|
|
||||||
"BASE_URL = os.getenv(\"LLM_BASE_LOCAL_URL\")\n",
|
|
||||||
"MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
|
|
||||||
"\n",
|
|
||||||
"config = AutoConfig.from_pretrained(HF_EMBEDDING_MODEL_NAME)\n",
|
|
||||||
"embedding_dim = config.hidden_size"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "ea41ce0f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/tmp/ipykernel_2931729/256987240.py:1: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
|
|
||||||
" embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "8eee9390",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Similitud Aleatoria"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d7b150e5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from datasets import load_dataset\n",
|
|
||||||
"from ragas import EvaluationDataset\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def _normalize_answer(answer_value: object) -> str:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" Normalize answer values to a single string.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" if isinstance(answer_value, dict):\n",
|
|
||||||
" text_value = answer_value.get(\"text\")\n",
|
|
||||||
" if isinstance(text_value, list):\n",
|
|
||||||
" return str(text_value[0]) if text_value else \"\"\n",
|
|
||||||
" if text_value is not None:\n",
|
|
||||||
" return str(text_value)\n",
|
|
||||||
"\n",
|
|
||||||
" if isinstance(answer_value, list):\n",
|
|
||||||
" return str(answer_value[0]) if answer_value else \"\"\n",
|
|
||||||
"\n",
|
|
||||||
" return str(answer_value)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def _first_existing_key(candidates: list[str], keys: set[str]) -> str | None:\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" Return the first key present in keys from candidates.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" for candidate in candidates:\n",
|
|
||||||
" if candidate in keys:\n",
|
|
||||||
" return candidate\n",
|
|
||||||
" return None\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"ds = load_dataset(\"sentence-transformers/natural-questions\")\n",
|
|
||||||
"train_ds = ds[\"train\"]\n",
|
|
||||||
"\n",
|
|
||||||
"max_questions = min(100, len(train_ds))\n",
|
|
||||||
"train_ds = train_ds.select(range(max_questions))\n",
|
|
||||||
"\n",
|
|
||||||
"available_keys = set(train_ds.column_names)\n",
|
|
||||||
"reference_key = _first_existing_key(\n",
|
|
||||||
" [\"question\", \"query\", \"text\", \"input\"], available_keys\n",
|
|
||||||
")\n",
|
|
||||||
"response_key = _first_existing_key(\n",
|
|
||||||
" [\"answer\", \"answers\", \"response\", \"output\"], available_keys\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"if reference_key is None or response_key is None:\n",
|
|
||||||
" raise KeyError(\n",
|
|
||||||
" f\"Expected question/answer-like columns not found. \"\n",
|
|
||||||
" f\"Available columns: {train_ds.column_names}\"\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"rows = []\n",
|
|
||||||
"for row in train_ds:\n",
|
|
||||||
" rows.append(\n",
|
|
||||||
" {\n",
|
|
||||||
" \"reference\": str(row[reference_key]),\n",
|
|
||||||
" \"response\": _normalize_answer(row[response_key]),\n",
|
|
||||||
" }\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"eval_ds = EvaluationDataset.from_list(rows)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "753aab30",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"DatasetDict({\n",
|
|
||||||
" train: Dataset({\n",
|
|
||||||
" features: ['query', 'answer'],\n",
|
|
||||||
" num_rows: 100231\n",
|
|
||||||
" })\n",
|
|
||||||
"})\n",
|
|
||||||
"['query', 'answer']\n",
|
|
||||||
"{'query': 'when did richmond last play in a preliminary final', 'answer': \"Richmond Football Club Richmond began 2017 with 5 straight wins, a feat it had not achieved since 1995. A series of close losses hampered the Tigers throughout the middle of the season, including a 5-point loss to the Western Bulldogs, 2-point loss to Fremantle, and a 3-point loss to the Giants. Richmond ended the season strongly with convincing victories over Fremantle and St Kilda in the final two rounds, elevating the club to 3rd on the ladder. Richmond's first final of the season against the Cats at the MCG attracted a record qualifying final crowd of 95,028; the Tigers won by 51 points. Having advanced to the first preliminary finals for the first time since 2001, Richmond defeated Greater Western Sydney by 36 points in front of a crowd of 94,258 to progress to the Grand Final against Adelaide, their first Grand Final appearance since 1982. The attendance was 100,021, the largest crowd to a grand final since 1986. The Crows led at quarter time and led by as many as 13, but the Tigers took over the game as it progressed and scored seven straight goals at one point. They eventually would win by 48 points – 16.12 (108) to Adelaide's 8.12 (60) – to end their 37-year flag drought.[22] Dustin Martin also became the first player to win a Premiership medal, the Brownlow Medal and the Norm Smith Medal in the same season, while Damien Hardwick was named AFL Coaches Association Coach of the Year. Richmond's jump from 13th to premiers also marked the biggest jump from one AFL season to the next.\"}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(ds)\n",
|
|
||||||
"print(ds[\"train\"].column_names)\n",
|
|
||||||
"print(ds[\"train\"][0])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "6c3d4235",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# ...existing code...\n",
|
|
||||||
"class OllamaRagasEmbeddingAdapter(BaseRagasEmbedding):\n",
|
|
||||||
" \"\"\"Adaptador de LangChain Ollama a la API moderna de embeddings en Ragas.\"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" def __init__(self, base_url: str, model_name: str) -> None:\n",
|
|
||||||
" self._client = OllamaEmbeddings(base_url=base_url, model=model_name)\n",
|
|
||||||
"\n",
|
|
||||||
" def embed_text(self, text: str) -> list[float]:\n",
|
|
||||||
" return self._client.embed_query(text)\n",
|
|
||||||
"\n",
|
|
||||||
" async def aembed_text(self, text: str) -> list[float]:\n",
|
|
||||||
" return await asyncio.to_thread(self.embed_text, text)\n",
|
|
||||||
"\n",
|
|
||||||
" def embed_query(self, text: str) -> list[float]:\n",
|
|
||||||
" return self.embed_text(text)\n",
|
|
||||||
"\n",
|
|
||||||
" def embed_documents(self, texts: Sequence[str]) -> list[list[float]]:\n",
|
|
||||||
" return self._client.embed_documents(list(texts))\n",
|
|
||||||
"\n",
|
|
||||||
" async def aembed_query(self, text: str) -> list[float]:\n",
|
|
||||||
" return await self.aembed_text(text)\n",
|
|
||||||
"\n",
|
|
||||||
" async def aembed_documents(\n",
|
|
||||||
" self, texts: Sequence[str]\n",
|
|
||||||
" ) -> list[list[float]]:\n",
|
|
||||||
" return await asyncio.to_thread(self.embed_documents, texts)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"if not BASE_URL or not MODEL_NAME:\n",
|
|
||||||
" raise ValueError(\n",
|
|
||||||
" \"Faltan variables de entorno: LLM_BASE_LOCAL_URL u OLLAMA_MODEL_NAME.\"\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"embeddings = OllamaRagasEmbeddingAdapter(\n",
|
|
||||||
" base_url=BASE_URL,\n",
|
|
||||||
" model_name=MODEL_NAME,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"semantic_sim = SemanticSimilarity()\n",
|
|
||||||
"# ...existing code..."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "54aacf01",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "6a4b6e91c71d4849922f36d45f3e9f7f",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
"Evaluating: 0%| | 0/100231 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# ...existing code...\n",
|
|
||||||
"result = evaluate(\n",
|
|
||||||
" dataset=eval_ds,\n",
|
|
||||||
" metrics=[semantic_sim],\n",
|
|
||||||
" embeddings=embeddings,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"print(result)\n",
|
|
||||||
"# ...existing code...\n",
|
|
||||||
"# ...existing code..."
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "assistance-engine",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.12.11"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
|
|
@ -1,431 +0,0 @@
|
||||||
"""
|
|
||||||
Embedding Evaluation Pipeline
|
|
||||||
|
|
||||||
Evaluate embedding models across CodexGlue, CoSQA, and SciFact benchmarks.
|
|
||||||
Supports multiple embedding providers via factory methods.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict, List, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import typer
|
|
||||||
from langchain_ollama import OllamaEmbeddings
|
|
||||||
from langchain_huggingface import HuggingFaceEmbeddings
|
|
||||||
from beir.datasets.data_loader import GenericDataLoader
|
|
||||||
from beir.retrieval.evaluation import EvaluateRetrieval
|
|
||||||
from beir.retrieval.search.dense import DenseRetrievalExactSearch
|
|
||||||
from beir import util
|
|
||||||
from datasets import load_dataset
|
|
||||||
from src.config import settings
|
|
||||||
# Import embedding factory
|
|
||||||
project_root = settings.proj_root
|
|
||||||
DATASETS_ROOT = project_root / "research" / "embeddings" / "datasets"
|
|
||||||
|
|
||||||
app = typer.Typer()
|
|
||||||
|
|
||||||
|
|
||||||
def _has_local_beir_files(data_path: Path) -> bool:
|
|
||||||
"""Return True when a dataset folder already has the required BEIR files."""
|
|
||||||
required_files = [
|
|
||||||
data_path / "corpus.jsonl",
|
|
||||||
data_path / "queries.jsonl",
|
|
||||||
data_path / "qrels" / "test.tsv",
|
|
||||||
]
|
|
||||||
return all(path.exists() and path.stat().st_size > 0 for path in required_files)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_local_beir_dataset(data_path: Path) -> tuple[Dict, Dict, Dict]:
    """Read a BEIR-formatted dataset (corpus, queries, qrels) from disk."""
    loader = GenericDataLoader(str(data_path))
    return loader.load(split="test")
|
|
||||||
|
|
||||||
|
|
||||||
class BEIROllamaEmbeddings:
    """BEIR-compatible wrapper around LangChain's ``OllamaEmbeddings``.

    BEIR retrievers expect objects exposing ``encode_queries`` and
    ``encode_corpus``; this adapter translates those calls into batched
    ``embed_documents`` requests against an Ollama server.
    """

    def __init__(
        self,
        base_url: str,
        model: str,
        batch_size: int = 64,
    ) -> None:
        # Number of texts sent per embed_documents call.
        self.batch_size = batch_size
        self.embeddings = OllamaEmbeddings(base_url=base_url, model=model)

    def _batch_embed(self, texts: List[str]) -> np.ndarray:
        """Embed *texts* in batches, sanitizing NaN/inf components to 0.0."""
        sanitized = []
        for start in range(0, len(texts), self.batch_size):
            chunk = texts[start : start + self.batch_size]
            for raw_vec in self.embeddings.embed_documents(chunk):
                if isinstance(raw_vec, (list, np.ndarray)):
                    clean = np.nan_to_num(
                        np.asarray(raw_vec, dtype=np.float32),
                        nan=0.0,
                        posinf=0.0,
                        neginf=0.0,
                    )
                    sanitized.append(clean)
                else:
                    # Unexpected payload type: pass through unchanged.
                    sanitized.append(raw_vec)
        return np.asarray(sanitized, dtype=np.float32)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """BEIR query encoder; blank or non-string entries get placeholders."""
        prepared = []
        for query in queries:
            if isinstance(query, str):
                # Empty strings risk degenerate embeddings; substitute a marker.
                prepared.append(query.strip() or "[EMPTY]")
            else:
                prepared.append("[INVALID]")
        return self._batch_embed(prepared)

    def encode_corpus(
        self,
        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],
        **kwargs,
    ) -> np.ndarray:
        """BEIR corpus encoder; embeds each document as "title text"."""
        documents = list(corpus.values()) if isinstance(corpus, dict) else corpus

        texts = []
        for document in documents:
            title = (document.get("title") or "").strip()
            body = (document.get("text") or "").strip()
            # Join the non-empty parts; fall back to a placeholder so the
            # embedding backend never receives an empty string (NaN risk).
            combined = " ".join(part for part in (title, body) if part)
            texts.append(combined or "[EMPTY]")

        return self._batch_embed(texts)
|
|
||||||
|
|
||||||
|
|
||||||
class BEIRHuggingFaceEmbeddings:
    """BEIR-compatible wrapper around LangChain's ``HuggingFaceEmbeddings``.

    Mirrors ``BEIROllamaEmbeddings`` but embeds locally through a
    HuggingFace sentence-transformers model instead of an Ollama server.
    """

    def __init__(self, model: str, batch_size: int = 64) -> None:
        # Number of texts sent per embed_documents call.
        self.batch_size = batch_size
        self.embeddings = HuggingFaceEmbeddings(model_name=model)

    def _batch_embed(self, texts: List[str]) -> np.ndarray:
        """Embed *texts* in batches, replacing NaN/inf components with 0.0."""
        collected = []
        for start in range(0, len(texts), self.batch_size):
            window = texts[start : start + self.batch_size]
            for raw in self.embeddings.embed_documents(window):
                if isinstance(raw, (list, np.ndarray)):
                    collected.append(
                        np.nan_to_num(
                            np.asarray(raw, dtype=np.float32),
                            nan=0.0,
                            posinf=0.0,
                            neginf=0.0,
                        )
                    )
                else:
                    # Non-array payloads are passed through unchanged.
                    collected.append(raw)
        return np.asarray(collected, dtype=np.float32)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """BEIR query encoder; blanks and non-strings become placeholders."""
        prepared = [
            (q.strip() or "[EMPTY]") if isinstance(q, str) else "[INVALID]"
            for q in queries
        ]
        return self._batch_embed(prepared)

    def encode_corpus(
        self,
        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],
        **kwargs,
    ) -> np.ndarray:
        """BEIR corpus encoder; joins title and text per document."""
        documents = list(corpus.values()) if isinstance(corpus, dict) else corpus

        texts = []
        for document in documents:
            title = (document.get("title") or "").strip()
            body = (document.get("text") or "").strip()
            # Placeholder keeps the embedder from seeing an empty string.
            texts.append(" ".join(p for p in (title, body) if p) or "[EMPTY]")

        return self._batch_embed(texts)
|
|
||||||
|
|
||||||
|
|
||||||
def load_scifact_dataset() -> tuple[Dict, Dict, Dict]:
    """Load the SciFact BEIR benchmark, downloading it on first use."""
    DATASETS_ROOT.mkdir(parents=True, exist_ok=True)
    scifact_path = DATASETS_ROOT / "scifact"

    # Fast path: a previous run already left a complete dataset on disk.
    if _has_local_beir_files(scifact_path):
        print(" Using local SciFact dataset cache")
        return _load_local_beir_dataset(scifact_path)

    print(" SciFact dataset not found locally. Downloading...")
    url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip"
    downloaded = Path(util.download_and_unzip(url, out_dir=str(DATASETS_ROOT)))

    # Prefer the folder reported by BEIR when it matches expectations;
    # otherwise fall back to the canonical location under DATASETS_ROOT.
    if downloaded.name == "scifact" and _has_local_beir_files(downloaded):
        return _load_local_beir_dataset(downloaded)
    return _load_local_beir_dataset(scifact_path)
|
|
||||||
|
|
||||||
|
|
||||||
def load_cosqa_dataset() -> tuple[Dict, Dict, Dict]:
    """Load the CoSQA benchmark, converting it to BEIR format on first use.

    Downloads corpus/queries/qrels from HuggingFace and materializes them
    as BEIR-style files (corpus.jsonl, queries.jsonl, qrels/test.tsv) so
    later runs hit the on-disk cache.

    Returns:
        The BEIR triple ``(corpus, queries, qrels)``.
    """
    data_path = DATASETS_ROOT / "cosqa"
    if _has_local_beir_files(data_path):
        print(" Using local CoSQA dataset cache")
        return _load_local_beir_dataset(data_path)

    print(" CoSQA dataset not found locally. Downloading and preparing...")
    (data_path / "qrels").mkdir(parents=True, exist_ok=True)

    # Pull the three splits from HuggingFace.
    hf_corpus = load_dataset("CoIR-Retrieval/cosqa", "corpus", split="corpus")
    hf_queries = load_dataset("CoIR-Retrieval/cosqa", "queries", split="queries")
    hf_qrels = load_dataset("CoIR-Retrieval/cosqa", "default", split="test")

    # Save in BEIR format. Explicit UTF-8 avoids a platform-default
    # encoding (e.g. cp1252) failing on non-ASCII text, and matches the
    # encoding GenericDataLoader reads back.
    with open(data_path / "corpus.jsonl", "w", encoding="utf-8") as f:
        for item in hf_corpus:
            f.write(
                json.dumps(
                    {"_id": str(item["_id"]), "text": item["text"], "title": ""}
                )
                + "\n"
            )

    with open(data_path / "queries.jsonl", "w", encoding="utf-8") as f:
        for item in hf_queries:
            f.write(json.dumps({"_id": str(item["_id"]), "text": item["text"]}) + "\n")

    with open(data_path / "qrels" / "test.tsv", "w", encoding="utf-8") as f:
        f.write("query-id\tcorpus-id\tscore\n")
        for item in hf_qrels:
            f.write(f"{item['query-id']}\t{item['corpus-id']}\t{item['score']}\n")

    return _load_local_beir_dataset(data_path)
|
|
||||||
|
|
||||||
|
|
||||||
def load_codexglue_dataset() -> tuple[Dict, Dict, Dict]:
    """Load the CodexGlue NL->code retrieval benchmark in BEIR format.

    Each row of the HuggingFace test split yields one (query, document)
    pair: the docstring is the query and the code is its single relevant
    document, so qrels map ``q_i -> doc_i`` with score 1.

    Returns:
        The BEIR triple ``(corpus, queries, qrels)``.
    """
    data_path = DATASETS_ROOT / "codexglue"
    if _has_local_beir_files(data_path):
        print(" Using local CodexGlue dataset cache")
        return _load_local_beir_dataset(data_path)

    print(" CodexGlue dataset not found locally. Downloading and preparing...")
    (data_path / "qrels").mkdir(parents=True, exist_ok=True)

    raw_dataset = load_dataset("google/code_x_glue_tc_nl_code_search_adv", split="test")

    # Explicit UTF-8: source code and docstrings routinely contain
    # non-ASCII characters, which would crash under a cp1252 default.
    with open(data_path / "corpus.jsonl", "w", encoding="utf-8") as corpus_file:
        for i, data in enumerate(raw_dataset):
            docid = f"doc_{i}"
            corpus_file.write(
                json.dumps(
                    {
                        "_id": docid,
                        "title": data.get("func_name", ""),
                        "text": data["code"],
                    }
                )
                + "\n"
            )

    with open(data_path / "queries.jsonl", "w", encoding="utf-8") as query_file:
        for i, data in enumerate(raw_dataset):
            queryid = f"q_{i}"
            query_file.write(
                json.dumps({"_id": queryid, "text": data["docstring"]}) + "\n"
            )

    with open(data_path / "qrels" / "test.tsv", "w", encoding="utf-8") as qrels_file:
        qrels_file.write("query-id\tcorpus-id\tscore\n")
        for i, _ in enumerate(raw_dataset):
            qrels_file.write(f"q_{i}\tdoc_{i}\t1\n")

    return _load_local_beir_dataset(data_path)
|
|
||||||
|
|
||||||
|
|
||||||
# Registry mapping a benchmark name (as given on the CLI) to its loader.
# Each loader returns the BEIR triple (corpus, queries, qrels).
BENCHMARK_LOADERS = {
    "scifact": load_scifact_dataset,
    "cosqa": load_cosqa_dataset,
    "codexglue": load_codexglue_dataset,
}
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate_model_on_benchmark(
    benchmark: str, provider: str, model: str, k_values: Union[List[int], None] = None
) -> Dict[str, Any]:
    """Run a full BEIR retrieval evaluation for one model on one benchmark.

    Args:
        benchmark: Key into ``BENCHMARK_LOADERS`` (scifact, cosqa, codexglue).
        provider: Embedding backend, ``"ollama"`` or ``"huggingface"``.
        model: Model identifier understood by the chosen provider.
        k_values: Cutoffs for the metrics; defaults to ``[1, 5, 10, 100]``.
            (Annotation fixed: the old ``List[int] = None`` was an implicit
            Optional, disallowed by PEP 484.)

    Returns:
        Dict with "NDCG", "MAP", "Recall", "Precision" metric dictionaries.

    Raises:
        ValueError: If *provider* is not a supported backend.
    """
    if k_values is None:
        k_values = [1, 5, 10, 100]

    print(f" Loading {benchmark.upper()} dataset...")
    corpus, queries, qrels = BENCHMARK_LOADERS[benchmark]()

    print(f" Corpus: {len(corpus)}, Queries: {len(queries)}")

    # Select adapter based on provider
    if provider == "ollama":
        adapter = BEIROllamaEmbeddings(
            base_url=settings.ollama_local_url,
            model=model,
            batch_size=64,
        )
    elif provider == "huggingface":
        adapter = BEIRHuggingFaceEmbeddings(model=model, batch_size=64)
    else:
        raise ValueError(f"Unknown provider: {provider}")

    retriever = DenseRetrievalExactSearch(adapter, batch_size=64)
    evaluator = EvaluateRetrieval(retriever, score_function="cos_sim")

    print(" Running retrieval...")
    results = evaluator.retrieve(corpus, queries)

    print(" Computing metrics...")
    ndcg, _map, recall, precision = evaluator.evaluate(qrels, results, k_values)

    return {"NDCG": ndcg, "MAP": _map, "Recall": recall, "Precision": precision}
|
|
||||||
|
|
||||||
|
|
||||||
def parse_model_spec(model_spec: str) -> tuple[str, str]:
    """
    Split a model spec into ``(provider, model_name)``.

    The spec format is "provider:model_name" (default provider: ollama).
    When the prefix before ":" is not a recognized provider, the whole
    string is treated as an Ollama model, so tags such as "bge-m3:latest"
    pass through untouched.
    """
    provider, sep, remainder = model_spec.partition(":")
    if sep and provider.lower() in ("ollama", "openai", "huggingface", "bedrock"):
        return provider.lower(), remainder
    return "ollama", model_spec
|
|
||||||
|
|
||||||
|
|
||||||
def _spec_to_filename_part(spec: str) -> str:
    """Make a model/benchmark spec safe for use inside a file name."""
    # Model specs may contain "/" (HuggingFace repo ids) and ":" (Ollama
    # tags / provider prefixes); "/" would make open() target a
    # nonexistent subdirectory and ":" is illegal in Windows file names.
    return spec.replace("/", "-").replace(":", "-")


def evaluate_models(
    models: List[str], benchmarks: List[str], output_folder: Path, k_values: List[int]
) -> None:
    """Evaluate multiple models on multiple benchmarks and save all metrics.

    Failures on a single (model, benchmark) pair are printed and skipped so
    one broken combination does not abort the whole sweep. Results for all
    pairs are written to a single JSON file under *output_folder*.
    """
    output_folder.mkdir(parents=True, exist_ok=True)
    all_results = {}

    for model_spec in models:
        provider, model_name = parse_model_spec(model_spec)
        print(f"\n{'='*60}\nModel: {model_name} ({provider})\n{'='*60}")

        model_results = {}
        for benchmark in benchmarks:
            if benchmark not in BENCHMARK_LOADERS:
                print(f"✗ Unknown benchmark: {benchmark}")
                continue

            print(f"\nEvaluating on {benchmark}...")
            try:
                metrics = evaluate_model_on_benchmark(
                    benchmark, provider, model_name, k_values=k_values
                )
                model_results[benchmark] = metrics
                print("✓ Complete")
            except Exception as e:
                print(f"✗ Error: {e}")
                import traceback
                traceback.print_exc()

        all_results[model_spec] = model_results

    # Sanitize the specs before building the result path (see helper above).
    safe_models = "_".join(_spec_to_filename_part(m) for m in models)
    safe_benchmarks = "_".join(_spec_to_filename_part(b) for b in benchmarks)
    output_file = output_folder / f"results_{safe_models}_{safe_benchmarks}.json"
    print(f"\n{'='*60}\nSaving to {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)
    print("✓ Done")
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
def main(
    models: List[str] = typer.Option(
        None,
        "--model",
        "-m",
        help="Model spec (format: 'provider:model' or just 'model' for Ollama). "
        "Providers: ollama, huggingface. Can specify multiple times. "
        "Default: huggingface:sentence-transformers/all-MiniLM-L6-v2",
    ),
    benchmarks: List[str] = typer.Option(
        None,
        "--benchmark",
        "-b",
        help="Benchmark name (scifact, cosqa, codexglue). Default: all three",
    ),
    output_folder: Path = typer.Option(
        Path("research/embedding_eval_results"),
        "--output",
        "-o",
        help="Output folder for results.",
    ),
    k_values: str = typer.Option(
        "1,5,10,100",
        "--k-values",
        "-k",
        help="Comma-separated k values for metrics.",
    ),
) -> None:
    """
    Evaluate embedding models on CodexGlue, CoSQA, and SciFact benchmarks.

    Examples:
        # HuggingFace model (no Ollama required)
        python evaluate_embeddings_pipeline.py

        # Different HuggingFace model
        python evaluate_embeddings_pipeline.py -m huggingface:sentence-transformers/bge-small-en-v1.5

        # Ollama model
        python evaluate_embeddings_pipeline.py -m ollama:qwen:embeddings

        # Multiple models and single benchmark
        python evaluate_embeddings_pipeline.py -m huggingface:all-MiniLM-L6-v2 -m ollama:bge-m3 -b scifact -o ./results
    """
    # NOTE(review): the fallback models below are Ollama tags, which does not
    # match the --model help text's claimed HuggingFace default — confirm
    # which default is intended.
    if not models:
        models = ["bge-m3:latest", "qwen3-0.6B-emb:latest"]

    # Default to running every registered benchmark.
    if not benchmarks:
        benchmarks = ["scifact", "cosqa", "codexglue"]

    # Parse "--k-values 1,5,10" into a list of ints for the evaluator.
    k_list = [int(k.strip()) for k in k_values.split(",")]

    evaluate_models(models=models, benchmarks=benchmarks, output_folder=output_folder, k_values=k_list)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: launch the Typer CLI.
if __name__ == "__main__":
    app()
|
|
||||||
|
|
@ -1,99 +0,0 @@
|
||||||
{
|
|
||||||
"language": "avap",
|
|
||||||
"version": "2.0",
|
|
||||||
"file_extensions": [".avap"],
|
|
||||||
|
|
||||||
"lexer": {
|
|
||||||
"string_delimiters": ["\"", "'"],
|
|
||||||
"escape_char": "\\",
|
|
||||||
"comment_line": ["///", "//"],
|
|
||||||
"comment_block": { "open": "/*", "close": "*/" },
|
|
||||||
"line_oriented": true
|
|
||||||
},
|
|
||||||
|
|
||||||
"blocks": [
|
|
||||||
{
|
|
||||||
"name": "function",
|
|
||||||
"doc_type": "code",
|
|
||||||
"opener_pattern": "^\\s*function\\s+(\\w+)\\s*\\(([^)]*)",
|
|
||||||
"closer_pattern": "^\\s*\\}\\s*$",
|
|
||||||
"extract_signature": true,
|
|
||||||
"signature_template": "function {group1}({group2})"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "if",
|
|
||||||
"doc_type": "code",
|
|
||||||
"opener_pattern": "^\\s*if\\s*\\(",
|
|
||||||
"closer_pattern": "^\\s*end\\s*\\(\\s*\\)",
|
|
||||||
"note": "Closer is end(). The else() marker is an inline separator within the if block, not a block opener."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "startLoop",
|
|
||||||
"doc_type": "code",
|
|
||||||
"opener_pattern": "^\\s*startLoop\\s*\\(",
|
|
||||||
"closer_pattern": "^\\s*endLoop\\s*\\(\\s*\\)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "try",
|
|
||||||
"doc_type": "code",
|
|
||||||
"opener_pattern": "^\\s*try\\s*\\(\\s*\\)",
|
|
||||||
"closer_pattern": "^\\s*end\\s*\\(\\s*\\)",
|
|
||||||
"note": "try() closes with end(), same as if(). The exception() command is a statement within the try block."
|
|
||||||
}
|
|
||||||
],
|
|
||||||
|
|
||||||
"statements": [
|
|
||||||
{ "name": "registerEndpoint", "pattern": "^\\s*registerEndpoint\\s*\\(" },
|
|
||||||
{ "name": "addVar", "pattern": "^\\s*addVar\\s*\\(" },
|
|
||||||
{ "name": "addResult", "pattern": "^\\s*addResult\\s*\\(" },
|
|
||||||
{ "name": "addParam", "pattern": "^\\s*addParam\\s*\\(" },
|
|
||||||
{ "name": "getQueryParamList", "pattern": "^\\s*getQueryParamList\\s*\\(" },
|
|
||||||
{ "name": "getListLen", "pattern": "^\\s*getListLen\\s*\\(" },
|
|
||||||
{ "name": "itemFromList", "pattern": "^\\s*itemFromList\\s*\\(" },
|
|
||||||
{ "name": "variableToList", "pattern": "^\\s*variableToList\\s*\\(" },
|
|
||||||
{ "name": "variableFromJSON", "pattern": "^\\s*variableFromJSON\\s*\\(" },
|
|
||||||
{ "name": "addVariableToJSON", "pattern": "^\\s*AddvariableToJSON\\s*\\(|^\\s*AddVariableToJSON\\s*\\(", "note": "init.sql uses AddvariableToJSON (lowercase v). Both casings are accepted." },
|
|
||||||
{ "name": "RequestGet", "pattern": "^\\s*\\w+\\s*=\\s*RequestGet\\s*\\(|^\\s*RequestGet\\s*\\(" },
|
|
||||||
{ "name": "RequestPost", "pattern": "^\\s*\\w+\\s*=\\s*RequestPost\\s*\\(|^\\s*RequestPost\\s*\\(" },
|
|
||||||
{ "name": "ormDirect", "pattern": "^\\s*\\w+\\s*=\\s*ormDirect\\s*\\(|^\\s*ormDirect\\s*\\(" },
|
|
||||||
{ "name": "orm_command", "pattern": "^\\s*(ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" },
|
|
||||||
{ "name": "exception", "pattern": "^\\s*exception\\s*\\(|^\\s*\\w+\\s*=\\s*exception\\s*\\(", "note": "exception() appears inside try blocks as an error capture statement. Can be used as: exception(var) or var = exception(...)" },
|
|
||||||
{ "name": "else", "pattern": "^\\s*else\\s*\\(\\s*\\)", "note": "else() is a flow separator marker inside if() blocks. Not a block opener — the parser handles branching at the if() level." },
|
|
||||||
{ "name": "end", "pattern": "^\\s*end\\s*\\(\\s*\\)", "note": "end() closes if() and try() blocks. Handled by the block closer_pattern of those blocks. Listed here as a fallback for standalone end() statements." },
|
|
||||||
{ "name": "endLoop", "pattern": "^\\s*endLoop\\s*\\(\\s*\\)", "note": "endLoop() closes startLoop() blocks. Listed here as a fallback for standalone endLoop() statements." },
|
|
||||||
{ "name": "encodeSHA256", "pattern": "^\\s*\\w+\\s*=\\s*encodeSHA256\\s*\\(|^\\s*encodeSHA256\\s*\\(" },
|
|
||||||
{ "name": "encodeMD5", "pattern": "^\\s*\\w+\\s*=\\s*encodeMD5\\s*\\(|^\\s*encodeMD5\\s*\\(" },
|
|
||||||
{ "name": "randomString", "pattern": "^\\s*randomString\\s*\\(" },
|
|
||||||
{ "name": "replace", "pattern": "^\\s*replace\\s*\\(" },
|
|
||||||
{ "name": "getRegex", "pattern": "^\\s*\\w+\\s*=\\s*getRegex\\s*\\(|^\\s*getRegex\\s*\\(" },
|
|
||||||
{ "name": "getDateTime", "pattern": "^\\s*\\w+\\s*=\\s*getDateTime\\s*\\(|^\\s*getDateTime\\s*\\(" },
|
|
||||||
{ "name": "getTimeStamp", "pattern": "^\\s*\\w+\\s*=\\s*getTimeStamp\\s*\\(|^\\s*getTimeStamp\\s*\\(" },
|
|
||||||
{ "name": "stampToDatetime", "pattern": "^\\s*\\w+\\s*=\\s*stampToDatetime\\s*\\(|^\\s*stampToDatetime\\s*\\(" },
|
|
||||||
{ "name": "async_command", "pattern": "^\\s*\\w+\\s*=\\s*go\\s+\\w+\\s*\\(|^\\s*gather\\s*\\(" },
|
|
||||||
{ "name": "connector", "pattern": "^\\s*\\w+\\s*=\\s*avapConnector\\s*\\(" },
|
|
||||||
{ "name": "return", "pattern": "^\\s*return\\s+\\S" },
|
|
||||||
{ "name": "modularity", "pattern": "^\\s*(import|include)\\s+" },
|
|
||||||
{ "name": "assignment", "pattern": "^\\s*\\w+\\s*=\\s*" }
|
|
||||||
],
|
|
||||||
|
|
||||||
"semantic_tags": [
|
|
||||||
{ "tag": "uses_orm", "pattern": "\\b(ormDirect|ormCheckTable|ormCreateTable|ormAccessSelect|ormAccessInsert|ormAccessUpdate)\\s*\\(" },
|
|
||||||
{ "tag": "uses_http", "pattern": "\\b(RequestPost|RequestGet)\\s*\\(" },
|
|
||||||
{ "tag": "uses_connector", "pattern": "\\bavapConnector\\s*\\(" },
|
|
||||||
{ "tag": "uses_async", "pattern": "\\bgo\\s+\\w+\\s*\\(|\\bgather\\s*\\(" },
|
|
||||||
{ "tag": "uses_crypto", "pattern": "\\b(encodeSHA256|encodeMD5)\\s*\\(" },
|
|
||||||
{ "tag": "uses_auth", "pattern": "\\b(addParam|_status)\\b" },
|
|
||||||
{ "tag": "uses_error_handling", "pattern": "\\btry\\s*\\(\\s*\\)" },
|
|
||||||
{ "tag": "uses_exception", "pattern": "\\bexception\\s*\\(" },
|
|
||||||
{ "tag": "uses_loop", "pattern": "\\bstartLoop\\s*\\(" },
|
|
||||||
{ "tag": "uses_conditional", "pattern": "\\bif\\s*\\(" },
|
|
||||||
{ "tag": "uses_json", "pattern": "\\b(variableFromJSON|AddvariableToJSON|AddVariableToJSON)\\s*\\(" },
|
|
||||||
{ "tag": "uses_list", "pattern": "\\b(variableToList|itemFromList|getListLen)\\s*\\(" },
|
|
||||||
{ "tag": "uses_regex", "pattern": "\\bgetRegex\\s*\\(" },
|
|
||||||
{ "tag": "uses_datetime", "pattern": "\\b(getDateTime|getTimeStamp|stampToDatetime)\\s*\\(" },
|
|
||||||
{ "tag": "uses_string_ops", "pattern": "\\b(randomString|replace|encodeSHA256|encodeMD5)\\s*\\(" },
|
|
||||||
{ "tag": "uses_return", "pattern": "^\\s*return\\s+\\S" },
|
|
||||||
{ "tag": "returns_result", "pattern": "\\baddResult\\s*\\(" },
|
|
||||||
{ "tag": "registers_endpoint", "pattern": "\\bregisterEndpoint\\s*\\(" }
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,341 +0,0 @@
|
||||||
# AVAP MAP-Elites Dataset Pipeline — Documentación
|
|
||||||
|
|
||||||
> Scripts para la síntesis de datasets de benchmarks AVAP mediante cobertura de gramática garantizada y priors estadísticos extraídos de código real de producción.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Índice
|
|
||||||
|
|
||||||
- [Visión general del sistema](#visión-general-del-sistema)
|
|
||||||
- [Requisitos e instalación](#requisitos-e-instalación)
|
|
||||||
- [construct_prior.py](#construct_priorpy)
|
|
||||||
- [Uso](#uso-construct_prior)
|
|
||||||
- [Cómo funciona](#cómo-funciona-construct_prior)
|
|
||||||
- [generate_mbap_v2.py](#generate_mbap_v2py)
|
|
||||||
- [Uso](#uso-generate_mbap_v2)
|
|
||||||
- [Cómo funciona](#cómo-funciona-generate_mbap_v2)
|
|
||||||
- [Flujo de trabajo recomendado](#flujo-de-trabajo-recomendado)
|
|
||||||
- [Archivos generados](#archivos-generados)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Visión general del sistema
|
|
||||||
|
|
||||||
El pipeline consta de dos scripts que trabajan en conjunto:
|
|
||||||
|
|
||||||
```
|
|
||||||
construct_prior.py → construct_map.yaml
|
|
||||||
↓ ↓
|
|
||||||
generate_mbap_v2.py ←─────────────────┘
|
|
||||||
↓
|
|
||||||
                        dataset.json + coverage_stats.json
|
|
||||||
```
|
|
||||||
|
|
||||||
1. **`construct_prior.py`** analiza codebases reales en GitHub para extraer con qué frecuencia co-ocurren los 38 comandos AVAP en código de producción real. El resultado es un fichero `construct_map.yaml` con pesos estadísticos.
|
|
||||||
|
|
||||||
2. **`generate_mbap_v2.py`** usa esos pesos para dirigir un generador MAP-Elites que llama a la API de Claude, garantizando cobertura uniforme de todas las combinaciones de pares y tríos del DSL AVAP.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Requisitos e instalación
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install requests pyyaml anthropic
|
|
||||||
pip install datasets huggingface_hub # solo si usas --prior-sources huggingface
|
|
||||||
|
|
||||||
export ANTHROPIC_API_KEY=sk-ant-...
|
|
||||||
export GITHUB_TOKEN=ghp_... # opcional pero recomendado
|
|
||||||
```
|
|
||||||
|
|
||||||
El parser AVAP debe estar en ejecución si se desea validación AST real (opcional; si no está disponible, se usa keyword scanning como fallback):
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# El parser debe escuchar en el puerto configurado (por defecto 8080)
|
|
||||||
# Ejemplo: http://localhost:8080
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## `construct_prior.py`
|
|
||||||
|
|
||||||
### Uso (construct_prior)
|
|
||||||
|
|
||||||
El script tiene dos modos de operación: **generación** del mapa y **verificación** de uno existente.
|
|
||||||
|
|
||||||
#### Generar `construct_map.yaml` (ejecutar una vez, o cuando cambie la gramática AVAP)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Ejecución básica (sin token: límite de 10 req/min en GitHub)
|
|
||||||
python construct_prior.py --generate-map
|
|
||||||
|
|
||||||
# Con token de GitHub (recomendado — 30 req/min)
|
|
||||||
python construct_prior.py --generate-map --github-token ghp_...
|
|
||||||
|
|
||||||
# Analizar más ficheros para un prior más rico
|
|
||||||
python construct_prior.py --generate-map --max-files 200 --github-token ghp_...
|
|
||||||
|
|
||||||
# Ruta de salida personalizada
|
|
||||||
python construct_prior.py --generate-map --output /ruta/construct_map.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Verificar un `construct_map.yaml` existente
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python construct_prior.py --verify --map construct_map.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Uso como módulo desde `generate_mbap_v2.py`
|
|
||||||
|
|
||||||
```python
|
|
||||||
from construct_prior import ConstructPrior
|
|
||||||
|
|
||||||
# Cargar desde YAML generado
|
|
||||||
prior = ConstructPrior.from_yaml("construct_map.yaml")
|
|
||||||
|
|
||||||
# Consultar el peso de una celda (par o trío de comandos AVAP)
|
|
||||||
w = prior.cell_weight(frozenset({"try", "ormAccessSelect"}))
|
|
||||||
|
|
||||||
# Fallback estático (sin necesidad de YAML)
|
|
||||||
prior = ConstructPrior.static_fallback()
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parámetros CLI
|
|
||||||
|
|
||||||
| Parámetro | Tipo | Por defecto | Descripción |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `--generate-map` | flag | — | Activa el modo de extracción desde GitHub |
|
|
||||||
| `--verify` | flag | — | Carga y muestra estadísticas de un YAML existente |
|
|
||||||
| `--github-token` | str | `$GITHUB_TOKEN` | Token de acceso personal de GitHub |
|
|
||||||
| `--max-files` | int | `100` | Número máximo de ficheros a analizar |
|
|
||||||
| `--output` | str | `construct_map.yaml` | Ruta de salida del YAML |
|
|
||||||
| `--map` | str | `construct_map.yaml` | Ruta del YAML a verificar (solo con `--verify`) |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Cómo funciona (construct_prior)
|
|
||||||
|
|
||||||
El objetivo del script es responder a la pregunta: **¿qué combinaciones de comandos AVAP aparecen juntas con más frecuencia en código API real de producción?** Esa información sirve para sesgar el generador de datasets hacia patrones realistas.
|
|
||||||
|
|
||||||
#### 1. Vocabulario AVAP (`AVAP_NODE_NAMES`)
|
|
||||||
|
|
||||||
Se define una lista canónica de 38 comandos AVAP organizados por categoría (variables, ORM, HTTP, criptografía, concurrencia, etc.). Esta lista es la única fuente de verdad del módulo y es importada también por `generate_mbap_v2.py`.
|
|
||||||
|
|
||||||
#### 2. Tabla de equivalencias de lenguaje (`LANGUAGE_MAPPINGS`)
|
|
||||||
|
|
||||||
Cada comando AVAP se mapea a sus equivalentes en Python (detección por AST), Go (keywords) y SQL. Esta tabla, verificada manualmente, define *qué buscar* al escanear los codebases. Por ejemplo:
|
|
||||||
|
|
||||||
- `ormAccessSelect` → llamadas como `.fetchall()`, `.query()`, `.filter()`
|
|
||||||
- `RequestPost` → `requests.post`, `httpx.post`, `session.post`
|
|
||||||
- `try` → nodo `ast.Try` en el AST de Python
|
|
||||||
|
|
||||||
#### 3. Extracción desde GitHub (`GitHubFetcher`)
|
|
||||||
|
|
||||||
Se lanzan 16 queries predefinidas contra la GitHub Code Search API, cada una orientada a un patrón típico de microservicio (ORM + manejo de errores, clientes HTTP, autenticación con crypto, concurrencia async, JSON, fechas, etc.). Los ficheros encontrados se descargan en base64 y se decodifican en memoria.
|
|
||||||
|
|
||||||
El fetcher respeta automáticamente el rate limit de la API (10 req/min sin token, 30 req/min con token) con esperas adaptativas entre peticiones.
|
|
||||||
|
|
||||||
#### 4. Detección AST de Python (`PythonASTDetector`)
|
|
||||||
|
|
||||||
Cada fichero descargado se parsea con el módulo estándar `ast` de Python. El walker del AST recorre todos los nodos y detecta qué comandos AVAP están presentes según su equivalente estructural:
|
|
||||||
|
|
||||||
- `ast.Try` → `"try"` + `"exception"`
|
|
||||||
- `ast.FunctionDef` → `"function"` + `"addParam"` (si tiene argumentos)
|
|
||||||
- `ast.For` / `ast.AsyncFor` → `"startLoop"`
|
|
||||||
- `ast.Call` con callee `.fetchall` → `"ormAccessSelect"`
|
|
||||||
- etc.
|
|
||||||
|
|
||||||
Este enfoque es **AST-level**, no keyword scanning, lo que elimina falsos positivos por nombres de variable, strings o comentarios. Si el fichero tiene errores de sintaxis, cae automáticamente a un keyword fallback.
|
|
||||||
|
|
||||||
#### 5. Acumulación de co-ocurrencias (`CooccurrenceExtractor`)
|
|
||||||
|
|
||||||
Por cada fichero analizado se obtiene un conjunto de comandos AVAP detectados. Se calculan todas las combinaciones de pares y tríos posibles de ese conjunto y se incrementan sus contadores. Por ejemplo, si un fichero contiene `{try, ormAccessSelect, return}`, se incrementan los contadores de `(try, ormAccessSelect)`, `(try, return)`, `(ormAccessSelect, return)` y el trío `(try, ormAccessSelect, return)`.
|
|
||||||
|
|
||||||
#### 6. Normalización y escritura del YAML (`generate_construct_map`)
|
|
||||||
|
|
||||||
Los contadores de co-ocurrencia se normalizan a `[0, 1]` dividiéndolos por el máximo observado. El resultado se escribe en `construct_map.yaml` con dos secciones:
|
|
||||||
|
|
||||||
- `language_mappings` — la tabla de equivalencias (trazabilidad)
|
|
||||||
- `pair_weights` / `trio_weights` — pesos empíricos extraídos
|
|
||||||
|
|
||||||
#### 7. Propagación de pesos a subsets (`_propagate_subset_weights`)
|
|
||||||
|
|
||||||
Los tríos que contienen pares con alto peso heredan el 60% del peso del par más relevante que contienen. Esto garantiza que los tríos formados por pares comunes sean visitados antes que tríos de combinaciones raras.
|
|
||||||
|
|
||||||
#### 8. Fallback estático
|
|
||||||
|
|
||||||
Si no se dispone de conexión a GitHub o de un YAML previo, `ConstructPrior.static_fallback()` retorna un conjunto de 40+ pesos hard-coded basados en conocimiento experto (por ejemplo, `(try, exception)` = 1.0, `(function, return)` = 0.98).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## `generate_mbap_v2.py`
|
|
||||||
|
|
||||||
### Uso (generate_mbap_v2)
|
|
||||||
|
|
||||||
#### Invocación básica
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Modo por defecto: MAP-Elites + ConstructPrior (Candidato F)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --parser http://localhost:8080
|
|
||||||
|
|
||||||
# Generar 5000 ejemplos con celdas de hasta 3 comandos
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --parser http://localhost:8080 \
|
|
||||||
--problems 5000 --cell-size 3 --mode map-elites-prior
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Opciones del prior
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Usar datos reales de GitHub para el prior (AST-level, requiere red)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites-prior \
|
|
||||||
--prior-sources github
|
|
||||||
|
|
||||||
# Guardar el prior tras la extracción (para reutilizarlo después)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites-prior \
|
|
||||||
--prior-sources github --prior-save prior_weights.json
|
|
||||||
|
|
||||||
# Cargar un prior pre-extraído (sin peticiones a GitHub)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites-prior \
|
|
||||||
--prior-load prior_weights.json
|
|
||||||
|
|
||||||
# Usar el prior generado por construct_prior.py
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites-prior \
|
|
||||||
--prior-map construct_map.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Modos de generación
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Candidato F: MAP-Elites + ConstructPrior (por defecto, recomendado)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites-prior
|
|
||||||
|
|
||||||
# Candidato E: MAP-Elites con pesos uniformes (baseline sin prior)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode map-elites
|
|
||||||
|
|
||||||
# Candidato A: CW-Reward pool (no implementado en v2, usar generate_mbap.py)
|
|
||||||
python generate_mbap_v2.py --lrm avap.md --mode reward
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Parámetros CLI completos
|
|
||||||
|
|
||||||
| Parámetro | Tipo | Por defecto | Descripción |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `--lrm` | str | `avap.md` | Ruta al Language Reference Manual de AVAP |
|
|
||||||
| `--output` | str | `output/mbpp_avap_v2.json` | Ruta del dataset de salida |
|
|
||||||
| `--problems` | int | `5000` | Número de ejemplos a generar |
|
|
||||||
| `--parser` | str | `http://localhost:8080` | URL del parser AVAP |
|
|
||||||
| `--cell-size` | int | `3` | Tamaño máximo de celda: 2=solo pares, 3=pares+tríos |
|
|
||||||
| `--quality-threshold` | float | `0.80` | Calidad mínima para considerar una celda "buena" |
|
|
||||||
| `--alpha` | float | `0.30` | Peso de los comandos bonus en la calidad |
|
|
||||||
| `--beta` | float | `0.20` | Peso de la calidad de los tests en la calidad |
|
|
||||||
| `--gamma` | float | `0.10` | Peso de la riqueza del código en la calidad |
|
|
||||||
| `--mode` | choice | `map-elites-prior` | Modo de generación (ver arriba) |
|
|
||||||
| `--prior-map` | str | `construct_map.yaml` | Ruta al YAML generado por `construct_prior.py` |
|
|
||||||
| `--prior-epsilon` | float | `0.05` | Peso mínimo para celdas cola (tail cells) |
|
|
||||||
| `--prior-phase3-threshold` | float | `0.70` | Calidad a partir de la cual se activa la Fase 3 (tail) |
|
|
||||||
| `--api-key` | str | `$ANTHROPIC_API_KEY` | API key de Anthropic |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Cómo funciona (generate_mbap_v2)
|
|
||||||
|
|
||||||
El generador implementa un algoritmo de **Quality-Diversity (QD)** llamado MAP-Elites aplicado a la generación de ejemplos de código AVAP. El objetivo es producir un dataset que cubra de forma garantizada todas las combinaciones de pares y tríos de los 38 comandos del DSL, sin sesgos de distribución.
|
|
||||||
|
|
||||||
#### 1. Vocabulario y detección de constructs (`AVAP_NODE_TYPES`, `CellValidator`)
|
|
||||||
|
|
||||||
Se define un diccionario que mapea cada comando AVAP a sus patrones de keyword. `CellValidator` detecta qué comandos están presentes en un ejemplo generado usando dos estrategias:
|
|
||||||
|
|
||||||
- **Desde el AST del parser AVAP**: se recorre el árbol recursivamente buscando nodos por `type`.
|
|
||||||
- **Desde el código fuente** (fallback): se buscan los patrones de keyword del diccionario. `if_mode2` se comprueba antes que `if_mode1` para evitar ambigüedad.
|
|
||||||
|
|
||||||
Además calcula una **puntuación de calidad** compuesta:
|
|
||||||
|
|
||||||
```
|
|
||||||
quality = fidelity
|
|
||||||
+ alpha * bonus_constructs_ratio
|
|
||||||
+ beta * test_quality
|
|
||||||
+ gamma * code_richness
|
|
||||||
```
|
|
||||||
|
|
||||||
- `fidelity`: fracción de los constructs requeridos por la celda que están presentes (componente principal).
|
|
||||||
- `bonus_ratio`: constructs adicionales más allá de los requeridos.
|
|
||||||
- `test_quality`: proporción de tests con patrón `re.match()` y longitud > 10.
|
|
||||||
- `richness`: número de líneas normalizadas a 30 (proxy de complejidad).
|
|
||||||
|
|
||||||
#### 2. Mapa de cobertura MAP-Elites (`CoverageMap`)
|
|
||||||
|
|
||||||
Es la estructura central del algoritmo. Mantiene **una celda por cada combinación posible de 2 o 3 comandos AVAP** (con `cell_size=3`: 703 pares + N tríos de un total de 38 comandos). Cada celda almacena el mejor ejemplo encontrado hasta el momento para esa combinación.
|
|
||||||
|
|
||||||
Una celda solo se considera válida si el ejemplo que contiene usa **todos** los constructs de su clave. El mapa expone métricas en tiempo real: tasa de llenado, entropía de distribución (Shannon), y celdas de baja calidad.
|
|
||||||
|
|
||||||
#### 3. Selector de celdas — Candidato E (`CellSelector`)
|
|
||||||
|
|
||||||
Implementa la estrategia de selección sin prior en tres fases:
|
|
||||||
|
|
||||||
- **Fase 1**: Celdas vacías (round-robin aleatorio con semilla fija para reproducibilidad).
|
|
||||||
- **Fase 2**: Celdas con calidad por debajo del umbral.
|
|
||||||
- **Fase 3**: UCB (Upper Confidence Bound) sobre todas las celdas — equilibrio entre explotar celdas de alta calidad y explorar celdas poco visitadas.
|
|
||||||
|
|
||||||
#### 4. Selector con prior — Candidato F (`CellSelectorPrior`)
|
|
||||||
|
|
||||||
Extiende `CellSelector` incorporando los pesos de `ConstructPrior`:
|
|
||||||
|
|
||||||
- **Fase 1**: Las celdas vacías con peso de prior alto (> 1.5× epsilon) se priorizan mediante muestreo ponderado. Esto hace que el dataset sea útil para RAG desde los primeros ejemplos (los patrones más frecuentes en producción se cubren primero).
|
|
||||||
- **Fase 2**: Las celdas de baja calidad se seleccionan con UCB multiplicado por el peso del prior.
|
|
||||||
- **Fase 3**: Una vez que las celdas de prior alto alcanzan el umbral de calidad, se activa la cobertura de las celdas cola (tail cells, prior ≈ epsilon).
|
|
||||||
|
|
||||||
#### 5. Construcción del prompt y llamada a la API (`Generator`)
|
|
||||||
|
|
||||||
Para cada celda seleccionada se construye un prompt estructurado que incluye el LRM completo de AVAP y una especificación precisa: "genera UN problema de benchmark estilo MBPP que use **exactamente** estos constructs: `{cell}`". El generador nunca pide exploración libre — cada llamada tiene una especificación de cobertura forzada.
|
|
||||||
|
|
||||||
La respuesta se parsea esperando JSON con campos `prompt`, `code`, y `tests`.
|
|
||||||
|
|
||||||
#### 6. Validación y actualización del mapa
|
|
||||||
|
|
||||||
Cada ejemplo generado pasa por `CellValidator`:
|
|
||||||
|
|
||||||
1. Se intenta parsear con el parser AVAP (si está disponible).
|
|
||||||
2. Se detectan los constructs presentes.
|
|
||||||
3. Se calcula la puntuación de calidad.
|
|
||||||
4. Si la calidad supera el ejemplo actual de la celda (o la celda está vacía), el mapa se actualiza.
|
|
||||||
|
|
||||||
#### 7. Checkpoints y métricas
|
|
||||||
|
|
||||||
Cada 100 llamadas a la API se imprime un checkpoint con: tamaño del dataset, tasa de éxito, estado del mapa, entropía de distribución, y divergencia KL entre el dataset y el prior (KL = 0 significa alineación perfecta con patrones de producción).
|
|
||||||
|
|
||||||
#### 8. Guardado (`_save`)
|
|
||||||
|
|
||||||
Al finalizar se guardan dos ficheros:
|
|
||||||
|
|
||||||
- `<output>.json` — el dataset completo.
|
|
||||||
- `<output>_coverage_stats.json` — estadísticas de cobertura, entropía, frecuencia por nodo, y KL-divergence respecto al prior.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Flujo de trabajo recomendado
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Paso 1: Generar el prior estadístico (una sola vez, o al actualizar la gramática)
|
|
||||||
python construct_prior.py --generate-map --github-token $GITHUB_TOKEN --max-files 200
|
|
||||||
|
|
||||||
# Paso 2: Verificar el prior generado
|
|
||||||
python construct_prior.py --verify --map construct_map.yaml
|
|
||||||
|
|
||||||
# Paso 3: Generar el dataset
|
|
||||||
python generate_mbap_v2.py \
|
|
||||||
--lrm avap.md \
|
|
||||||
--parser http://localhost:8080 \
|
|
||||||
--prior-map construct_map.yaml \
|
|
||||||
--problems 5000 \
|
|
||||||
--output output/dataset_v2.json \
|
|
||||||
--mode map-elites-prior
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Archivos generados
|
|
||||||
|
|
||||||
| Fichero | Generado por | Descripción |
|
|
||||||
|---|---|---|
|
|
||||||
| `construct_map.yaml` | `construct_prior.py` | Pesos de co-ocurrencia de pares/tríos AVAP extraídos de GitHub |
|
|
||||||
| `<output>.json` | `generate_mbap_v2.py` | Dataset de ejemplos AVAP estilo MBPP |
|
|
||||||
| `<output>_coverage_stats.json` | `generate_mbap_v2.py` | Estadísticas de cobertura, entropía y KL-divergence |
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,884 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
AVAP Dataset Generator v2 — MAP-Elites Quality-Diversity Pipeline
|
|
||||||
==================================================================
|
|
||||||
|
|
||||||
View reference
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from collections import defaultdict
|
|
||||||
from itertools import combinations
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import anthropic
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from construct_prior import ConstructPrior, AVAP_NODE_NAMES
|
|
||||||
|
|
||||||
# Keyword patterns that identify each AVAP construct in raw source code.
# This table drives the keyword-scanning fallback in CellValidator; the
# canonical construct *names* come from construct_prior.AVAP_NODE_NAMES.
AVAP_NODE_TYPES = {
    # Parameters, results and variables
    "addParam": ["addParam("],
    "addResult": ["addResult("],
    "_status": ["_status"],
    "addVar": ["addVar("],
    "getListLen": ["getListLen("],
    "getQueryParamList": ["getQueryParamList("],
    "itemFromList": ["itemFromList("],
    "replace": ["replace("],
    "randomString": ["randomString("],
    # Control flow — if_mode2's pattern is a superset of if_mode1's, so
    # detection code must test if_mode2 first.
    "if_mode1": ["if("],
    "if_mode2": ["if(None, None,"],
    "else": ["else()"],
    "end": ["end()"],
    "startLoop": ["startLoop("],
    "endLoop": ["endLoop()"],
    "try": ["try()"],
    "exception": ["exception()"],
    "return": ["return("],
    # Concurrency
    "go": ["go("],
    "gather": ["gather("],
    # Connectors and ORM
    "avapConnector": ["avapConnector("],
    "ormCheckTable": ["ormCheckTable("],
    "ormDirect": ["ormDirect("],
    "ormAccessSelect": ["ormAccessSelect("],
    "ormAccessInsert": ["ormAccessInsert("],
    "ormAccessUpdate": ["ormAccessUpdate("],
    # JSON handling
    "variableFromJSON": ["variableFromJSON("],
    "AddVariableToJSON": ["AddVariableToJSON("],
    # Crypto
    "encodeSHA256": ["encodeSHA256("],
    "encodeMD5": ["encodeMD5("],
    # Dates and timestamps
    "getTimeStamp": ["getTimeStamp("],
    "getDateTime": ["getDateTime("],
    "stampToDatetime": ["stampToDatetime("],
    # HTTP clients
    "RequestGet": ["RequestGet("],
    "RequestPost": ["RequestPost("],
    # Program structure — keyword-style commands followed by a space, not "("
    "function": ["function "],
    "import": ["import "],
    "include": ["include("],
}
|
|
||||||
|
|
||||||
# Canonical construct vocabulary, shared with construct_prior.py so that both
# scripts enumerate exactly the same MAP-Elites cell space.
NODE_TYPE_NAMES = AVAP_NODE_NAMES

# Floor weight the prior assigns to tail cells (mirrors the default value of
# the --prior-epsilon CLI flag).
_PRIOR_EPSILON = 0.05
|
|
||||||
|
|
||||||
class CellValidator:
|
|
||||||
|
|
||||||
def __init__(self, parser_url: str, parser_timeout: int = 5):
|
|
||||||
self.parser_url = parser_url.rstrip("/")
|
|
||||||
self.parser_timeout = parser_timeout
|
|
||||||
self._parser_available = True
|
|
||||||
|
|
||||||
|
|
||||||
def parse(self, code: str) -> tuple[bool, dict, str]:
|
|
||||||
|
|
||||||
if not self._parser_available:
|
|
||||||
return None, {}, "parser_unavailable"
|
|
||||||
try:
|
|
||||||
resp = requests.post(
|
|
||||||
f"{self.parser_url}/parse",
|
|
||||||
json={"code": code},
|
|
||||||
timeout=self.parser_timeout,
|
|
||||||
)
|
|
||||||
data = resp.json()
|
|
||||||
if data.get("valid", False):
|
|
||||||
return True, data.get("ast", {}), ""
|
|
||||||
return False, {}, data.get("error", "parse error")
|
|
||||||
except requests.exceptions.ConnectionError:
|
|
||||||
self._parser_available = False
|
|
||||||
return None, {}, "parser_unavailable"
|
|
||||||
except Exception as e:
|
|
||||||
return False, {}, str(e)
|
|
||||||
def detect_constructs(self, code: str, ast: dict) -> set:
|
|
||||||
if ast:
|
|
||||||
return self._from_ast(ast)
|
|
||||||
return self._from_source(code)
|
|
||||||
|
|
||||||
def _from_ast(self, ast: dict) -> set:
|
|
||||||
found = set()
|
|
||||||
if isinstance(ast, dict):
|
|
||||||
if "type" in ast:
|
|
||||||
found.add(ast["type"])
|
|
||||||
for v in ast.values():
|
|
||||||
found |= self._from_ast(v)
|
|
||||||
elif isinstance(ast, list):
|
|
||||||
for item in ast:
|
|
||||||
found |= self._from_ast(item)
|
|
||||||
return found
|
|
||||||
|
|
||||||
def _from_source(self, code: str) -> set:
|
|
||||||
found = set()
|
|
||||||
if "if(None, None," in code:
|
|
||||||
found.add("if_mode2")
|
|
||||||
elif "if(" in code:
|
|
||||||
found.add("if_mode1")
|
|
||||||
for name, patterns in AVAP_NODE_TYPES.items():
|
|
||||||
if name in ("if_mode1", "if_mode2"):
|
|
||||||
continue # already handled
|
|
||||||
for pat in patterns:
|
|
||||||
if pat in code:
|
|
||||||
found.add(name)
|
|
||||||
break
|
|
||||||
return found
|
|
||||||
|
|
||||||
def cell_quality(
|
|
||||||
self,
|
|
||||||
code: str,
|
|
||||||
ast: dict,
|
|
||||||
test_list: list,
|
|
||||||
cell: frozenset,
|
|
||||||
alpha: float = 0.3,
|
|
||||||
beta: float = 0.2,
|
|
||||||
gamma: float = 0.1,
|
|
||||||
) -> tuple[float, dict]:
|
|
||||||
|
|
||||||
detected = self.detect_constructs(code, ast)
|
|
||||||
all_types = set(NODE_TYPE_NAMES)
|
|
||||||
|
|
||||||
cell_constructs = set(cell)
|
|
||||||
present_required = cell_constructs & detected
|
|
||||||
fidelity = len(present_required) / max(len(cell_constructs), 1)
|
|
||||||
|
|
||||||
extra = detected - cell_constructs
|
|
||||||
bonus_ratio = len(extra) / max(len(all_types) - len(cell_constructs), 1)
|
|
||||||
|
|
||||||
tq = sum(
|
|
||||||
1 for t in test_list
|
|
||||||
if isinstance(t, str) and "re.match(" in t and len(t.strip()) > 10
|
|
||||||
) / max(len(test_list), 1)
|
|
||||||
|
|
||||||
lines = [l.strip() for l in code.split("\n") if l.strip()]
|
|
||||||
richness = min(len(lines) / 30.0, 1.0) # cap at 30 lines = 1.0
|
|
||||||
|
|
||||||
quality = fidelity + alpha * bonus_ratio + beta * tq + gamma * richness
|
|
||||||
|
|
||||||
return quality, {
|
|
||||||
"fidelity": round(fidelity, 3),
|
|
||||||
"bonus_ratio": round(bonus_ratio, 3),
|
|
||||||
"test_quality": round(tq, 3),
|
|
||||||
"richness": round(richness, 3),
|
|
||||||
"quality": round(quality, 3),
|
|
||||||
"detected": sorted(detected),
|
|
||||||
"cell": sorted(cell),
|
|
||||||
"extra": sorted(extra),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class CoverageMap:
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, cell_size: int = 3):
|
|
||||||
|
|
||||||
self.cell_size = cell_size
|
|
||||||
self._map: dict[frozenset, tuple[dict, float, dict]] = {}
|
|
||||||
self._attempts: dict[frozenset, int] = defaultdict(int)
|
|
||||||
self._all_cells = self._build_cells()
|
|
||||||
|
|
||||||
def _build_cells(self) -> list[frozenset]:
|
|
||||||
cells = []
|
|
||||||
for size in range(2, self.cell_size + 1):
|
|
||||||
for combo in combinations(NODE_TYPE_NAMES, size):
|
|
||||||
cells.append(frozenset(combo))
|
|
||||||
return cells
|
|
||||||
|
|
||||||
@property
|
|
||||||
def total_cells(self) -> int:
|
|
||||||
return len(self._all_cells)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def filled_cells(self) -> int:
|
|
||||||
return len(self._map)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def fill_rate(self) -> float:
|
|
||||||
return self.filled_cells / max(self.total_cells, 1)
|
|
||||||
|
|
||||||
def update(
|
|
||||||
self,
|
|
||||||
cell: frozenset,
|
|
||||||
example: dict,
|
|
||||||
quality: float,
|
|
||||||
components: dict,
|
|
||||||
) -> bool:
|
|
||||||
self._attempts[cell] += 1
|
|
||||||
current = self._map.get(cell)
|
|
||||||
if current is None or quality > current[1]:
|
|
||||||
self._map[cell] = (example, quality, components)
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_empty_cells(self) -> list[frozenset]:
|
|
||||||
return [c for c in self._all_cells if c not in self._map]
|
|
||||||
|
|
||||||
def get_low_quality_cells(self, threshold: float = 0.7) -> list[frozenset]:
|
|
||||||
return [
|
|
||||||
c for c, (_, q, _) in self._map.items()
|
|
||||||
if q < threshold
|
|
||||||
]
|
|
||||||
|
|
||||||
def get_example(self, cell: frozenset) -> dict | None:
|
|
||||||
entry = self._map.get(cell)
|
|
||||||
return entry[0] if entry else None
|
|
||||||
|
|
||||||
def all_examples(self) -> list[dict]:
|
|
||||||
return [ex for ex, _, _ in self._map.values()]
|
|
||||||
|
|
||||||
def node_type_frequency(self) -> dict[str, int]:
|
|
||||||
|
|
||||||
freq = defaultdict(int)
|
|
||||||
for cell in self._map:
|
|
||||||
for nt in cell:
|
|
||||||
freq[nt] += 1
|
|
||||||
return dict(freq)
|
|
||||||
|
|
||||||
def distribution_entropy(self) -> float:
|
|
||||||
|
|
||||||
freq = self.node_type_frequency()
|
|
||||||
total = sum(freq.values())
|
|
||||||
if total == 0:
|
|
||||||
return 0.0
|
|
||||||
entropy = 0.0
|
|
||||||
for count in freq.values():
|
|
||||||
p = count / total
|
|
||||||
if p > 0:
|
|
||||||
entropy -= p * math.log2(p)
|
|
||||||
return round(entropy, 3)
|
|
||||||
|
|
||||||
def fill_summary(self) -> str:
|
|
||||||
empty = len(self.get_empty_cells())
|
|
||||||
low = len(self.get_low_quality_cells())
|
|
||||||
entropy = self.distribution_entropy()
|
|
||||||
return (
|
|
||||||
f"Cells: {self.filled_cells}/{self.total_cells} filled "
|
|
||||||
f"({100*self.fill_rate:.1f}%) | "
|
|
||||||
f"Low quality: {low} | "
|
|
||||||
f"Empty: {empty} | "
|
|
||||||
f"Entropy: {entropy:.2f} bits"
|
|
||||||
)
|
|
||||||
|
|
||||||
class CellSelector:
    """Phase-based cell selection for the no-prior baseline (Candidate E).

    Phase 1: fill empty cells (seeded RNG for reproducibility).
    Phase 2: revisit cells scoring below the quality threshold.
    Phase 3: UCB1 over all cells — exploit high quality vs explore
             rarely-attempted cells.
    """

    def __init__(
        self,
        coverage_map: CoverageMap,
        quality_threshold: float = 0.80,
        ucb_c: float = 1.0,
    ):
        self.map = coverage_map
        self.quality_threshold = quality_threshold
        self.ucb_c = ucb_c
        self._total_calls = 0
        import random
        self._rng = random.Random(42)  # fixed seed: reproducible selection order

    def select(self) -> frozenset:
        """Return the next cell to target, advancing through the phases."""
        self._total_calls += 1

        empty_cells = self.map.get_empty_cells()
        if empty_cells:
            return self._rng.choice(empty_cells)

        below_threshold = self.map.get_low_quality_cells(self.quality_threshold)
        if below_threshold:
            return self._rng.choice(below_threshold)

        return self._ucb_select()

    def _ucb_select(self) -> frozenset:
        """Pick the cell maximising quality plus an exploration bonus (UCB1)."""
        total = max(self._total_calls, 1)
        log_total = math.log(total)
        best_cell, best_score = None, -math.inf

        for cell in self.map._all_cells:
            attempts = max(self.map._attempts.get(cell, 0), 1)
            entry = self.map._map.get(cell)
            quality = entry[1] if entry else 0.0
            score = quality + self.ucb_c * math.sqrt(log_total / attempts)
            if score > best_score:
                best_cell, best_score = cell, score

        return best_cell
|
|
||||||
|
|
||||||
class CellSelectorPrior(CellSelector):
    """Prior-weighted cell selection (Candidate F).

    Same three phases as CellSelector, but every choice is biased by the
    empirical co-occurrence weights from ConstructPrior, so construct
    combinations frequent in production code are covered first.
    """

    def __init__(
        self,
        coverage_map: CoverageMap,
        prior: ConstructPrior,
        quality_threshold: float = 0.80,
        ucb_c: float = 1.0,
        phase3_threshold: float = 0.70,
    ):
        super().__init__(coverage_map, quality_threshold, ucb_c)
        self.prior = prior
        self.phase3_threshold = phase3_threshold
        self._tail_cells: set[frozenset] = set()
        self._phase3_active = False

    def select(self) -> frozenset:
        """Return the next cell, biased by prior weight in every phase."""
        self._total_calls += 1

        empty_cells = self.map.get_empty_cells()
        if empty_cells:
            # Prefer empty cells whose prior clearly exceeds the tail floor;
            # fall back to weighted sampling over all empty cells.
            preferred = [
                cell for cell in empty_cells
                if self.prior.cell_weight(cell) > self.prior.epsilon * 1.5
            ]
            return self._weighted_sample(preferred or empty_cells)

        below_threshold = self.map.get_low_quality_cells(self.quality_threshold)
        if below_threshold:
            return self._ucb_prior_select(below_threshold)

        return self._ucb_prior_select(self.map._all_cells)

    def _weighted_sample(self, cells: list[frozenset]) -> frozenset:
        """Sample one cell with probability proportional to its prior weight."""
        weights = [self.prior.cell_weight(cell) for cell in cells]
        total = sum(weights)
        if total == 0:
            return self._rng.choice(cells)
        threshold = self._rng.random() * total
        running = 0.0
        for cell, weight in zip(cells, weights):
            running += weight
            if threshold <= running:
                return cell
        return cells[-1]  # guard against float round-off

    def _ucb_prior_select(self, cells) -> frozenset:
        """UCB1 score scaled by each cell's prior weight."""
        total = max(self._total_calls, 1)
        log_total = math.log(total)
        best_cell, best_score = None, -math.inf

        for cell in cells:
            attempts = max(self.map._attempts.get(cell, 0), 1)
            entry = self.map._map.get(cell)
            quality = entry[1] if entry else 0.0
            exploration = self.ucb_c * math.sqrt(log_total / attempts)
            score = self.prior.cell_weight(cell) * (quality + exploration)
            if score > best_score:
                best_cell, best_score = cell, score

        return best_cell
|
|
||||||
|
|
||||||
# System prompt sent with every generation call: pins the AVAP syntax rules
# and forces the model to answer with a single raw JSON object.
SYSTEM_PROMPT = """Eres un experto en el lenguaje AVAP.
Se te proporciona el Language Reference Manual (LRM) completo de AVAP.
Tu tarea es generar UN problema de benchmark estilo MBPP para evaluar
modelos de lenguaje en su capacidad de generar código AVAP correcto.

REGLAS ESTRICTAS para el código AVAP generado:
1. Una instrucción por línea. EOL es el terminador absoluto.
2. Sin indentación significativa (es solo decorativa).
3. Bloques: if()...else()...end(), startLoop()...endLoop(), try()...exception()...end()
4. Funciones: function name(args) { ... return(val) }
5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
6. if() Modo 2: if(None, None, `expresion_completa_como_string`)
7. _status se asigna con: addVar(_status, 404)
8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
9. ormCheckTable firma: ormCheckTable(nombre_tabla, varTarget)
10. ormDirect firma: ormDirect("SELECT ... %s" % var, varTarget)
11. getQueryParamList firma: getQueryParamList(param_name, varTarget)
12. NUNCA uses registerEndpoint(), NUNCA uses mainHandler().
13. El código se ejecuta DIRECTAMENTE, línea a línea.

FORMATO DE SALIDA: responde ÚNICAMENTE con UN objeto JSON válido (no array).
Sin texto adicional, sin bloques de código markdown.
{
  "task_id": 1,
  "text": "<enunciado del problema en español>",
  "code": "<código AVAP con saltos de línea como \\n>",
  "test_inputs": { "<param1>": <valor1> },
  "test_list": ["re.match(r'<patrón>', <variable>)", ...]
}

test_list: USA ÚNICAMENTE re.match(). NUNCA comparaciones directas (==, !=).
"""
|
|
||||||
|
|
||||||
|
|
||||||
def build_cell_prompt(
|
|
||||||
lrm: str,
|
|
||||||
cell: frozenset,
|
|
||||||
existing_example: dict | None,
|
|
||||||
map_summary: str,
|
|
||||||
) -> str:
|
|
||||||
constructs_list = ", ".join(f"`{c}`" for c in sorted(cell))
|
|
||||||
|
|
||||||
improvement_note = ""
|
|
||||||
if existing_example:
|
|
||||||
improvement_note = f"""
|
|
||||||
El siguiente ejemplo YA existe para esta combinación con calidad mejorable.
|
|
||||||
Genera algo DISTINTO y MÁS COMPLEJO que lo supere:
|
|
||||||
|
|
||||||
```
|
|
||||||
{existing_example.get('code', '')}
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
|
|
||||||
return f"""# LRM AVAP — Language Reference Manual
|
|
||||||
|
|
||||||
{lrm}
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# ESTADO DEL MAPA DE COBERTURA
|
|
||||||
|
|
||||||
{map_summary}
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# TAREA — ESPECIFICACIÓN OBLIGATORIA
|
|
||||||
|
|
||||||
Genera UN ejemplo AVAP que use OBLIGATORIAMENTE TODOS estos constructs:
|
|
||||||
|
|
||||||
**{constructs_list}**
|
|
||||||
|
|
||||||
El ejemplo DEBE contener todos los constructs listados arriba.
|
|
||||||
Si tu código no los usa todos, la tarea fracasa.
|
|
||||||
|
|
||||||
Adicionalmente:
|
|
||||||
- Combina los constructs requeridos en un escenario realista de microservicio HTTP
|
|
||||||
- Añade constructs adicionales donde sea natural (aumenta la puntuación)
|
|
||||||
- Código complejo y rico — no ejemplos triviales de 3 líneas
|
|
||||||
- 2-3 aserciones re.match() en test_list
|
|
||||||
{improvement_note}
|
|
||||||
Responde ÚNICAMENTE con el objeto JSON. Sin texto antes ni después.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def call_api(
    client: anthropic.Anthropic,
    lrm: str,
    cell: frozenset,
    task_id: int,
    existing_example: dict | None,
    map_summary: str,
    retries: int = 3,
) -> dict | None:
    """Generate one dataset example for *cell* via the Anthropic API.

    Retries up to *retries* times: exponential backoff on JSON/validation
    errors, linear backoff on rate limits, fixed backoff on other API
    errors. Returns the parsed problem dict with ``task_id`` injected, or
    None when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            message = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4000,
                system=SYSTEM_PROMPT,
                messages=[{
                    "role": "user",
                    "content": build_cell_prompt(lrm, cell, existing_example, map_summary),
                }],
            )
            raw = message.content[0].text.strip()

            # Strip an optional Markdown code fence around the JSON payload.
            if raw.startswith("```"):
                lines = raw.splitlines()
                raw = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])

            problem = json.loads(raw)
            # Validate the minimal schema before accepting the response.
            if not isinstance(problem, dict):
                raise ValueError("Response is not a JSON object")
            for field in ("text", "code", "test_list"):
                if field not in problem:
                    raise ValueError(f"Missing field '{field}'")
            if "test_inputs" not in problem:
                problem["test_inputs"] = {}
            problem["task_id"] = task_id
            return problem

        except (json.JSONDecodeError, ValueError) as e:
            print(f"\n Attempt {attempt}/{retries} — parse error: {e}")
            if attempt < retries:
                time.sleep(2 ** attempt)
        except anthropic.RateLimitError:
            # Fix: only sleep when another attempt remains — previously this
            # slept up to 30*retries seconds even after the final attempt,
            # delaying the inevitable `return None` for no benefit.
            if attempt < retries:
                wait = 30 * attempt
                print(f"\n Rate limit — waiting {wait}s...")
                time.sleep(wait)
            else:
                print("\n Rate limit — retries exhausted.")
        except anthropic.APIError as e:
            print(f"\n API error at attempt {attempt}: {e}")
            if attempt < retries:
                time.sleep(5)

    # All attempts failed.
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def run_map_elites(args, client, lrm, output_path):
    """Candidate E: MAP-Elites generation loop with uniform cell weighting.

    Repeatedly selects a coverage cell, asks the model for an example using
    the cell's constructs, validates and scores it, and accepts it only
    when every required construct is present. Saves a checkpoint every 50
    accepted examples and a final snapshot at the end.

    Returns a tuple ``(dataset, cmap, valid_count, call_count)``.
    """
    validator = CellValidator(parser_url=args.parser)
    cmap = CoverageMap(cell_size=args.cell_size)
    selector = CellSelector(cmap, quality_threshold=args.quality_threshold)
    dataset = []
    task_id = 1
    call_count = 0
    valid_count = 0

    print(f"\n MAP-Elites mode | cells: {cmap.total_cells} | target: {args.problems} examples")
    print(f" Cell size: {args.cell_size} | Quality threshold: {args.quality_threshold}")
    print("─" * 65)

    # Budget: at most 4 API calls per requested example.
    max_calls = args.problems * 4

    while len(dataset) < args.problems and call_count < max_calls:

        cell = selector.select()
        existing = cmap.get_example(cell)
        call_count += 1

        print(
            f" [{call_count:04d}] Cell {sorted(cell)} "
            f"| filled={cmap.filled_cells}/{cmap.total_cells} "
            f"| dataset={len(dataset)} ... ",
            end="", flush=True,
        )

        problem = call_api(
            client, lrm, cell, task_id,
            existing_example=existing,
            map_summary=cmap.fill_summary(),
        )

        if problem is None:
            print("SKIP (generation failed)")
            continue

        code = problem["code"]
        test_list = problem.get("test_list", [])

        is_valid, ast, error_msg = validator.parse(code)

        # is_valid is None when the parser service is unreachable;
        # treat the code as valid and fall back to keyword detection.
        if is_valid is None:
            is_valid, ast = True, {}
            if call_count == 1:
                print("\n Parser unavailable — using keyword fallback", flush=True)

        if is_valid is False:
            print(f"INVALID ({error_msg[:40]})")
            problem["_validation"] = {"valid": False, "error": error_msg}
            continue

        valid_count += 1

        # Compute cell quality
        quality, components = validator.cell_quality(
            code, ast, test_list, cell,
            alpha=args.alpha, beta=args.beta, gamma=args.gamma,
        )
        problem["_cell"] = sorted(cell)
        problem["_quality"] = components

        # Reject examples that miss any required construct of the cell.
        if components["fidelity"] < 1.0:
            missing = set(cell) - set(components["detected"])
            print(f"MISSING constructs: {sorted(missing)}")
            continue

        # Update the elite for this cell (return value intentionally unused;
        # the previous `cell_updates` counter was never read).
        cmap.update(cell, problem, quality, components)

        dataset.append(problem)
        task_id += 1

        print(
            f"OK quality={quality:.3f} "
            f"fidelity={components['fidelity']:.2f} "
            f"extra={len(components['extra'])}"
        )

        # Periodic checkpoint: persist dataset + coverage statistics.
        if len(dataset) % 50 == 0:
            _save(dataset, output_path, cmap)
            freq = cmap.node_type_frequency()
            print("\n ── Checkpoint ──────────────────────────────────")
            print(f" Dataset: {len(dataset)} | Valid: {valid_count}/{call_count}")
            print(f" {cmap.fill_summary()}")
            print(f" Top-5 most frequent: {sorted(freq, key=freq.get, reverse=True)[:5]}")
            print(f" Top-5 least frequent: {sorted(freq, key=freq.get)[:5]}")
            print(" ────────────────────────────────────────────────\n")

        # Gentle pacing between API calls.
        time.sleep(0.5)

    _save(dataset, output_path, cmap)
    return dataset, cmap, valid_count, call_count
|
|
||||||
|
|
||||||
def run_map_elites_prior(args, client, lrm, output_path):
    """Candidate F: MAP-Elites generation loop weighted by a ConstructPrior.

    Same accept/reject loop as ``run_map_elites``, but cell selection is
    driven by ``CellSelectorPrior`` and each accepted example records the
    prior weight of its cell. Checkpoints additionally report the KL
    divergence between the dataset distribution and the prior.

    Returns a tuple ``(dataset, cmap, valid_count, call_count, prior)``.
    """
    print("\n Loading ConstructPrior...", flush=True)
    prior_map = getattr(args, "prior_map", "construct_map.yaml")
    epsilon = getattr(args, "prior_epsilon", _PRIOR_EPSILON)
    yaml_path = Path(prior_map)

    if yaml_path.exists():
        prior = ConstructPrior.from_yaml(yaml_path, epsilon=epsilon)
    else:
        # Fallback: yaml not found — use static prior and warn
        print(f" [WARN] construct_map.yaml not found at '{yaml_path}'.")
        print(" [WARN] Using static fallback prior. Generate the real prior with:")
        print(" [WARN] python construct_prior.py --generate-map --github-token TOKEN")
        prior = ConstructPrior.from_static_fallback(epsilon=epsilon)

    print(f" {prior.coverage_summary()}")

    validator = CellValidator(parser_url=args.parser)
    cmap = CoverageMap(cell_size=args.cell_size)
    selector = CellSelectorPrior(
        cmap, prior,
        quality_threshold=args.quality_threshold,
        phase3_threshold=getattr(args, "prior_phase3_threshold", 0.70),
    )
    dataset = []
    task_id = 1
    call_count = 0
    valid_count = 0

    print(f"\n MAP-Elites+Prior mode | cells: {cmap.total_cells} | target: {args.problems} examples")
    print(f" Cell size: {args.cell_size} | Quality threshold: {args.quality_threshold}")
    print("─" * 65)

    # Budget: at most 4 API calls per requested example.
    max_calls = args.problems * 4

    while len(dataset) < args.problems and call_count < max_calls:

        cell = selector.select()
        existing = cmap.get_example(cell)
        prior_w = prior.cell_weight(cell)
        call_count += 1

        print(
            f" [{call_count:04d}] Cell {sorted(cell)} "
            f"| prior={prior_w:.3f} "
            f"| filled={cmap.filled_cells}/{cmap.total_cells} "
            f"| dataset={len(dataset)} ... ",
            end="", flush=True,
        )

        problem = call_api(
            client, lrm, cell, task_id,
            existing_example=existing,
            map_summary=cmap.fill_summary(),
        )

        if problem is None:
            print("SKIP (generation failed)")
            continue

        code = problem["code"]
        test_list = problem.get("test_list", [])

        is_valid, ast, error_msg = validator.parse(code)

        # is_valid is None when the parser service is unreachable;
        # treat the code as valid and fall back to keyword detection.
        if is_valid is None:
            is_valid, ast = True, {}
            if call_count == 1:
                print("\n Parser unavailable — using keyword fallback", flush=True)

        if is_valid is False:
            print(f"INVALID ({error_msg[:40]})")
            problem["_validation"] = {"valid": False, "error": error_msg}
            continue

        valid_count += 1

        quality, components = validator.cell_quality(
            code, ast, test_list, cell,
            alpha=args.alpha, beta=args.beta, gamma=args.gamma,
        )
        problem["_cell"] = sorted(cell)
        problem["_prior_weight"] = round(prior_w, 4)
        problem["_quality"] = components

        # Reject examples that miss any required construct of the cell.
        if components["fidelity"] < 1.0:
            missing = set(cell) - set(components["detected"])
            print(f"MISSING constructs: {sorted(missing)}")
            continue

        # Update the elite for this cell (return value intentionally unused;
        # the previous `cell_updates` counter was never read).
        cmap.update(cell, problem, quality, components)

        dataset.append(problem)
        task_id += 1

        print(
            f"OK quality={quality:.3f} "
            f"fidelity={components['fidelity']:.2f} "
            f"prior={prior_w:.3f} "
            f"extra={len(components['extra'])}"
        )

        # Periodic checkpoint: persist dataset + coverage statistics.
        if len(dataset) % 50 == 0:
            _save(dataset, output_path, cmap, prior=prior)
            freq = cmap.node_type_frequency()
            kl = prior.kl_divergence(freq)
            print("\n ── Checkpoint ──────────────────────────────────")
            print(f" Dataset: {len(dataset)} | Valid: {valid_count}/{call_count}")
            print(f" {cmap.fill_summary()}")
            print(f" KL(dataset ‖ prior): {kl:.4f} (lower = closer to production patterns)")
            print(f" Top-5 most frequent: {sorted(freq, key=freq.get, reverse=True)[:5]}")
            print(f" Top-5 least frequent: {sorted(freq, key=freq.get)[:5]}")
            print(" ────────────────────────────────────────────────\n")

        # Gentle pacing between API calls.
        time.sleep(0.5)

    _save(dataset, output_path, cmap, prior=prior)
    return dataset, cmap, valid_count, call_count, prior
|
|
||||||
|
|
||||||
|
|
||||||
def _save(dataset: list, path: Path, cmap: CoverageMap, prior: ConstructPrior | None = None) -> None:
    """Persist *dataset* to *path* plus a sibling ``*_coverage_stats.json`` report.

    The stats file summarizes the coverage map (fill rate, entropy,
    per-node-type frequency, low-quality/empty cell counts). When *prior*
    is given, KL divergence against the prior and the prior's own summary
    are included as well.

    Note: the ``prior`` annotation was ``ConstructPrior = None`` (implicit
    Optional, deprecated by PEP 484); it now uses the explicit ``| None``
    form already used elsewhere in this file.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    # Save coverage map statistics alongside dataset
    stats_path = path.with_name(path.stem + "_coverage_stats.json")
    freq = cmap.node_type_frequency()
    stats = {
        "total_cells": cmap.total_cells,
        "filled_cells": cmap.filled_cells,
        "fill_rate": round(cmap.fill_rate, 4),
        "distribution_entropy": cmap.distribution_entropy(),
        "node_type_frequency": freq,
        "low_quality_cells": len(cmap.get_low_quality_cells()),
        "empty_cells": len(cmap.get_empty_cells()),
    }
    if prior is not None:
        stats["kl_divergence_dataset_vs_prior"] = prior.kl_divergence(freq)
        stats["prior_summary"] = prior.coverage_summary()
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
|
|
||||||
|
|
||||||
def main() -> None:
    """CLI entry point: parse arguments, run the selected pipeline, report results."""
    parser = argparse.ArgumentParser(
        description="AVAP Dataset Generator v2 — MAP-Elites Quality-Diversity Pipeline"
    )
    # Input/output and generation budget.
    parser.add_argument("--lrm", default="avap.md")
    parser.add_argument("--output", default="output/mbpp_avap_v2.json")
    parser.add_argument("--problems", type=int, default=5000)
    parser.add_argument("--parser", default="http://localhost:8080",
                        help="AVAP parser URL")
    # MAP-Elites coverage-map geometry and acceptance threshold.
    parser.add_argument("--cell-size", type=int, default=3,
                        help="Max constructs per cell: 2=pairs, 3=pairs+trios (default: 3)")
    parser.add_argument("--quality-threshold", type=float, default=0.80,
                        help="Min quality to consider a cell 'good' (default: 0.80)")
    # Weights of the three cell-quality components (see CellValidator.cell_quality).
    parser.add_argument("--alpha", type=float, default=0.30,
                        help="Weight for bonus constructs in cell quality (default: 0.30)")
    parser.add_argument("--beta", type=float, default=0.20,
                        help="Weight for test quality in cell quality (default: 0.20)")
    parser.add_argument("--gamma", type=float, default=0.10,
                        help="Weight for code richness in cell quality (default: 0.10)")
    parser.add_argument(
        "--mode",
        choices=["map-elites-prior", "map-elites", "reward"],
        default="map-elites-prior",
        help=(
            "map-elites-prior: Candidate F — MAP-Elites + ConstructPrior (default)\n"
            "map-elites: Candidate E — MAP-Elites, uniform cell weighting\n"
            "reward: Candidate A — CW-Reward pool (comparison baseline)"
        ),
    )
    # ConstructPrior configuration (only used by --mode map-elites-prior).
    parser.add_argument(
        "--prior-map",
        default="construct_map.yaml",
        metavar="FILE",
        help=(
            "Path to construct_map.yaml generated by construct_prior.py.\n"
            "Generate it first: python construct_prior.py --generate-map\n"
            "Default: construct_map.yaml (in current directory)"
        ),
    )
    parser.add_argument(
        "--prior-epsilon",
        type=float,
        default=_PRIOR_EPSILON,
        help=f"Minimum prior weight for tail cells (default: {_PRIOR_EPSILON})",
    )
    parser.add_argument(
        "--prior-phase3-threshold",
        type=float,
        default=0.70,
        help=(
            "Quality threshold above which Phase 2 ends and tail (low-prior) "
            "cells become the focus. Default: 0.70"
        ),
    )
    parser.add_argument("--api-key", default=None)
    args = parser.parse_args()

    # Resolve the API key: CLI flag takes precedence over the environment.
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        sys.exit("ERROR: ANTHROPIC_API_KEY not set.")

    # Load the Language Reference Manual text that gets embedded in prompts.
    lrm_path = Path(args.lrm)
    if not lrm_path.exists():
        sys.exit(f"ERROR: LRM '{lrm_path}' not found.")
    lrm = lrm_path.read_text(encoding="utf-8")

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    client = anthropic.Anthropic(api_key=api_key)

    # Human-readable label for the selected experiment candidate.
    mode_label = {
        "map-elites-prior": "Candidate F — MAP-Elites + ConstructPrior",
        "map-elites": "Candidate E — MAP-Elites (uniform)",
        "reward": "Candidate A — CW-Reward pool",
    }[args.mode]

    # Startup banner with the effective configuration.
    print("=" * 65)
    print(" AVAP Dataset Generator v2 — MAP-Elites Pipeline")
    print("=" * 65)
    print(f" Mode : {mode_label}")
    print(f" LRM : {lrm_path}")
    print(f" Output : {output_path}")
    print(f" Target examples: {args.problems}")
    print(f" Parser URL : {args.parser}")
    print(f" Cell size : {args.cell_size}")
    print(f" Quality thresh : {args.quality_threshold}")
    if args.mode == "map-elites-prior":
        yaml_exists = Path(args.prior_map).exists()
        print(f" Prior map : {args.prior_map} ({'✓ found' if yaml_exists else '✗ not found — will use static fallback'})")
        print(f" Prior epsilon : {args.prior_epsilon}")
    print("=" * 65)

    # `prior` stays None for modes that don't use a ConstructPrior.
    prior = None

    # Dispatch to the selected pipeline.
    if args.mode == "map-elites-prior":
        result = run_map_elites_prior(args, client, lrm, output_path)
        dataset, cmap, valid_count, call_count, prior = result
    elif args.mode == "map-elites":
        dataset, cmap, valid_count, call_count = run_map_elites(args, client, lrm, output_path)
    else:
        # The reward baseline lives in the v1 script; refuse here.
        sys.exit("ERROR: --mode reward (Candidate A) is not yet implemented in v2. "
                 "Use generate_mbap.py for the v1 reward baseline.")

    # Final report
    freq = cmap.node_type_frequency()
    entropy = cmap.distribution_entropy()

    print("\n" + "=" * 65)
    print(" Pipeline complete")
    print(f" Mode : {mode_label}")
    print(f" Total API calls : {call_count}")
    # max(call_count, 1) guards against division by zero when no calls were made.
    print(f" Valid examples : {valid_count} ({100*valid_count/max(call_count,1):.1f}%)")
    print(f" Dataset size : {len(dataset)}")
    print(f" {cmap.fill_summary()}")
    print(f" Distribution entropy : {entropy:.3f} bits (max={math.log2(len(NODE_TYPE_NAMES)):.2f})")
    if prior is not None:
        kl = prior.kl_divergence(freq)
        print(f" KL(dataset ‖ prior) : {kl:.4f} (0 = perfect alignment with production code)")
    print(f" Most covered : {sorted(freq, key=freq.get, reverse=True)[:5]}")
    print(f" Least covered : {sorted(freq, key=freq.get)[:5]}")
    print(f" Output : {output_path}")
    print("=" * 65)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point.
if __name__ == "__main__":
    main()
|
|
||||||
Loading…
Reference in New Issue