ADR0010
This commit is contained in:
parent
4ea11c6a52
commit
81125eae2a
|
|
@ -122,6 +122,10 @@ celerybeat.pid
|
||||||
# Environments
|
# Environments
|
||||||
.env
|
.env
|
||||||
.venv
|
.venv
|
||||||
|
|
||||||
|
# Classifier — model binaries and production label exports
|
||||||
|
Docker/data/
|
||||||
|
*.pkl
|
||||||
env/
|
env/
|
||||||
venv/
|
venv/
|
||||||
ENV/
|
ENV/
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,9 @@ services:
|
||||||
ports:
|
ports:
|
||||||
- "50052:50051"
|
- "50052:50051"
|
||||||
- "8000:8000"
|
- "8000:8000"
|
||||||
|
volumes:
|
||||||
|
- ./data:/data
|
||||||
|
|
||||||
environment:
|
environment:
|
||||||
ELASTICSEARCH_URL: ${ELASTICSEARCH_URL}
|
ELASTICSEARCH_URL: ${ELASTICSEARCH_URL}
|
||||||
ELASTICSEARCH_INDEX: ${ELASTICSEARCH_INDEX}
|
ELASTICSEARCH_INDEX: ${ELASTICSEARCH_INDEX}
|
||||||
|
|
@ -16,7 +19,12 @@ services:
|
||||||
LANGFUSE_SECRET_KEY: ${LANGFUSE_SECRET_KEY}
|
LANGFUSE_SECRET_KEY: ${LANGFUSE_SECRET_KEY}
|
||||||
OLLAMA_URL: ${OLLAMA_URL}
|
OLLAMA_URL: ${OLLAMA_URL}
|
||||||
OLLAMA_MODEL_NAME: ${OLLAMA_MODEL_NAME}
|
OLLAMA_MODEL_NAME: ${OLLAMA_MODEL_NAME}
|
||||||
|
OLLAMA_MODEL_NAME_CONVERSATIONAL: ${OLLAMA_MODEL_NAME_CONVERSATIONAL}
|
||||||
OLLAMA_EMB_MODEL_NAME: ${OLLAMA_EMB_MODEL_NAME}
|
OLLAMA_EMB_MODEL_NAME: ${OLLAMA_EMB_MODEL_NAME}
|
||||||
|
CLASSIFIER_EXPORT_THRESHOLD: ${CLASSIFIER_EXPORT_THRESHOLD}
|
||||||
|
CLASSIFIER_EXPORT_DIR: ${CLASSIFIER_EXPORT_DIR}
|
||||||
|
CLASSIFIER_MODEL_PATH: ${CLASSIFIER_MODEL_PATH}
|
||||||
|
CLASSIFIER_CONFIDENCE_THRESHOLD: ${CLASSIFIER_CONFIDENCE_THRESHOLD}
|
||||||
PROXY_THREAD_WORKERS: 10
|
PROXY_THREAD_WORKERS: 10
|
||||||
|
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,13 @@ message AgentRequest {
|
||||||
string extra_context = 5;
|
string extra_context = 5;
|
||||||
|
|
||||||
string user_info = 6;
|
string user_info = 6;
|
||||||
|
|
||||||
|
// ── Caller-declared type (v3 — ADR-0008 Phase 3) ─────────────────────────
|
||||||
|
// Optional. When set by the caller, all three classifier layers are bypassed
|
||||||
|
// and the engine routes directly using this value.
|
||||||
|
// Valid values: RETRIEVAL | CODE_GENERATION | CONVERSATIONAL | PLATFORM
|
||||||
|
// Empty string (default) means: run the normal L1 → L2 → L3 pipeline.
|
||||||
|
string query_type = 7;
|
||||||
}
|
}
|
||||||
|
|
||||||
message AgentResponse {
|
message AgentResponse {
|
||||||
|
|
|
||||||
|
|
@ -95,6 +95,9 @@ joblib==1.5.3
|
||||||
# via
|
# via
|
||||||
# model2vec
|
# model2vec
|
||||||
# nltk
|
# nltk
|
||||||
|
# scikit-learn
|
||||||
|
scikit-learn>=1.4.0
|
||||||
|
# via assistance-engine (ADR-0008 Layer 2 classifier)
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==3.0.0
|
jsonpointer==3.0.0
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,22 @@
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import re as _re
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_core.messages import AIMessage, SystemMessage, HumanMessage, BaseMessage
|
from langchain_core.messages import AIMessage, SystemMessage, HumanMessage, BaseMessage
|
||||||
from langgraph.graph import END, StateGraph
|
from langgraph.graph import END, StateGraph
|
||||||
from langgraph.graph.state import CompiledStateGraph
|
from langgraph.graph.state import CompiledStateGraph
|
||||||
|
|
||||||
|
try:
|
||||||
|
import joblib
|
||||||
|
import numpy as _np
|
||||||
|
_LAYER2_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
_LAYER2_AVAILABLE = False
|
||||||
|
|
||||||
from prompts import (
|
from prompts import (
|
||||||
CLASSIFY_PROMPT_TEMPLATE,
|
CLASSIFY_PROMPT_TEMPLATE,
|
||||||
CODE_GENERATION_PROMPT,
|
CODE_GENERATION_PROMPT,
|
||||||
|
|
@ -221,10 +232,118 @@ def _format_intent_history(classify_history: list) -> str:
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
_RC01_PREFIXES = (
|
||||||
|
# System-injected prefixes — always PLATFORM (RC-01)
|
||||||
|
"you are a direct and concise assistant",
|
||||||
|
)
|
||||||
|
|
||||||
|
_RC02_KEYWORDS = (
|
||||||
|
# Account / subscription vocabulary (RC-02)
|
||||||
|
"usage percentage",
|
||||||
|
"project usage",
|
||||||
|
"account usage",
|
||||||
|
"your quota",
|
||||||
|
"your subscription",
|
||||||
|
"your plan",
|
||||||
|
"your account",
|
||||||
|
"your limits",
|
||||||
|
"your consumption",
|
||||||
|
"api calls",
|
||||||
|
"llamadas este mes",
|
||||||
|
"uso de la cuenta",
|
||||||
|
"plan contratado",
|
||||||
|
"límite de",
|
||||||
|
"limite de",
|
||||||
|
)
|
||||||
|
|
||||||
|
# RC-02 structural signals: "N%" in context of usage/quota/consumption
|
||||||
|
_RC02_PERCENTAGE_RE = _re.compile(
|
||||||
|
r'\b\d+(\.\d+)?\s*%', # any percentage figure
|
||||||
|
)
|
||||||
|
_RC02_USAGE_CONTEXT_WORDS = (
|
||||||
|
"usage", "quota", "consumption", "limit", "billing",
|
||||||
|
"uso", "cuota", "consumo", "límite", "facturación",
|
||||||
|
"project", "account", "plan", "subscription",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _is_platform_query(question: str) -> bool:
|
def _is_platform_query(question: str) -> bool:
|
||||||
"""Fast-path: skip LLM classifier for known platform prompt prefixes."""
|
"""Deterministic fast-path for PLATFORM classification.
|
||||||
|
|
||||||
|
RC-01: known system-injected prefixes → always PLATFORM, no LLM call.
|
||||||
|
RC-02: messages containing account/metrics/usage signals → always PLATFORM,
|
||||||
|
regardless of conversation history or LLM classifier output.
|
||||||
|
|
||||||
|
Both rules are hard-coded here, not in the prompt, so they cannot be
|
||||||
|
overridden by model behavior.
|
||||||
|
"""
|
||||||
q = question.strip().lower()
|
q = question.strip().lower()
|
||||||
return "you are a direct and concise assistant" in q
|
|
||||||
|
# RC-01 — known platform-injected prefixes
|
||||||
|
if any(prefix in q for prefix in _RC01_PREFIXES):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# RC-02 — vocabulary signals
|
||||||
|
if any(kw in q for kw in _RC02_KEYWORDS):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# RC-02 — structural signal: percentage figure + usage context word
|
||||||
|
if _RC02_PERCENTAGE_RE.search(q):
|
||||||
|
if any(ctx in q for ctx in _RC02_USAGE_CONTEXT_WORDS):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ── Layer 2 — embedding classifier (ADR-0008 Phase 2) ─────────────────────────
|
||||||
|
|
||||||
|
_LAYER2_MODEL_PATH = os.getenv("CLASSIFIER_MODEL_PATH", "/data/classifier_model.pkl")
|
||||||
|
_LAYER2_CONFIDENCE_THRESHOLD = float(os.getenv("CLASSIFIER_CONFIDENCE_THRESHOLD", "0.85"))
|
||||||
|
_layer2_model: dict | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_layer2_model() -> None:
|
||||||
|
"""Load the serialized Layer 2 classifier from disk (called once at startup).
|
||||||
|
|
||||||
|
Safe to call even if the model file does not exist yet — the engine falls
|
||||||
|
back to the LLM classifier (Layer 3) for every request.
|
||||||
|
"""
|
||||||
|
global _layer2_model
|
||||||
|
if not _LAYER2_AVAILABLE:
|
||||||
|
logger.info("[classifier/L2] joblib/numpy not available — Layer 2 disabled")
|
||||||
|
return
|
||||||
|
p = Path(_LAYER2_MODEL_PATH)
|
||||||
|
if p.exists():
|
||||||
|
_layer2_model = joblib.load(p)
|
||||||
|
meta = {k: v for k, v in _layer2_model.items() if k != "clf"}
|
||||||
|
logger.info(f"[classifier/L2] model loaded from {p} — {meta}")
|
||||||
|
else:
|
||||||
|
logger.info(f"[classifier/L2] model not found at {p} — using LLM fallback only")
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_layer2(query: str, embeddings) -> str | None:
|
||||||
|
"""Embed query and classify via Layer 2 model.
|
||||||
|
|
||||||
|
Returns the predicted type if confidence >= threshold, else None (falls
|
||||||
|
through to the LLM classifier, Layer 3).
|
||||||
|
|
||||||
|
Never raises — any error returns None to preserve the Layer 3 fallback.
|
||||||
|
"""
|
||||||
|
if _layer2_model is None or not _LAYER2_AVAILABLE:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
vec = _np.array(embeddings.embed_query(query)).reshape(1, -1)
|
||||||
|
proba = _layer2_model["clf"].predict_proba(vec)[0]
|
||||||
|
conf = float(proba.max())
|
||||||
|
if conf >= _LAYER2_CONFIDENCE_THRESHOLD:
|
||||||
|
label = _layer2_model["classes"][int(proba.argmax())]
|
||||||
|
logger.info(f"[classifier/L2] → {label} (conf={conf:.2f})")
|
||||||
|
return label
|
||||||
|
logger.info(f"[classifier/L2] low confidence ({conf:.2f}) → LLM fallback")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"[classifier/L2] error: {e} — falling through to LLM")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _parse_query_type(raw: str) -> tuple[str, bool]:
|
def _parse_query_type(raw: str) -> tuple[str, bool]:
|
||||||
|
|
@ -263,8 +382,20 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
|
||||||
|
|
||||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
topic_snippet = question.strip()[:60].replace("\n", " ")
|
||||||
|
|
||||||
|
# Phase 3 — caller-declared type: bypass all layers (ADR-0008 RC)
|
||||||
|
declared = state.get("query_type", "")
|
||||||
|
if declared in ("RETRIEVAL", "CODE_GENERATION", "CONVERSATIONAL", "PLATFORM"):
|
||||||
|
logger.info(f"[classify] caller-declared → {declared}")
|
||||||
|
entry: ClassifyEntry = {"type": declared, "topic": topic_snippet}
|
||||||
|
return {
|
||||||
|
"query_type": declared,
|
||||||
|
"use_editor_context": False,
|
||||||
|
"classify_history": classify_history + [entry],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Layer 1 — hard rules (RC-01, RC-02)
|
||||||
if _is_platform_query(question):
|
if _is_platform_query(question):
|
||||||
logger.info(f"[classify] platform prefix detected -> PLATFORM")
|
logger.info("[classify] L1 → PLATFORM")
|
||||||
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
||||||
return {
|
return {
|
||||||
"query_type": "PLATFORM",
|
"query_type": "PLATFORM",
|
||||||
|
|
@ -272,13 +403,24 @@ def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None)
|
||||||
"classify_history": classify_history + [entry],
|
"classify_history": classify_history + [entry],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Layer 2 — embedding classifier (ADR-0008 Phase 2)
|
||||||
|
l2_type = _classify_layer2(question, embeddings)
|
||||||
|
if l2_type:
|
||||||
|
entry: ClassifyEntry = {"type": l2_type, "topic": topic_snippet}
|
||||||
|
return {
|
||||||
|
"query_type": l2_type,
|
||||||
|
"use_editor_context": False,
|
||||||
|
"classify_history": classify_history + [entry],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Layer 3 — LLM classifier (bootstrap fallback)
|
||||||
intent_history_text = _format_intent_history(classify_history)
|
intent_history_text = _format_intent_history(classify_history)
|
||||||
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
||||||
|
|
||||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
raw = resp.content.strip().upper()
|
raw = resp.content.strip().upper()
|
||||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
query_type, use_editor_ctx = _parse_query_type(raw)
|
||||||
logger.info(f"[classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
logger.info(f"[classify] L3 selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
||||||
|
|
||||||
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
||||||
return {
|
return {
|
||||||
|
|
@ -438,10 +580,23 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
||||||
history_msgs = messages[:-1]
|
history_msgs = messages[:-1]
|
||||||
selected_text = state.get("selected_text", "")
|
selected_text = state.get("selected_text", "")
|
||||||
|
|
||||||
if _is_platform_query(question):
|
|
||||||
logger.info(f"[prepare/classify] platform prefix detected -> PLATFORM")
|
|
||||||
classify_history = state.get("classify_history") or []
|
classify_history = state.get("classify_history") or []
|
||||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
topic_snippet = question.strip()[:60].replace("\n", " ")
|
||||||
|
|
||||||
|
# Phase 3 — caller-declared type: bypass all layers (ADR-0008 RC)
|
||||||
|
declared = state.get("query_type", "")
|
||||||
|
if declared in ("RETRIEVAL", "CODE_GENERATION", "CONVERSATIONAL", "PLATFORM"):
|
||||||
|
logger.info(f"[prepare/classify] caller-declared → {declared}")
|
||||||
|
entry: ClassifyEntry = {"type": declared, "topic": topic_snippet}
|
||||||
|
return {
|
||||||
|
"query_type": declared,
|
||||||
|
"use_editor_context": False,
|
||||||
|
"classify_history": classify_history + [entry],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Layer 1 — hard rules (RC-01, RC-02)
|
||||||
|
if _is_platform_query(question):
|
||||||
|
logger.info("[prepare/classify] L1 → PLATFORM")
|
||||||
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
||||||
return {
|
return {
|
||||||
"query_type": "PLATFORM",
|
"query_type": "PLATFORM",
|
||||||
|
|
@ -449,15 +604,24 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
||||||
"classify_history": classify_history + [entry],
|
"classify_history": classify_history + [entry],
|
||||||
}
|
}
|
||||||
|
|
||||||
classify_history = state.get("classify_history") or []
|
# Layer 2 — embedding classifier (ADR-0008 Phase 2)
|
||||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
l2_type = _classify_layer2(question, embeddings)
|
||||||
|
if l2_type:
|
||||||
|
entry: ClassifyEntry = {"type": l2_type, "topic": topic_snippet}
|
||||||
|
return {
|
||||||
|
"query_type": l2_type,
|
||||||
|
"use_editor_context": False,
|
||||||
|
"classify_history": classify_history + [entry],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Layer 3 — LLM classifier (bootstrap fallback)
|
||||||
intent_history_text = _format_intent_history(classify_history)
|
intent_history_text = _format_intent_history(classify_history)
|
||||||
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
||||||
|
|
||||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||||
raw = resp.content.strip().upper()
|
raw = resp.content.strip().upper()
|
||||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
query_type, use_editor_ctx = _parse_query_type(raw)
|
||||||
logger.info(f"[prepare/classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
logger.info(f"[prepare/classify] L3 selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
||||||
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
||||||
return {
|
return {
|
||||||
"query_type": query_type,
|
"query_type": query_type,
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,8 @@ from langchain_core.messages import AIMessage
|
||||||
|
|
||||||
from utils.llm_factory import create_chat_model
|
from utils.llm_factory import create_chat_model
|
||||||
from utils.emb_factory import create_embedding_model
|
from utils.emb_factory import create_embedding_model
|
||||||
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store
|
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store, _load_layer2_model
|
||||||
|
from utils.classifier_export import maybe_export, force_export
|
||||||
|
|
||||||
from evaluate import run_evaluation
|
from evaluate import run_evaluation
|
||||||
|
|
||||||
|
|
@ -87,6 +88,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
index_name = self.index_name,
|
index_name = self.index_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_load_layer2_model()
|
||||||
logger.info("Brunix Engine initialized.")
|
logger.info("Brunix Engine initialized.")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -114,10 +116,12 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
logger.warning("[AskAgent] extra_context base64 decode failed")
|
logger.warning("[AskAgent] extra_context base64 decode failed")
|
||||||
|
|
||||||
user_info = request.user_info or "{}"
|
user_info = request.user_info or "{}"
|
||||||
|
query_type = request.query_type or ""
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[AskAgent] session={session_id} "
|
f"[AskAgent] session={session_id} "
|
||||||
f"editor={bool(editor_content)} selected={bool(selected_text)} "
|
f"editor={bool(editor_content)} selected={bool(selected_text)} "
|
||||||
|
f"declared_type={query_type or 'none'} "
|
||||||
f"query='{query[:80]}'"
|
f"query='{query[:80]}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -131,7 +135,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
"reformulated_query": "",
|
"reformulated_query": "",
|
||||||
"context": "",
|
"context": "",
|
||||||
"query_type": "",
|
"query_type": query_type,
|
||||||
"classify_history": classify_history,
|
"classify_history": classify_history,
|
||||||
|
|
||||||
"editor_content": editor_content,
|
"editor_content": editor_content,
|
||||||
|
|
@ -148,6 +152,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
|
|
||||||
if session_id:
|
if session_id:
|
||||||
classify_history_store[session_id] = final_state.get("classify_history", classify_history)
|
classify_history_store[session_id] = final_state.get("classify_history", classify_history)
|
||||||
|
maybe_export(classify_history_store)
|
||||||
|
|
||||||
logger.info(f"[AskAgent] query_type={final_state.get('query_type')} "
|
logger.info(f"[AskAgent] query_type={final_state.get('query_type')} "
|
||||||
f"answer='{result_text[:100]}'")
|
f"answer='{result_text[:100]}'")
|
||||||
|
|
@ -190,10 +195,12 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
logger.warning("[AskAgent] extra_context base64 decode failed")
|
logger.warning("[AskAgent] extra_context base64 decode failed")
|
||||||
|
|
||||||
user_info = request.user_info or "{}"
|
user_info = request.user_info or "{}"
|
||||||
|
query_type = request.query_type or ""
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[AskAgentStream] session={session_id} "
|
f"[AskAgentStream] session={session_id} "
|
||||||
f"editor={bool(editor_content)} selected={bool(selected_text)} context={extra_context} "
|
f"editor={bool(editor_content)} selected={bool(selected_text)} "
|
||||||
|
f"declared_type={query_type or 'none'} "
|
||||||
f"query='{query[:80]}'"
|
f"query='{query[:80]}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -207,7 +214,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
"reformulated_query": "",
|
"reformulated_query": "",
|
||||||
"context": "",
|
"context": "",
|
||||||
"query_type": "",
|
"query_type": query_type,
|
||||||
"classify_history": classify_history,
|
"classify_history": classify_history,
|
||||||
|
|
||||||
"editor_content": editor_content,
|
"editor_content": editor_content,
|
||||||
|
|
@ -243,6 +250,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
list(prepared["messages"]) + [AIMessage(content=complete_text)]
|
list(prepared["messages"]) + [AIMessage(content=complete_text)]
|
||||||
)
|
)
|
||||||
classify_history_store[session_id] = prepared.get("classify_history", classify_history)
|
classify_history_store[session_id] = prepared.get("classify_history", classify_history)
|
||||||
|
maybe_export(classify_history_store)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[AskAgentStream] done — "
|
f"[AskAgentStream] done — "
|
||||||
|
|
@ -323,7 +331,11 @@ def serve():
|
||||||
server.add_insecure_port("[::]:50051")
|
server.add_insecure_port("[::]:50051")
|
||||||
logger.info("[ENGINE] listen on 50051 (gRPC)")
|
logger.info("[ENGINE] listen on 50051 (gRPC)")
|
||||||
server.start()
|
server.start()
|
||||||
|
try:
|
||||||
server.wait_for_termination()
|
server.wait_for_termination()
|
||||||
|
finally:
|
||||||
|
force_export(classify_history_store)
|
||||||
|
logger.info("[ENGINE] classifier labels flushed on shutdown")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,138 @@
|
||||||
|
"""
|
||||||
|
classifier_export.py — ADR-0008 Phase 1 + ADR-0010 automatic retraining trigger
|
||||||
|
|
||||||
|
Exports classify_history_store to JSONL when the session count reaches
|
||||||
|
EXPORT_THRESHOLD. Each exported record is a labeled (query, type) pair.
|
||||||
|
|
||||||
|
After a successful export, triggers the Champion/Challenger retraining pipeline
|
||||||
|
(ADR-0010) in a background thread if RETRAIN_ON_EXPORT=true.
|
||||||
|
|
||||||
|
Export format (one JSON object per line):
|
||||||
|
{"query": "60-char topic snippet", "type": "RETRIEVAL", "session_id": "abc"}
|
||||||
|
|
||||||
|
The store is flushed after export to prevent unbounded memory growth.
|
||||||
|
Files are written to CLASSIFIER_EXPORT_DIR (default: /data/classifier_labels/).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
EXPORT_THRESHOLD = int(os.getenv("CLASSIFIER_EXPORT_THRESHOLD", "100"))
|
||||||
|
EXPORT_DIR = Path(os.getenv("CLASSIFIER_EXPORT_DIR", "/data/classifier_labels"))
|
||||||
|
RETRAIN_ON_EXPORT = os.getenv("RETRAIN_ON_EXPORT", "false").lower() == "true"
|
||||||
|
RETRAIN_SCRIPT = Path(os.getenv("RETRAIN_SCRIPT_PATH",
|
||||||
|
"/app/scripts/pipelines/classifier/retrain_pipeline.py"))
|
||||||
|
|
||||||
|
_export_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_export(classify_history_store: dict) -> bool:
|
||||||
|
"""Check session count and export if threshold is reached.
|
||||||
|
|
||||||
|
Called after every request. Thread-safe. Returns True if export occurred.
|
||||||
|
Flushes the store after a successful export.
|
||||||
|
"""
|
||||||
|
if len(classify_history_store) < EXPORT_THRESHOLD:
|
||||||
|
return False
|
||||||
|
|
||||||
|
with _export_lock:
|
||||||
|
# Re-check inside lock — another thread may have already exported
|
||||||
|
if len(classify_history_store) < EXPORT_THRESHOLD:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return _do_export(classify_history_store)
|
||||||
|
|
||||||
|
|
||||||
|
def _do_export(classify_history_store: dict) -> bool:
|
||||||
|
try:
|
||||||
|
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||||
|
path = EXPORT_DIR / f"classifier_labels_{ts}.jsonl"
|
||||||
|
|
||||||
|
records = []
|
||||||
|
for session_id, entries in classify_history_store.items():
|
||||||
|
for entry in entries:
|
||||||
|
records.append({
|
||||||
|
"query": entry.get("topic", ""),
|
||||||
|
"type": entry.get("type", ""),
|
||||||
|
"session_id": session_id,
|
||||||
|
})
|
||||||
|
|
||||||
|
with open(path, "w", encoding="utf-8") as f:
|
||||||
|
for record in records:
|
||||||
|
f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
n_sessions = len(classify_history_store)
|
||||||
|
n_records = len(records)
|
||||||
|
|
||||||
|
classify_history_store.clear()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"[classifier_export] exported {n_records} records "
|
||||||
|
f"from {n_sessions} sessions → {path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if RETRAIN_ON_EXPORT:
|
||||||
|
_trigger_retrain()
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[classifier_export] export failed: {e}", exc_info=True)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _trigger_retrain() -> None:
|
||||||
|
"""Launch retrain_pipeline.py in a background thread.
|
||||||
|
|
||||||
|
Runs as a subprocess so it does not block the gRPC server. The engine
|
||||||
|
continues serving requests while retraining happens. The new model is
|
||||||
|
loaded on the next engine restart — live reload is not performed to avoid
|
||||||
|
serving inconsistent results mid-session.
|
||||||
|
"""
|
||||||
|
def _run():
|
||||||
|
if not RETRAIN_SCRIPT.exists():
|
||||||
|
logger.warning(
|
||||||
|
f"[classifier_export] retrain script not found at {RETRAIN_SCRIPT} "
|
||||||
|
"— set RETRAIN_SCRIPT_PATH or disable RETRAIN_ON_EXPORT"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
logger.info(f"[classifier_export] launching retraining pipeline → {RETRAIN_SCRIPT}")
|
||||||
|
result = subprocess.run(
|
||||||
|
[sys.executable, str(RETRAIN_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=600, # 10 min hard limit
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
logger.info("[classifier_export] retraining completed — restart engine to load new model")
|
||||||
|
else:
|
||||||
|
logger.error(
|
||||||
|
f"[classifier_export] retraining failed (exit {result.returncode})\n"
|
||||||
|
f"{result.stderr[-500:]}"
|
||||||
|
)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
logger.error("[classifier_export] retraining timed out after 600s")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[classifier_export] retraining error: {e}", exc_info=True)
|
||||||
|
|
||||||
|
threading.Thread(target=_run, daemon=True, name="retrain-pipeline").start()
|
||||||
|
|
||||||
|
|
||||||
|
def force_export(classify_history_store: dict) -> bool:
|
||||||
|
"""Export immediately regardless of threshold. Useful for shutdown hooks."""
|
||||||
|
if not classify_history_store:
|
||||||
|
logger.info("[classifier_export] nothing to export")
|
||||||
|
return False
|
||||||
|
with _export_lock:
|
||||||
|
return _do_export(classify_history_store)
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
# ADR-0008: Adaptive Query Routing — Taxonomy, Contract, and Classifier Strategy
|
# ADR-0008: Adaptive Query Routing — Taxonomy, Contract, and Classifier Strategy
|
||||||
|
|
||||||
**Date:** 2026-04-09
|
**Date:** 2026-04-09
|
||||||
**Status:** Accepted
|
**Last updated:** 2026-04-10
|
||||||
|
**Status:** Implemented
|
||||||
**Deciders:** Rafael Ruiz (CTO)
|
**Deciders:** Rafael Ruiz (CTO)
|
||||||
**Related ADRs:** ADR-0002 (Two-Phase Streaming), ADR-0003 (Hybrid Retrieval RRF)
|
**Related ADRs:** ADR-0002 (Two-Phase Streaming), ADR-0003 (Hybrid Retrieval RRF)
|
||||||
|
|
||||||
|
|
@ -103,7 +104,11 @@ Adding a prefix requires updating `_PLATFORM_PATTERNS` and this list.
|
||||||
|
|
||||||
If the query contains usage percentages, account metrics, consumption figures, quota data, or billing information, the output **MUST** be `PLATFORM` regardless of history or classifier confidence.
|
If the query contains usage percentages, account metrics, consumption figures, quota data, or billing information, the output **MUST** be `PLATFORM` regardless of history or classifier confidence.
|
||||||
|
|
||||||
In the current bootstrap implementation this is enforced via `<platform_priority_rule>`. In the future discriminative classifier it should be a hard pre-filter in Layer 1.
|
**Implementation status: hard-coded in `_is_platform_query()` (2026-04-10).** Two detection mechanisms:
|
||||||
|
- Vocabulary signals: keywords like `"usage percentage"`, `"your quota"`, `"your account"`, `"api calls"`, etc. (see `_RC02_KEYWORDS` in `graph.py`)
|
||||||
|
- Structural signal: regex `\b\d+(\.\d+)?\s*%` (any percentage figure) combined with a usage context word (`usage`, `quota`, `consumption`, `limit`, `billing`, etc.)
|
||||||
|
|
||||||
|
This rule is no longer enforced via `<platform_priority_rule>` in the prompt — it is deterministic code and cannot be overridden by model behavior.
|
||||||
|
|
||||||
### RC-03 — Intent history scoping (priority: medium)
|
### RC-03 — Intent history scoping (priority: medium)
|
||||||
|
|
||||||
|
|
@ -224,14 +229,16 @@ The LLM classifier is the **teacher**. The embedding classifier is the **student
|
||||||
|
|
||||||
The platform generates `PLATFORM` prompts and knows the type at generation time. Adding `query_type` to `AgentRequest` (proto field 7) lets the caller declare the type explicitly, bypassing all three layers. This makes RC-01 and RC-02 redundant for platform-generated traffic.
|
The platform generates `PLATFORM` prompts and knows the type at generation time. Adding `query_type` to `AgentRequest` (proto field 7) lets the caller declare the type explicitly, bypassing all three layers. This makes RC-01 and RC-02 redundant for platform-generated traffic.
|
||||||
|
|
||||||
|
**Implementation status: complete (2026-04-10).** `query_type` is proto field 7. When set, the classify node in both `build_graph` and `build_prepare_graph` returns the declared type immediately without invoking any layer.
|
||||||
|
|
||||||
### Convergence path
|
### Convergence path
|
||||||
|
|
||||||
| Phase | What changes | Layer 3 traffic |
|
| Phase | What changes | Layer 3 traffic | Status |
|
||||||
|---|---|---|
|
|---|---|---|---|
|
||||||
| Now — bootstrap | LLM classifier for all unmatched queries | ~95% |
|
| Bootstrap | LLM classifier for all unmatched queries | ~95% | ✅ Implemented |
|
||||||
| Phase 1 | Collect labels via `classify_history_store` | ~95% |
|
| Phase 1 | Collect labels via `classify_history_store` + export JSONL | ~95% | ✅ Complete — `classifier_export.py` |
|
||||||
| Phase 2 | Deploy embedding classifier (Layer 2) | ~10–20% |
|
| Phase 2 | Deploy embedding classifier (Layer 2) | ~10–20% | ✅ Complete — bge-m3 + LogisticRegression, seed dataset, `_classify_layer2()` in `graph.py` |
|
||||||
| Phase 3 | Caller-declared type for platform prompts | <5% |
|
| Phase 3 | Caller-declared type for platform prompts | <5% | ✅ Complete — proto field 7, bypasses all layers |
|
||||||
| Phase 4 | LLM classifier as anomaly handler only | <2% |
|
| Phase 4 | LLM classifier as anomaly handler only | <2% | ⏳ Production outcome — no code change required |
|
||||||
|
|
||||||
Phase 2 is the highest-leverage step: it replaces the dominant code path (LLM inference per request) with CPU-only inference, with no change to the routing contract or the downstream graph.
|
Phase 2 is the highest-leverage step: it replaces the dominant code path (LLM inference per request) with CPU-only inference, with no change to the routing contract or the downstream graph. Phase 4 is reached when the platform adopts Phase 3 and Layer 2 is retrained with production data.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,335 @@
|
||||||
|
# ADR-0009: Per-Type Response Validation Layer
|
||||||
|
|
||||||
|
**Date:** 2026-04-10
|
||||||
|
**Status:** Accepted
|
||||||
|
**Deciders:** Rafael Ruiz (CTO)
|
||||||
|
**Related ADRs:** ADR-0007 (MSVL for RAG Evaluation), ADR-0008 (Adaptive Query Routing), ADR-0003 (Hybrid Retrieval RRF)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### Problem 1 — Syntactically invalid code reaches users
|
||||||
|
|
||||||
|
ADR-0007 documents that 10–16% of `CODE_GENERATION` responses contain syntactically invalid AVAP code — foreign language injection (Go, Python, JavaScript) or hallucinated commands (`getSHA256`, `readParam`, `returnResult`). The LLM judge used in `EvaluateRAG` does not detect these failures because it evaluates semantic coherence, not syntactic validity.
|
||||||
|
|
||||||
|
This problem exists in production today. A user receiving a `CODE_GENERATION` response has no indication that the generated code would fail on the PLATON kernel.
|
||||||
|
|
||||||
|
The AVAP Parser gRPC service — established in ADR-0007 as a hard dependency of the evaluation pipeline — is already available in the stack. It returns not just `VALID / INVALID` but a **complete line-by-line execution trace** on failure:
|
||||||
|
|
||||||
|
```
|
||||||
|
Line 3: unknown command 'getSHA256' — expected known identifier
|
||||||
|
Line 7: unexpected construct 'for i in range(...)' — AVAP loop syntax required
|
||||||
|
Line 12: 'returnResult' not defined — did you mean 'addResult'?
|
||||||
|
```
|
||||||
|
|
||||||
|
This trace is structured, specific, and directly actionable by the LLM. A retry informed by the parser trace is fundamentally different from a blind retry — the model knows exactly what failed and where.
|
||||||
|
|
||||||
|
### Problem 2 — Context relevance is not evaluated pre-generation
|
||||||
|
|
||||||
|
The engine retrieves 8 chunks from Elasticsearch for every `RETRIEVAL` and `CODE_GENERATION` query without checking whether those chunks actually answer the question. `CONFIDENCE_PROMPT_TEMPLATE` has been scaffolded in `prompts.py` since the initial implementation but is not wired into the graph.
|
||||||
|
|
||||||
|
Undetected low-relevance retrieval produces responses that are semantically fluent but factually ungrounded — the model generates plausible-sounding AVAP explanations or code not supported by the retrieved documentation.
|
||||||
|
|
||||||
|
### Why these are one ADR
|
||||||
|
|
||||||
|
Both problems share the same architectural response: adding validation nodes to the production graph with type-specific logic and feedback-informed retry. The decision is one — **add a per-type validation layer** — and the implementation shares the same graph positions (post-retrieve, post-generate), the same retry contract (maximum 1 retry per request), and the same rationale (the engine must not silently return responses it has evidence to question).
|
||||||
|
|
||||||
|
Splitting into separate ADRs would produce two documents that cannot be understood independently.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Add a **Per-Type Response Validation Layer (PTVL)** to the production LangGraph pipeline. Each query type has a distinct validation strategy matching its failure modes.
|
||||||
|
|
||||||
|
### Validation contract by type
|
||||||
|
|
||||||
|
| Type | When | What | Mechanism |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `CODE_GENERATION` | Post-generation | Syntactic validity of generated AVAP code | AVAP Parser gRPC — deterministic |
|
||||||
|
| `RETRIEVAL` | Pre-generation | Relevance of retrieved context to the query | LLM relevance check — `CONFIDENCE_PROMPT_TEMPLATE` |
|
||||||
|
| `CONVERSATIONAL` | None | — | No retrieval, no code generated |
|
||||||
|
| `PLATFORM` | None | — | No retrieval, no code generated |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Decision 1 — CODE_GENERATION: parser validation with trace-guided retry
|
||||||
|
|
||||||
|
#### Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
generate_code node
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[V1] AVAP Parser gRPC
|
||||||
|
│
|
||||||
|
├── VALID ──────────────────────────────► return response
|
||||||
|
│
|
||||||
|
└── INVALID + line-by-line trace
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[inject trace into retry prompt]
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
generate_code_retry node (1 attempt only)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[V2] AVAP Parser gRPC
|
||||||
|
│
|
||||||
|
├── VALID ──────────────────────► return response
|
||||||
|
│
|
||||||
|
└── INVALID ────────────────────► return response + validation_status flag
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Trace-guided retry
|
||||||
|
|
||||||
|
The parser trace is injected into the generation prompt as a structured correction context:
|
||||||
|
|
||||||
|
```
|
||||||
|
<parser_feedback>
|
||||||
|
The previous attempt produced invalid AVAP code. Specific failures:
|
||||||
|
|
||||||
|
Line 3: unknown command 'getSHA256' — expected known identifier
|
||||||
|
Line 7: unexpected construct 'for i in range(...)' — AVAP loop syntax required
|
||||||
|
|
||||||
|
Correct these errors. Do not repeat the same constructs.
|
||||||
|
</parser_feedback>
|
||||||
|
```
|
||||||
|
|
||||||
|
This is not a blind retry. The LLM receives the exact failure points and can target its corrections. ADR-0007 documented the mapping between common hallucinated commands and their valid AVAP equivalents (`getSHA256` → `encodeSHA256`, `returnResult` → `addResult`, etc.) — the trace makes these corrections automatic without hardcoding the mapping.
|
||||||
|
|
||||||
|
#### Parser SLA
|
||||||
|
|
||||||
|
Inherited from ADR-0007: ≤2 seconds per call. **Silent fallback is permitted in production** (unlike evaluation, where ADR-0007 mandates abort). The distinction is that evaluation scores must be trustworthy; production responses degrade gracefully.
|
||||||
|
|
||||||
|
#### Parser availability — circuit breaker
|
||||||
|
|
||||||
|
A single timeout or connection error does not mean the parser is down. A sustained outage does. Hammering an unavailable gRPC service on every `CODE_GENERATION` request adds latency to every user request with zero benefit.
|
||||||
|
|
||||||
|
The engine implements a **circuit breaker** with three states:
|
||||||
|
|
||||||
|
```
|
||||||
|
CLOSED ──[N consecutive failures]──► OPEN
|
||||||
|
OPEN ──[cooldown expires]────────► HALF-OPEN
|
||||||
|
HALF-OPEN ──[probe succeeds]────────► CLOSED
|
||||||
|
HALF-OPEN ──[probe fails]───────────► OPEN
|
||||||
|
```
|
||||||
|
|
||||||
|
| Parameter | Default | Env var |
|
||||||
|
|---|---|---|
|
||||||
|
| Failure threshold to open | 3 consecutive failures | `PARSER_CB_THRESHOLD` |
|
||||||
|
| Cooldown before half-open | 30 seconds | `PARSER_CB_COOLDOWN` |
|
||||||
|
| Timeout per call | 2 seconds | `AVAP_PARSER_TIMEOUT` |
|
||||||
|
|
||||||
|
While the circuit is **OPEN**, `CODE_GENERATION` responses are returned immediately with `validation_status = PARSER_UNAVAILABLE` — no gRPC call is attempted. The cooldown prevents thundering-herd reconnection attempts.
|
||||||
|
|
||||||
|
```
|
||||||
|
[ptvl] circuit OPEN — skipping parser, returning unvalidated
|
||||||
|
[ptvl] circuit HALF-OPEN — probing parser availability
|
||||||
|
[ptvl] circuit CLOSED — parser reachable
|
||||||
|
```
|
||||||
|
|
||||||
|
Setting `AVAP_PARSER_TIMEOUT=0` permanently opens the circuit — disables parser validation entirely. Useful during development when the parser service is not deployed.
|
||||||
|
|
||||||
|
#### New environment variables
|
||||||
|
|
||||||
|
```
|
||||||
|
AVAP_PARSER_URL=grpc://... # URL of AVAP Parser gRPC service
|
||||||
|
AVAP_PARSER_TIMEOUT=2 # seconds per call; 0 = disable validation
|
||||||
|
PARSER_CB_THRESHOLD=3 # consecutive failures before circuit opens
|
||||||
|
PARSER_CB_COOLDOWN=30 # seconds before circuit attempts half-open probe
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Decision 2 — RETRIEVAL: context relevance check with reformulation retry
|
||||||
|
|
||||||
|
#### Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
retrieve node
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
[V1] CONFIDENCE_PROMPT_TEMPLATE
|
||||||
|
YES / NO
|
||||||
|
│
|
||||||
|
├── YES ────────────────────────────────► generate node (normal path)
|
||||||
|
│
|
||||||
|
└── NO
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
reformulate_with_hint node
|
||||||
|
[reformulate query signalling context was insufficient]
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
retrieve (retry)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
generate node (regardless of second retrieval result)
|
||||||
|
```
|
||||||
|
|
||||||
|
On second retrieval, generation proceeds unconditionally. The retry is a single best-effort improvement, not a gate. The model generates with whatever context is available — the user receives a response, not a refusal.
|
||||||
|
|
||||||
|
#### Relevance check
|
||||||
|
|
||||||
|
`CONFIDENCE_PROMPT_TEMPLATE` evaluates whether the retrieved context contains at least one passage relevant to the question. It returns `YES` or `NO` — no graduated score.
|
||||||
|
|
||||||
|
The relevance check is only applied to `RETRIEVAL` queries here. `CODE_GENERATION` has the parser as a stronger post-generation signal; the pre-generation relevance check would add latency with lower signal value than the parser provides.
|
||||||
|
|
||||||
|
#### Reformulation hint
|
||||||
|
|
||||||
|
When context is insufficient, the reformulation node receives a hint that standard context was not found. This produces a semantically different reformulation — broader synonyms, alternative phrasings — rather than a near-duplicate of the original query.
|
||||||
|
|
||||||
|
```
|
||||||
|
[CONTEXT_INSUFFICIENT]
|
||||||
|
The previous retrieval did not return relevant context for: "{original_query}"
|
||||||
|
Reformulate this query using broader terms or alternative phrasing.
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Graph changes
|
||||||
|
|
||||||
|
### New nodes
|
||||||
|
|
||||||
|
| Node | Graph | Trigger |
|
||||||
|
|---|---|---|
|
||||||
|
| `validate_code` | `build_graph` | After `generate_code` |
|
||||||
|
| `generate_code_retry` | `build_graph` | After `validate_code` when INVALID |
|
||||||
|
| `check_context_relevance` | `build_graph` + `build_prepare_graph` | After `retrieve`, before `generate` (RETRIEVAL only) |
|
||||||
|
| `reformulate_with_hint` | `build_graph` + `build_prepare_graph` | After `check_context_relevance` when NO |
|
||||||
|
|
||||||
|
### Updated flow — `build_graph`
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
START([start]) --> CL[classify]
|
||||||
|
|
||||||
|
CL -->|RETRIEVAL| RF[reformulate]
|
||||||
|
CL -->|CODE_GENERATION| RF
|
||||||
|
CL -->|CONVERSATIONAL| RC[respond_conversational]
|
||||||
|
CL -->|PLATFORM| RP[respond_platform]
|
||||||
|
|
||||||
|
RF --> RT[retrieve]
|
||||||
|
|
||||||
|
RT -->|CODE_GENERATION| GC[generate_code]
|
||||||
|
RT -->|RETRIEVAL| CR{check_context\nrelevance}
|
||||||
|
|
||||||
|
CR -->|YES| GE[generate]
|
||||||
|
CR -->|NO| RH[reformulate_with_hint]
|
||||||
|
RH --> RT2[retrieve retry]
|
||||||
|
RT2 --> GE
|
||||||
|
|
||||||
|
GC --> VC{validate_code\nParser gRPC}
|
||||||
|
VC -->|VALID| END([end])
|
||||||
|
VC -->|INVALID + trace| GCR[generate_code_retry\ntrace-guided]
|
||||||
|
GCR --> VC2{validate_code\nParser gRPC}
|
||||||
|
VC2 -->|VALID| END
|
||||||
|
VC2 -->|INVALID| END
|
||||||
|
|
||||||
|
GE --> END
|
||||||
|
RC --> END
|
||||||
|
RP --> END
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## New AgentState fields
|
||||||
|
|
||||||
|
```python
|
||||||
|
class AgentState(TypedDict):
|
||||||
|
...
|
||||||
|
# PTVL fields
|
||||||
|
parser_trace: str # raw parser trace from first validation attempt (empty if valid)
|
||||||
|
validation_status: str # see validation status values below
|
||||||
|
context_relevant: bool # result of CONFIDENCE_PROMPT check (RETRIEVAL only)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validation status values
|
||||||
|
|
||||||
|
| Value | Meaning | When set |
|
||||||
|
|---|---|---|
|
||||||
|
| `""` (empty) | Valid — no issues detected | Parser returned VALID on first or second attempt |
|
||||||
|
| `INVALID_UNRESOLVED` | Parser ran, code failed both attempts | Two parser calls made, both returned INVALID |
|
||||||
|
| `PARSER_UNAVAILABLE` | Parser was unreachable or circuit is open | No parser call was made or all calls timed out |
|
||||||
|
|
||||||
|
These are semantically distinct signals. `INVALID_UNRESOLVED` means the engine has evidence the code is wrong. `PARSER_UNAVAILABLE` means the engine has no evidence either way — the code may be correct. Clients must not treat them equivalently.
|
||||||
|
|
||||||
|
`validation_status` is surfaced to the client via `AgentResponse`:
|
||||||
|
|
||||||
|
```protobuf
|
||||||
|
message AgentResponse {
|
||||||
|
string text = 1;
|
||||||
|
string avap_code = 2;
|
||||||
|
bool is_final = 3;
|
||||||
|
string validation_status = 4; // "" | "INVALID_UNRESOLVED" | "PARSER_UNAVAILABLE"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Clients that do not read `validation_status` are unaffected — the field defaults to empty string.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Routing contract additions (RC-07, RC-08)
|
||||||
|
|
||||||
|
These rules extend the contract defined in ADR-0008.
|
||||||
|
|
||||||
|
### RC-07 — Parser validation gate (priority: high)
|
||||||
|
|
||||||
|
Every `CODE_GENERATION` response **MUST** be submitted to the AVAP Parser before delivery to the client, unless `AVAP_PARSER_TIMEOUT=0` or the parser service is unreachable.
|
||||||
|
|
||||||
|
```
|
||||||
|
route(q) = CODE_GENERATION → parser_validate(response) before yield
|
||||||
|
```
|
||||||
|
|
||||||
|
A `CODE_GENERATION` response returned without parser validation due to parser unavailability **MUST** be logged as `[ptvl] parser unavailable — returning unvalidated`.
|
||||||
|
|
||||||
|
### RC-08 — Retry budget (priority: medium)
|
||||||
|
|
||||||
|
Each request has a maximum of **1 retry** regardless of type. A `CODE_GENERATION` request that fails parser validation twice returns the second attempt with `validation_status=INVALID_UNRESOLVED`. A `RETRIEVAL` request whose context is insufficient reformulates once and generates unconditionally on the second retrieval.
|
||||||
|
|
||||||
|
No request may enter more than one retry cycle.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- Syntactically invalid AVAP code no longer reaches users silently. `validation_status` gives the client a typed signal: `INVALID_UNRESOLVED` (evidence of bad code) vs `PARSER_UNAVAILABLE` (no evidence either way) — clients can respond differently to each.
|
||||||
|
- The parser trace makes retries targeted rather than blind — the LLM corrects specific lines, not the whole response.
|
||||||
|
- Circuit breaker prevents parser outages from adding latency to every `CODE_GENERATION` request. After 3 consecutive failures the engine stops trying for 30 seconds.
|
||||||
|
- Context relevance check catches retrievals that return topically adjacent but non-answering chunks, reducing fluent-but-ungrounded responses.
|
||||||
|
- `AVAP_PARSER_TIMEOUT=0` allows development without the parser service — no hard dependency at startup.
|
||||||
|
|
||||||
|
### Negative / Trade-offs
|
||||||
|
|
||||||
|
- **`CODE_GENERATION` latency**: +1 parser gRPC call per request (~50–200ms for valid code). +1 LLM generation call + 1 parser call on invalid code (~1–2s additional).
|
||||||
|
- **`RETRIEVAL` latency**: +1 LLM call (relevance check) on every request. At `qwen3:1.7b` local inference, this adds ~300–500ms to every RETRIEVAL request — not negligible.
|
||||||
|
- The parser becomes a **soft production dependency** for CODE_GENERATION. Parser outages degrade validation silently; monitoring must alert on sustained `parser unavailable` log volume.
|
||||||
|
- The context relevance check is a **generative model doing a binary classification task** — the same architectural mismatch noted in ADR-0008 for the classifier. It is the correct interim solution while no discriminative relevance model exists.
|
||||||
|
|
||||||
|
### Open questions
|
||||||
|
|
||||||
|
1. **`RETRIEVAL` latency budget**: The +300–500ms from the relevance LLM call may be unacceptable for the VS Code extension use case where streaming latency is user-visible. A discriminative relevance model (embedding similarity between query vector and context vector, cosine threshold) would be ~1ms and eliminate this cost entirely. Deferred to a future amendment.
|
||||||
|
|
||||||
|
2. **`validation_status` UX**: The proto field is defined but the client behavior is not specified. What should the VS Code extension or AVS Platform display when `validation_status` is `INVALID_UNRESOLVED` or `PARSER_UNAVAILABLE`? Requires a product decision outside this ADR's scope.
|
||||||
|
|
||||||
|
3. **Parser version pinning**: Inherited from ADR-0007 open question 2. Parser upgrades may alter what is considered valid AVAP. A policy for handling parser version changes in the production pipeline has not been defined.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Path
|
||||||
|
|
||||||
|
The context relevance check for `RETRIEVAL` (Decision 2) uses a generative LLM for a discriminative task — the same pattern that ADR-0008 identified as tactical debt for the classifier. The correct steady-state implementation is a cosine similarity threshold between the query embedding vector and the average context embedding vector:
|
||||||
|
|
||||||
|
```
|
||||||
|
relevance_score = cosine(embed(query), mean(embed(chunks)))
|
||||||
|
if relevance_score < RELEVANCE_THRESHOLD:
|
||||||
|
reformulate_with_hint()
|
||||||
|
```
|
||||||
|
|
||||||
|
This runs in microseconds using the `bge-m3` embeddings already computed during retrieval. It replaces the `CONFIDENCE_PROMPT_TEMPLATE` LLM call entirely and eliminates the +300–500ms latency penalty on every RETRIEVAL request.
|
||||||
|
|
||||||
|
**Trigger for this upgrade:** once the RETRIEVAL validation LLM call appears as a measurable latency contribution in Langfuse traces.
|
||||||
|
|
@ -0,0 +1,156 @@
|
||||||
|
# ADR-0010: Classifier Continuous Retraining — Champion/Challenger Pipeline
|
||||||
|
|
||||||
|
**Date:** 2026-04-10
|
||||||
|
**Status:** Accepted
|
||||||
|
**Deciders:** Rafael Ruiz (CTO)
|
||||||
|
**Related ADRs:** ADR-0008 (Adaptive Query Routing — Layer 2 classifier), ADR-0009 (Per-Type Response Validation)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
ADR-0008 Phase 2 deployed a Layer 2 embedding classifier trained on a **seed dataset of 94 hand-crafted examples**. This model works well for the initial distribution of queries but has two structural limitations:
|
||||||
|
|
||||||
|
1. **The seed dataset does not reflect production traffic.** Hand-crafted examples are idealized. Real users ask questions with typos, mixed languages, ambiguous phrasing, and domain-specific vocabulary that is not in the seed.
|
||||||
|
|
||||||
|
2. **The model never improves without manual intervention.** The data flywheel (ADR-0008 Phase 1) accumulates labeled examples automatically via `classify_history_store`, but nothing uses them. Data piles up in `/data/classifier_labels/` and the model stays frozen at its initial accuracy.
|
||||||
|
|
||||||
|
The consequence is that Layer 2 confidence degrades over time relative to the actual query distribution, pushing more requests to Layer 3 (LLM classifier) than necessary.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Implement a **Champion/Challenger automatic retraining pipeline** that triggers every time a new batch of labeled data is exported.
|
||||||
|
|
||||||
|
### Core design
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
S[classify_history_store\naccumulates sessions] -->|100 sessions| EX[classifier_export.py\nexport JSONL]
|
||||||
|
EX -->|RETRAIN_ON_EXPORT=true| RT[retrain_pipeline.py\nbackground thread]
|
||||||
|
|
||||||
|
RT --> LOAD[Load seed + all exports]
|
||||||
|
LOAD --> EMB[Embed with bge-m3]
|
||||||
|
EMB --> SPLIT[Stratified 80/20 split\ntrain / held-out]
|
||||||
|
|
||||||
|
SPLIT --> CV[Cross-validate challenger\nStratifiedKFold]
|
||||||
|
CV -->|CV < 0.90| ABORT[Abort — do not deploy\nlog alert]
|
||||||
|
CV -->|CV ≥ 0.90| TRAIN[Train challenger\non full train split]
|
||||||
|
|
||||||
|
TRAIN --> EVAL_CH[Evaluate challenger\non held-out set]
|
||||||
|
EVAL_CH --> EVAL_CP[Evaluate champion\non same held-out set]
|
||||||
|
|
||||||
|
EVAL_CP --> DECISION{challenger ≥ champion?}
|
||||||
|
DECISION -->|yes| BACKUP[Backup champion]
|
||||||
|
BACKUP --> DEPLOY[Deploy challenger\noverwrite CLASSIFIER_MODEL_PATH]
|
||||||
|
DEPLOY --> ARCHIVE[Archive processed exports]
|
||||||
|
|
||||||
|
DECISION -->|no| KEEP[Keep champion\nlog alert\ndiscard challenger]
|
||||||
|
KEEP --> ARCHIVE
|
||||||
|
```
|
||||||
|
|
||||||
|
### Champion/Challenger semantics
|
||||||
|
|
||||||
|
The model currently in production is the **champion**. The newly trained model is the **challenger**. The challenger is only promoted if its accuracy on the held-out set is **greater than or equal to** the champion's accuracy on the same set.
|
||||||
|
|
||||||
|
This guarantees that the production model never regresses. A retraining run triggered by noisy or unbalanced data will produce a challenger that loses the comparison and is discarded automatically.
|
||||||
|
|
||||||
|
If no champion exists (first deployment), the challenger is promoted unconditionally provided CV accuracy ≥ 0.90.
|
||||||
|
|
||||||
|
### Trigger
|
||||||
|
|
||||||
|
The pipeline is triggered by `classifier_export.py` after every successful export — which happens every `CLASSIFIER_EXPORT_THRESHOLD` sessions (default: **100**).
|
||||||
|
|
||||||
|
The pipeline runs in a **background daemon thread** inside the engine container. It does not block gRPC request handling. A 10-minute hard timeout prevents runaway retraining from consuming resources indefinitely.
|
||||||
|
|
||||||
|
### Model loading after promotion
|
||||||
|
|
||||||
|
The engine does **not** hot-reload the model mid-operation. The new champion is written to `CLASSIFIER_MODEL_PATH` and loaded on the next engine restart. This is intentional:
|
||||||
|
|
||||||
|
- Hot-reload would require locking around every inference call, adding latency.
|
||||||
|
- Mid-session model changes could produce inconsistent classification for the same user within a conversation.
|
||||||
|
- Docker container restarts are cheap and already part of the deployment workflow.
|
||||||
|
|
||||||
|
The engine logs a clear message after promotion:
|
||||||
|
|
||||||
|
```
|
||||||
|
[classifier_export] retraining completed — restart engine to load new model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Safety mechanisms
|
||||||
|
|
||||||
|
| Mechanism | Purpose |
|
||||||
|
|---|---|
|
||||||
|
| CV accuracy gate (≥ 0.90) | Rejects challengers trained on insufficient or unbalanced data before the held-out comparison |
|
||||||
|
| Champion comparison on held-out | Prevents regression — challenger must equal or beat the current production model |
|
||||||
|
| Champion backup before overwrite | `classifier_model_backup_{timestamp}.pkl` — roll back manually if needed |
|
||||||
|
| Export archiving | Processed files moved to `CLASSIFIER_ARCHIVE_DIR` — prevents re-inclusion in future runs |
|
||||||
|
| 10-minute subprocess timeout | Prevents runaway retraining from blocking the engine indefinitely |
|
||||||
|
| `RETRAIN_ON_EXPORT=false` | Disables automatic retraining without code changes — useful in staging or during debugging |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Environment variables
|
||||||
|
|
||||||
|
| Variable | Default | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| `CLASSIFIER_EXPORT_THRESHOLD` | `100` | Sessions before export + retrain trigger |
|
||||||
|
| `RETRAIN_ON_EXPORT` | `true` | Enable/disable automatic retraining |
|
||||||
|
| `RETRAIN_SCRIPT_PATH` | `/app/scripts/pipelines/classifier/retrain_pipeline.py` | Path to retrain script inside container |
|
||||||
|
| `CLASSIFIER_ARCHIVE_DIR` | `/data/classifier_labels/archived` | Where processed exports are moved after retraining |
|
||||||
|
| `CLASSIFIER_SEED_DATASET` | `/app/scripts/pipelines/classifier/seed_classifier_dataset.jsonl` | Seed dataset always included in retraining |
|
||||||
|
| `CLASSIFIER_MIN_CV_ACCURACY` | `0.90` | Minimum CV accuracy for challenger to proceed |
|
||||||
|
| `CLASSIFIER_HELD_OUT_RATIO` | `0.20` | Fraction of merged dataset reserved for champion/challenger comparison |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Role |
|
||||||
|
|---|---|
|
||||||
|
| `scripts/pipelines/classifier/retrain_pipeline.py` | Champion/Challenger training, evaluation, promotion, and archiving |
|
||||||
|
| `Docker/src/utils/classifier_export.py` | Export trigger — launches `retrain_pipeline.py` in background after export |
|
||||||
|
| `scripts/pipelines/classifier/seed_classifier_dataset.jsonl` | Always included in retraining — anchors the model on known-good examples |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Convergence behavior
|
||||||
|
|
||||||
|
Each retraining cycle merges the seed dataset with all accumulated production exports. As production traffic grows, the model progressively reflects real user queries rather than the hand-crafted seed.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
T0["Cycle 0\n94 seed examples\nCV 1.0 on seed"] -->
|
||||||
|
T1["Cycle 1\n94 + ~100 production\nreal query distribution"] -->
|
||||||
|
T2["Cycle 2\n94 + ~200 production\nincreasing coverage"] -->
|
||||||
|
TN["Cycle N\nseed becomes minority\nmodel reflects production traffic"]
|
||||||
|
```
|
||||||
|
|
||||||
|
The seed dataset is never removed — it acts as a regularizer that prevents the model from drifting entirely to production distribution edge cases.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- Layer 2 accuracy improves automatically with usage — no human intervention required after initial deployment.
|
||||||
|
- The champion/challenger gate prevents production regressions from noisy batches.
|
||||||
|
- `RETRAIN_ON_EXPORT=false` provides a complete off switch without code changes.
|
||||||
|
- Export archiving keeps `/data/classifier_labels/` clean — only unprocessed exports accumulate.
|
||||||
|
- The backup mechanism allows manual rollback in the rare case a promoted challenger performs unexpectedly in production.
|
||||||
|
|
||||||
|
### Negative / Trade-offs
|
||||||
|
|
||||||
|
- **Retraining uses Ollama (bge-m3) on the host.** The retrain script runs inside the container but needs `OLLAMA_LOCAL_URL` reachable. If Ollama is down at retraining time, the pipeline fails and logs an error — the champion is unchanged.
|
||||||
|
- **The engine requires a restart to load the new model.** Promotions are invisible to users until restart. In low-traffic periods this is acceptable; high-traffic deployments may need an orchestrated restart strategy.
|
||||||
|
- **Accumulated exports grow unboundedly if retraining fails.** If the pipeline consistently fails (e.g., Ollama unreachable), exports accumulate in `/data/classifier_labels/` without being archived. A monitoring alert on directory size is recommended.
|
||||||
|
|
||||||
|
### Open questions
|
||||||
|
|
||||||
|
1. **Orchestrated restart policy:** Who triggers the engine restart after a promotion? Currently manual. Could be automated via a health-check endpoint that returns a `model_updated` flag, allowing the orchestrator to restart when traffic is low.
|
||||||
|
|
||||||
|
2. **Held-out set stability:** The held-out set is resampled on every retraining cycle from the merged dataset. As the dataset grows, the held-out set changes between cycles, making champion accuracy scores not directly comparable across cycles. A fixed held-out set (frozen after the first N examples) would improve comparability. Deferred.
|
||||||
|
|
||||||
|
3. **Class imbalance over time:** Production traffic may not be balanced across the four query types. If `CODE_GENERATION` is rare in production, its representation in the training set shrinks relative to the seed. The CV gate catches catastrophic imbalance but not gradual drift. A per-class recall threshold could be added to the gate.
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
# Brunix Assistance Engine — Architecture Reference
|
# Brunix Assistance Engine — Architecture Reference
|
||||||
|
|
||||||
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
|
> **Audience:** Engineers contributing to this repository, architects reviewing the system design, and operators responsible for its deployment.
|
||||||
> **Last updated:** 2026-04-09
|
> **Last updated:** 2026-04-10
|
||||||
> **Version:** 1.8.x
|
> **Version:** 1.9.x
|
||||||
> **Architect:** Rafael Ruiz (CTO, 101OBEX Corp)
|
> **Architect:** Rafael Ruiz (CTO, 101OBEX Corp)
|
||||||
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008
|
> **Related ADRs:** ADR-0001 · ADR-0002 · ADR-0003 · ADR-0004 · ADR-0005 · ADR-0006 · ADR-0007 · ADR-0008
|
||||||
> **Related PRDs:** PRD-0001 · PRD-0002 · PRD-0003
|
> **Related PRDs:** PRD-0001 · PRD-0002 · PRD-0003
|
||||||
|
|
@ -116,13 +116,15 @@ Langfuse is the exception — it has a public IP (`45.77.119.180`) and is access
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| gRPC server | `server.py` | Entry point for all AI requests. Manages session store, model selection, and state initialization |
|
| gRPC server | `server.py` | Entry point for all AI requests. Manages session store, model selection, and state initialization |
|
||||||
| HTTP proxy | `openai_proxy.py` | OpenAI + Ollama compatible HTTP layer. Translates REST → gRPC |
|
| HTTP proxy | `openai_proxy.py` | OpenAI + Ollama compatible HTTP layer. Translates REST → gRPC |
|
||||||
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph |
|
| LangGraph orchestrator | `graph.py` | Builds and executes the agentic routing graph. Hosts L1, L2, and L3 classifier layers |
|
||||||
| Prompt definitions | `prompts.py` | All prompt templates in one place: classifier, reformulator, generators, platform |
|
| Prompt definitions | `prompts.py` | All prompt templates in one place: classifier, reformulator, generators, platform |
|
||||||
| Agent state | `state.py` | `AgentState` TypedDict shared across all graph nodes |
|
| Agent state | `state.py` | `AgentState` TypedDict shared across all graph nodes |
|
||||||
| LLM factory | `utils/llm_factory.py` | Provider-agnostic model instantiation (Ollama, OpenAI, Anthropic, Bedrock) |
|
| LLM factory | `utils/llm_factory.py` | Provider-agnostic model instantiation (Ollama, OpenAI, Anthropic, Bedrock) |
|
||||||
| Embedding factory | `utils/emb_factory.py` | Provider-agnostic embedding model instantiation |
|
| Embedding factory | `utils/emb_factory.py` | Provider-agnostic embedding model instantiation |
|
||||||
|
| Classifier export | `utils/classifier_export.py` | Exports `classify_history_store` to labeled JSONL when threshold is reached. Data flywheel for Layer 2 retraining |
|
||||||
| Evaluation pipeline | `evaluate.py` | RAGAS evaluation with Claude as judge |
|
| Evaluation pipeline | `evaluate.py` | RAGAS evaluation with Claude as judge |
|
||||||
| Proto contract | `protos/brunix.proto` | Source of truth for the gRPC API |
|
| Proto contract | `protos/brunix.proto` | Source of truth for the gRPC API |
|
||||||
|
| Classifier training | `scripts/pipelines/classifier/train_classifier.py` | Offline script. Embeds labeled queries with bge-m3, trains LogisticRegression, serializes model |
|
||||||
|
|
||||||
**Model slots:**
|
**Model slots:**
|
||||||
|
|
||||||
|
|
@ -199,21 +201,36 @@ The classifier is the most critical node. It determines the entire execution pat
|
||||||
|
|
||||||
### Classification pipeline
|
### Classification pipeline
|
||||||
|
|
||||||
|
Three-layer pipeline. Each layer is only invoked if the previous one cannot produce a confident answer.
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
flowchart TD
|
flowchart TD
|
||||||
Q[Incoming query] --> FP{Fast-path check\n_is_platform_query}
|
Q([Incoming query]) --> CD{Caller-declared\nquery_type?}
|
||||||
FP -->|known platform prefix| PLATFORM[PLATFORM\nno LLM call]
|
CD -->|proto field 7 set| DECL[Use declared type\nbypass all layers]
|
||||||
FP -->|no match| LLM[LLM Classifier\nqwen3:1.7b]
|
CD -->|empty| L1
|
||||||
|
|
||||||
LLM --> IH[Intent history\nlast 6 entries as context]
|
L1{"Layer 1\nHard rules\nRC-01 · RC-02\nO(1) deterministic"}
|
||||||
IH --> OUT[Output: TYPE + EDITOR/NO_EDITOR]
|
L1 -->|match| R1[Classification result]
|
||||||
|
L1 -->|no match| L2
|
||||||
|
|
||||||
OUT --> R[RETRIEVAL]
|
L2{"Layer 2\nEmbedding classifier\nbge-m3 + LogisticRegression\n~1ms · CPU · no LLM"}
|
||||||
OUT --> C[CODE_GENERATION]
|
L2 -->|confidence ≥ 0.85| R1
|
||||||
OUT --> V[CONVERSATIONAL]
|
L2 -->|confidence < 0.85| L3
|
||||||
OUT --> PL[PLATFORM]
|
|
||||||
|
L3{"Layer 3\nLLM classifier\nqwen3:1.7b\nfallback for ambiguous queries"}
|
||||||
|
L3 --> R1
|
||||||
|
|
||||||
|
DECL --> R1
|
||||||
|
R1 --> RT[RETRIEVAL]
|
||||||
|
R1 --> CG[CODE_GENERATION]
|
||||||
|
R1 --> CV[CONVERSATIONAL]
|
||||||
|
R1 --> PL[PLATFORM]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Caller-declared type (Phase 3):** When the calling system already knows the query type — for example, the AVS Platform generating a `PLATFORM` prompt — it sets `query_type` in the proto request. All three classifier layers are skipped entirely.
|
||||||
|
|
||||||
|
**Layer 2** is loaded from `CLASSIFIER_MODEL_PATH` (`/data/classifier_model.pkl`) at startup. If the file does not exist, the engine starts normally and uses Layer 3 only.
|
||||||
|
|
||||||
### Intent history — solving anchoring bias
|
### Intent history — solving anchoring bias
|
||||||
|
|
||||||
The classifier does not receive raw conversation messages. It receives a compact trace of prior classifications:
|
The classifier does not receive raw conversation messages. It receives a compact trace of prior classifications:
|
||||||
|
|
@ -386,16 +403,23 @@ Implemented in PRD-0003. Queries about account, metrics, usage, or billing bypas
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
flowchart TD
|
flowchart TD
|
||||||
Q[Incoming query] --> FP{_is_platform_query?}
|
Q[Incoming query] --> CD{query_type\ndeclared in proto?}
|
||||||
FP -->|yes — known prefix| SKIP[skip classifier LLM\nroute = PLATFORM]
|
CD -->|yes| DECL[bypass all layers\nroute = declared type]
|
||||||
FP -->|no| CL[LLM classifier]
|
CD -->|no| FP
|
||||||
CL -->|PLATFORM| ROUTE[route to respond_platform]
|
|
||||||
SKIP --> ROUTE
|
FP{Layer 1\n_is_platform_query?\nRC-01 · RC-02}
|
||||||
|
FP -->|yes| SKIP[skip L2 + L3\nroute = PLATFORM]
|
||||||
|
FP -->|no| L2[Layer 2 + Layer 3\nnormal classification]
|
||||||
|
L2 -->|PLATFORM| ROUTE
|
||||||
|
|
||||||
|
DECL -->|PLATFORM| ROUTE
|
||||||
|
SKIP --> ROUTE[route to respond_platform]
|
||||||
|
|
||||||
ROUTE --> PROMPT["PLATFORM_PROMPT\n+ extra_context injection\n+ user_info available"]
|
ROUTE --> PROMPT["PLATFORM_PROMPT\n+ extra_context injection\n+ user_info available"]
|
||||||
PROMPT --> LLM[qwen3:0.6b\nconversational model slot]
|
PROMPT --> LLM[qwen3:0.6b\nconversational model slot]
|
||||||
LLM --> RESP[response]
|
LLM --> RESP[response]
|
||||||
|
|
||||||
|
style DECL fill:#2d6a4f,color:#fff
|
||||||
style SKIP fill:#2d6a4f,color:#fff
|
style SKIP fill:#2d6a4f,color:#fff
|
||||||
style ROUTE fill:#2d6a4f,color:#fff
|
style ROUTE fill:#2d6a4f,color:#fff
|
||||||
```
|
```
|
||||||
|
|
@ -422,7 +446,7 @@ flowchart LR
|
||||||
CHS --> CE
|
CHS --> CE
|
||||||
```
|
```
|
||||||
|
|
||||||
**`classify_history_store` is also a data flywheel.** Every session generates labeled `(topic, type)` pairs automatically. When sufficient sessions accumulate (~500), this store can be exported to train the Layer 2 embedding classifier described in ADR-0008 Future Path — eliminating the need for the LLM classifier on the majority of requests.
|
**`classify_history_store` is the data flywheel for Layer 2 retraining.** Every session generates labeled `(topic, type)` pairs automatically. `utils/classifier_export.py` exports them to JSONL when `CLASSIFIER_EXPORT_THRESHOLD` sessions accumulate (default: 500), and flushes on shutdown. These files feed directly into `scripts/pipelines/classifier/train_classifier.py` for retraining Layer 2 with production data — no manual labeling required.
|
||||||
|
|
||||||
### AgentState fields
|
### AgentState fields
|
||||||
|
|
||||||
|
|
@ -524,21 +548,24 @@ Langfuse is accessed directly via public IP — no kubectl tunnel required. The
|
||||||
|
|
||||||
| Item | Description | ADR |
|
| Item | Description | ADR |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| LLM classifier | Generative model doing discriminative work — non-deterministic, pays full inference cost for a 4-class label | ADR-0008 |
|
| LLM classifier (Layer 3) | Still the dominant path until Layer 2 is retrained with production data and confidence improves | ADR-0008 |
|
||||||
| RC-02 is soft | Platform data signal enforced via prompt `<platform_priority_rule>`, not code — can be overridden by model | ADR-0008 |
|
|
||||||
| `classify_history` not exported | Data flywheel accumulates but has no export mechanism yet | ADR-0008 |
|
|
||||||
| `user_info` unused | `dev_id`, `project_id`, `org_id` are in state but not consumed by any graph node | PRD-0002 |
|
| `user_info` unused | `dev_id`, `project_id`, `org_id` are in state but not consumed by any graph node | PRD-0002 |
|
||||||
| `CONFIDENCE_PROMPT_TEMPLATE` unused | Self-RAG capability is scaffolded in `prompts.py` but not wired into the graph | — |
|
| `CONFIDENCE_PROMPT_TEMPLATE` unused | Self-RAG capability is scaffolded in `prompts.py` but not wired into the graph | — |
|
||||||
|
| Layer 2 seed dataset only | Current model trained on 94 hand-crafted examples. Must be retrained with production exports to reduce L3 fallback rate | ADR-0008 |
|
||||||
|
|
||||||
### Roadmap (ADR-0008 Future Path)
|
### ADR-0008 implementation status
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
flowchart LR
|
flowchart LR
|
||||||
P0["Now\nLLM classifier\n~95% of traffic"] -->
|
P0["Bootstrap\nLLM classifier\n✅ Implemented"] -->
|
||||||
P1["Phase 1\nExport classify_history_store\nlabeled dataset"] -->
|
P1["Phase 1\nData flywheel\nclassifier_export.py\n✅ Complete"] -->
|
||||||
P2["Phase 2\nEmbedding classifier Layer 2\nbge-m3 + logistic regression\n~1ms CPU"] -->
|
P2["Phase 2\nLayer 2 — bge-m3\n+ LogisticRegression\n✅ Complete"] -->
|
||||||
P3["Phase 3\nCaller-declared query_type\nproto field 7"] -->
|
P3["Phase 3\nCaller-declared type\nproto field 7\n✅ Complete"] -->
|
||||||
P4["Phase 4\nLLM classifier = anomaly handler\n<2% of traffic"]
|
P4["Phase 4\nLLM = anomaly handler\n<2% traffic\n⏳ Production outcome"]
|
||||||
```
|
```
|
||||||
|
|
||||||
Target steady-state: the LLM classifier handles fewer than 2% of requests — only genuinely ambiguous queries that neither hard rules nor the trained embedding classifier can resolve with confidence.
|
Phase 4 is not a code deliverable — it is a production state reached when the platform client sends `query_type` for generated prompts and Layer 2 confidence improves with production data. Monitor with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker logs brunix-assistance-engine 2>&1 | grep "classifier/L" | grep -c "L3"
|
||||||
|
```
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
# PRD-0003: Adaptive Query Routing with Platform Intent and Model Specialization
|
# PRD-0003: Adaptive Query Routing with Platform Intent and Model Specialization
|
||||||
|
|
||||||
**Date:** 2026-04-09
|
**Date:** 2026-04-09
|
||||||
**Status:** Implemented
|
**Last updated:** 2026-04-10
|
||||||
|
**Status:** Implemented (Phases 1–3 complete)
|
||||||
**Requested by:** Rafael Ruiz (CTO)
|
**Requested by:** Rafael Ruiz (CTO)
|
||||||
**Purpose:** Route platform queries correctly and reduce inference cost for non-RAG requests
|
**Purpose:** Route platform queries correctly and reduce inference cost for non-RAG requests
|
||||||
**Related ADR:** ADR-0008 (Adaptive Query Routing — Taxonomy, Contract, and Classifier Strategy)
|
**Related ADR:** ADR-0008 (Adaptive Query Routing — Taxonomy, Contract, and Classifier Strategy)
|
||||||
|
|
@ -100,11 +101,15 @@ A user asks a general AVAP question, then `"en menos palabras"`. The engine clas
|
||||||
- Select `active_llm` per request in `AskAgentStream` based on `query_type`
|
- Select `active_llm` per request in `AskAgentStream` based on `query_type`
|
||||||
- Add `classify_history` field to `AgentState` and `ClassifyEntry` type to `state.py`
|
- Add `classify_history` field to `AgentState` and `ClassifyEntry` type to `state.py`
|
||||||
|
|
||||||
|
**Also in scope (ADR-0008 Phases 1–3):**
|
||||||
|
- Export `classify_history_store` to labeled JSONL when threshold is reached (`classifier_export.py`) — data flywheel for Layer 2 retraining
|
||||||
|
- Embedding classifier Layer 2: bge-m3 + LogisticRegression trained on seed dataset, loaded at startup, intercepts queries before LLM with ≥0.85 confidence threshold
|
||||||
|
- Caller-declared `query_type`: proto field 7 in `AgentRequest` — when set, all three classifier layers are bypassed
|
||||||
|
|
||||||
**Out of scope:**
|
**Out of scope:**
|
||||||
- Changes to `EvaluateRAG` — golden dataset does not include platform queries
|
- Changes to `EvaluateRAG` — golden dataset does not include platform queries
|
||||||
- `user_info` consumption in graph logic — available in state, not yet acted upon
|
- `user_info` consumption in graph logic — available in state, not yet acted upon
|
||||||
- Sub-typing `PLATFORM` (e.g. `PLATFORM_METRICS` vs `PLATFORM_BILLING`) — deferred
|
- Sub-typing `PLATFORM` (e.g. `PLATFORM_METRICS` vs `PLATFORM_BILLING`) — deferred
|
||||||
- Proto changes to add caller-declared `query_type` field — deferred to Phase 3 of Future Path (ADR-0008)
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -151,6 +156,10 @@ Classifier prompt additions:
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `OLLAMA_MODEL_NAME` | Main model: RETRIEVAL + CODE_GENERATION | required |
|
| `OLLAMA_MODEL_NAME` | Main model: RETRIEVAL + CODE_GENERATION | required |
|
||||||
| `OLLAMA_MODEL_NAME_CONVERSATIONAL` | Light model: CONVERSATIONAL + PLATFORM | falls back to `OLLAMA_MODEL_NAME` |
|
| `OLLAMA_MODEL_NAME_CONVERSATIONAL` | Light model: CONVERSATIONAL + PLATFORM | falls back to `OLLAMA_MODEL_NAME` |
|
||||||
|
| `CLASSIFIER_EXPORT_THRESHOLD` | Sessions before auto-export of labeled data | `500` |
|
||||||
|
| `CLASSIFIER_EXPORT_DIR` | Directory for exported JSONL label files | `/data/classifier_labels` |
|
||||||
|
| `CLASSIFIER_MODEL_PATH` | Path to serialized Layer 2 model | `/data/classifier_model.pkl` |
|
||||||
|
| `CLASSIFIER_CONFIDENCE_THRESHOLD` | Minimum confidence for Layer 2 to classify (else fall through to LLM) | `0.85` |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -169,10 +178,14 @@ Classifier prompt additions:
|
||||||
|
|
||||||
| Log line | Expected |
|
| Log line | Expected |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `[classify] platform prefix detected -> PLATFORM` | Fast-path firing for known prefixes |
|
| `[classify] L1 → PLATFORM` | Layer 1 fast-path firing for known prefixes |
|
||||||
| `[prepare/classify] ... -> PLATFORM` | LLM classifier correctly routing platform queries |
|
| `[classifier/L2] → PLATFORM (conf=0.97)` | Layer 2 classifying with high confidence |
|
||||||
|
| `[classifier/L2] low confidence (0.72) → LLM fallback` | Layer 2 falling through to Layer 3 |
|
||||||
|
| `[classify] caller-declared → PLATFORM` | Phase 3 — caller bypassing all layers |
|
||||||
|
| `[prepare/classify] L3 ... -> PLATFORM` | Layer 3 LLM classifier correctly routing |
|
||||||
| `[AskAgentStream] query_type=PLATFORM context_len=0` | Zero retrieval for PLATFORM |
|
| `[AskAgentStream] query_type=PLATFORM context_len=0` | Zero retrieval for PLATFORM |
|
||||||
| `[hybrid] RRF -> N final docs` | Must NOT appear for PLATFORM queries |
|
| `[hybrid] RRF -> N final docs` | Must NOT appear for PLATFORM queries |
|
||||||
|
| `[classifier/L2] model loaded from /data/classifier_model.pkl` | Layer 2 loaded at startup |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,208 @@
|
||||||
|
# Layer 2 Classifier — Training Pipeline
|
||||||
|
|
||||||
|
**Author:** Rafael Ruiz (CTO, 101OBEX Corp)
|
||||||
|
**Related:** ADR-0008 — Adaptive Query Routing
|
||||||
|
|
||||||
|
Part of **ADR-0008 Phase 2**: trains the embedding-based classifier that intercepts
|
||||||
|
queries before they reach the LLM (Layer 3), reducing per-request Ollama calls to
|
||||||
|
near zero for well-represented query types.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The classifier embeds each query with **bge-m3** (already running in the stack),
|
||||||
|
trains a **LogisticRegression** on the resulting vectors, and serializes the model
|
||||||
|
with joblib. At engine startup, `graph.py` loads the model and uses it as Layer 2
|
||||||
|
in the classification pipeline.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
Q([Query]) --> L1
|
||||||
|
|
||||||
|
L1["Layer 1 — Hard rules\nRC-01 · RC-02\nO(1), deterministic"]
|
||||||
|
L1 -->|match| R([Classification result])
|
||||||
|
L1 -->|no match| L2
|
||||||
|
|
||||||
|
L2["Layer 2 — Embedding classifier\nbge-m3 + LogisticRegression\n~1ms · CPU only · no LLM"]
|
||||||
|
L2 -->|confidence ≥ 0.85| R
|
||||||
|
L2 -->|confidence < 0.85| L3
|
||||||
|
|
||||||
|
L3["Layer 3 — LLM classifier\nOllama fallback\n~300–800ms"]
|
||||||
|
L3 --> R
|
||||||
|
```
|
||||||
|
|
||||||
|
If the model file does not exist, the engine starts normally and uses L3 only.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|---|---|
|
||||||
|
| `train_classifier.py` | Training script |
|
||||||
|
| `seed_classifier_dataset.jsonl` | Labeled dataset (seed, 100 examples) |
|
||||||
|
| `requirements.txt` | Python dependencies for the training venv |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd scripts/pipelines/classifier
|
||||||
|
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Running the training
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python train_classifier.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Default behaviour (no flags needed if running from this directory):
|
||||||
|
- **data**: `seed_classifier_dataset.jsonl` in the same folder
|
||||||
|
- **ollama**: `http://localhost:11434` (or `$OLLAMA_LOCAL_URL`)
|
||||||
|
- **output**: `/data/classifier_model.pkl` (or `$CLASSIFIER_MODEL_PATH`)
|
||||||
|
- **min CV accuracy**: 0.90
|
||||||
|
|
||||||
|
The script exits with code 1 if CV accuracy is below the threshold — the model is
|
||||||
|
**not saved** in that case.
|
||||||
|
|
||||||
|
### All arguments
|
||||||
|
|
||||||
|
| Argument | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `--data` | `seed_classifier_dataset.jsonl` | Path to labeled JSONL dataset |
|
||||||
|
| `--output` | `/data/classifier_model.pkl` | Output path for serialized model |
|
||||||
|
| `--ollama` | `http://localhost:11434` | Ollama base URL |
|
||||||
|
| `--min-cv-accuracy` | `0.90` | Minimum CV accuracy to save the model |
|
||||||
|
|
||||||
|
### Custom output path (recommended when running from host)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python train_classifier.py \
|
||||||
|
--output ../../../Docker/data/classifier_model.pkl
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deploying the model to Docker
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant Host
|
||||||
|
participant Docker
|
||||||
|
|
||||||
|
Host->>Host: python train_classifier.py --output ./Docker/data/classifier_model.pkl
|
||||||
|
Host->>Docker: docker cp classifier_model.pkl brunix-assistance-engine:/data/
|
||||||
|
Host->>Docker: docker restart brunix-assistance-engine
|
||||||
|
Docker-->>Host: [classifier/L2] model loaded from /data/classifier_model.pkl
|
||||||
|
```
|
||||||
|
|
||||||
|
On startup you will see in the logs:
|
||||||
|
|
||||||
|
```
|
||||||
|
[classifier/L2] model loaded from /data/classifier_model.pkl — {'classes': [...], 'n_train': 100, 'cv_mean': 0.96, ...}
|
||||||
|
```
|
||||||
|
|
||||||
|
If the model is missing:
|
||||||
|
|
||||||
|
```
|
||||||
|
[classifier/L2] model not found at /data/classifier_model.pkl — using LLM fallback only
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dataset format
|
||||||
|
|
||||||
|
Every line is a JSON object with two required fields:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{"query": "What is addVar in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Valid types: `RETRIEVAL`, `CODE_GENERATION`, `CONVERSATIONAL`, `PLATFORM`.
|
||||||
|
|
||||||
|
Lines with missing `query` or `type`, or invalid JSON, are skipped with a warning.
|
||||||
|
|
||||||
|
### Merging production exports with the seed
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart LR
|
||||||
|
S[seed_classifier_dataset.jsonl] --> M
|
||||||
|
E["/data/classifier_labels/\nclassifier_labels_*.jsonl\n(production exports)"] --> M
|
||||||
|
M([merge]) --> D[merged_dataset.jsonl]
|
||||||
|
D --> T[train_classifier.py]
|
||||||
|
T --> P[classifier_model.pkl]
|
||||||
|
P --> E2[engine restart]
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cat seed_classifier_dataset.jsonl \
|
||||||
|
/data/classifier_labels/classifier_labels_*.jsonl \
|
||||||
|
> merged_dataset.jsonl
|
||||||
|
|
||||||
|
python train_classifier.py --data merged_dataset.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
The engine exports labeled data automatically to `CLASSIFIER_EXPORT_DIR`
|
||||||
|
(default `/data/classifier_labels/`) once `CLASSIFIER_EXPORT_THRESHOLD` sessions
|
||||||
|
accumulate. Those files use the same format and can be merged directly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Expected output
|
||||||
|
|
||||||
|
```
|
||||||
|
[1/4] Loading data from seed_classifier_dataset.jsonl
|
||||||
|
100 examples loaded
|
||||||
|
Distribution: {'RETRIEVAL': 25, 'CODE_GENERATION': 25, 'CONVERSATIONAL': 25, 'PLATFORM': 25}
|
||||||
|
|
||||||
|
[2/4] Embedding with bge-m3 via http://localhost:11434
|
||||||
|
Embedding batch 1/4 (32 queries)...
|
||||||
|
Embedding batch 2/4 (32 queries)...
|
||||||
|
Embedding batch 3/4 (32 queries)...
|
||||||
|
Embedding batch 4/4 (4 queries)...
|
||||||
|
Embedding matrix: (100, 1024)
|
||||||
|
|
||||||
|
[3/4] Training LogisticRegression (C=1.0) with 5-fold CV
|
||||||
|
CV accuracy: 0.970 ± 0.021 (folds: [0.95, 1.0, 0.95, 0.95, 1.0])
|
||||||
|
|
||||||
|
Per-class report:
|
||||||
|
precision recall f1-score support
|
||||||
|
CODE_GENERATION 1.00 0.96 0.98 25
|
||||||
|
CONVERSATIONAL 0.96 1.00 0.98 25
|
||||||
|
PLATFORM 1.00 1.00 1.00 25
|
||||||
|
RETRIEVAL 0.96 0.96 0.96 25
|
||||||
|
|
||||||
|
[4/4] Saving model to ../../../Docker/data/classifier_model.pkl
|
||||||
|
Model saved → ../../../Docker/data/classifier_model.pkl
|
||||||
|
|
||||||
|
Done. Classes: ['CODE_GENERATION', 'CONVERSATIONAL', 'PLATFORM', 'RETRIEVAL']
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**`CV accuracy below threshold`** — Add more examples to the underperforming class
|
||||||
|
(check the per-class recall column). 5–10 extra examples per class usually suffice.
|
||||||
|
|
||||||
|
**`langchain-ollama not installed`** — Run `pip install -r requirements.txt` inside
|
||||||
|
the venv.
|
||||||
|
|
||||||
|
**Ollama connection error** — Verify Ollama is running and bge-m3 is pulled:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11434/api/tags | grep bge-m3
|
||||||
|
# if missing:
|
||||||
|
ollama pull bge-m3
|
||||||
|
```
|
||||||
|
|
||||||
|
**Container not picking up the model** — The engine loads the model once at startup.
|
||||||
|
A `docker restart` is required after every `docker cp`.
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
scikit-learn>=1.4.0
|
||||||
|
joblib>=1.3.0
|
||||||
|
numpy>=1.26.0
|
||||||
|
langchain-ollama>=0.2.0
|
||||||
|
|
@ -0,0 +1,271 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
retrain_pipeline.py — Automatic Champion/Challenger retraining (ADR-0010).
|
||||||
|
|
||||||
|
Triggered automatically by classifier_export.py when RETRAIN_THRESHOLD new
|
||||||
|
sessions accumulate. Can also be run manually.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. Merge all JSONL exports in EXPORT_DIR with the seed dataset
|
||||||
|
2. Split into train (80%) and held-out (20%) — stratified
|
||||||
|
3. Train challenger model
|
||||||
|
4. Load champion model (current production model at CLASSIFIER_MODEL_PATH)
|
||||||
|
5. Evaluate both on held-out set
|
||||||
|
6. If challenger accuracy >= champion accuracy: deploy challenger → champion
|
||||||
|
7. If challenger < champion: discard challenger, keep champion, log alert
|
||||||
|
8. Archive processed export files to EXPORT_ARCHIVE_DIR
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python retrain_pipeline.py # auto mode (uses env vars)
|
||||||
|
python retrain_pipeline.py --force # skip champion comparison, always deploy
|
||||||
|
python retrain_pipeline.py --dry-run # evaluate only, do not deploy
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from collections import Counter
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import joblib
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, StratifiedKFold
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s [retrain] %(message)s")
logger = logging.getLogger("retrain_pipeline")

# ── Configuration ──────────────────────────────────────────────────────────────
# Every path/threshold is overridable via environment variables so the same
# script works inside the container (defaults under /data) and on a host.

# Directory where classifier_export.py drops production-labeled JSONL files.
EXPORT_DIR = Path(os.getenv("CLASSIFIER_EXPORT_DIR", "/data/classifier_labels"))
# Processed export files are moved here after a successful (non-dry-run) run.
EXPORT_ARCHIVE = Path(os.getenv("CLASSIFIER_ARCHIVE_DIR", "/data/classifier_labels/archived"))
# Current production (champion) model — overwritten in place on promotion.
CHAMPION_PATH = Path(os.getenv("CLASSIFIER_MODEL_PATH", "/data/classifier_model.pkl"))
# NOTE(review): CHALLENGER_PATH is defined but never referenced in this script —
# the challenger is kept in memory and dumped straight to CHAMPION_PATH on
# promotion. Confirm whether this constant is still needed.
CHALLENGER_PATH = CHAMPION_PATH.parent / "classifier_model_challenger.pkl"
# Hand-labeled seed dataset, merged into every retraining run.
SEED_DATASET = Path(os.getenv("CLASSIFIER_SEED_DATASET",
                              str(Path(__file__).parent / "seed_classifier_dataset.jsonl")))
# Ollama endpoint serving the bge-m3 embedding model.
OLLAMA_URL = os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434")
# Challenger must reach this cross-validation accuracy before it is even
# compared against the champion.
MIN_CV_ACCURACY = float(os.getenv("CLASSIFIER_MIN_CV_ACCURACY", "0.90"))
# Fraction of the merged dataset held out for the champion/challenger comparison.
HELD_OUT_RATIO = float(os.getenv("CLASSIFIER_HELD_OUT_RATIO", "0.20"))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Data loading ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_jsonl(path: Path) -> tuple[list[str], list[str]]:
    """Load a labeled JSONL dataset of ``{"query": ..., "type": ...}`` records.

    Blank lines are ignored. Lines with invalid JSON, non-object records, or
    missing/empty/non-string ``query``/``type`` fields are skipped with a
    warning — a few bad lines must never abort a retraining run.

    Args:
        path: Path to the JSONL file.

    Returns:
        ``(queries, labels)`` — two parallel lists of equal length, with
        surrounding whitespace stripped from every value.
    """
    queries, labels = [], []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                logger.warning(f"{path.name}:{lineno}: invalid JSON — line skipped")
                continue
            if not isinstance(rec, dict):
                logger.warning(f"{path.name}:{lineno}: not a JSON object — line skipped")
                continue
            q = rec.get("query")
            t = rec.get("type")
            # Guard isinstance before .strip(): a JSON null / number here would
            # otherwise raise AttributeError and kill the whole pipeline.
            if isinstance(q, str) and isinstance(t, str) and q.strip() and t.strip():
                queries.append(q.strip())
                labels.append(t.strip())
            else:
                logger.warning(f"{path.name}:{lineno}: missing query/type — line skipped")
    return queries, labels
|
||||||
|
|
||||||
|
|
||||||
|
def load_all_data(export_dir: Path, seed_path: Path) -> tuple[list[str], list[str], list[Path]]:
    """Load seed dataset + all unarchived export files. Returns (queries, labels, export_files)."""
    all_queries: list[str] = []
    all_labels: list[str] = []

    # Seed dataset first, if it exists.
    if not seed_path.exists():
        logger.warning(f"Seed dataset not found at {seed_path}")
    else:
        seed_queries, seed_labels = load_jsonl(seed_path)
        all_queries += seed_queries
        all_labels += seed_labels
        logger.info(f"Seed dataset: {len(seed_queries)} examples from {seed_path}")

    # Then every unarchived production export, in deterministic name order.
    export_files = sorted(export_dir.glob("classifier_labels_*.jsonl"))
    for export_file in export_files:
        file_queries, file_labels = load_jsonl(export_file)
        all_queries += file_queries
        all_labels += file_labels
        logger.info(f"Export file: {len(file_queries)} examples from {export_file.name}")

    logger.info(f"Total dataset: {len(all_queries)} examples — {dict(Counter(all_labels))}")
    return all_queries, all_labels, export_files
|
||||||
|
|
||||||
|
|
||||||
|
# ── Embedding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def embed_queries(queries: list[str], labels: list[str], base_url: str) -> tuple[np.ndarray, list[str]]:
    """Embed with bge-m3, one at a time to handle NaN vectors gracefully.

    Queries whose embedding fails or contains NaN are dropped together with
    their label, so the returned matrix and label list stay row-aligned.

    Args:
        queries: Raw query strings.
        labels: Class labels, parallel to ``queries``.
        base_url: Ollama base URL serving the bge-m3 model.

    Returns:
        ``(X, kept_labels)`` — embedding matrix of shape (n_kept, dim) and the
        labels of the rows that survived.

    Raises:
        ImportError: if langchain-ollama is not installed.
        RuntimeError: if every query failed to embed (nothing to train on).
    """
    try:
        from langchain_ollama import OllamaEmbeddings
    except ImportError:
        logger.error("langchain-ollama not installed")
        raise

    emb = OllamaEmbeddings(model="bge-m3", base_url=base_url)
    vectors, kept_labels = [], []
    skipped = 0
    total = len(queries)  # hoisted: invariant across the loop

    for i, (query, label) in enumerate(zip(queries, labels)):
        try:
            vec = emb.embed_query(query)
            # NaN != NaN — detects NaN components without importing math/numpy.
            if any(v != v for v in vec):
                skipped += 1
                continue
            vectors.append(vec)
            kept_labels.append(label)
        except Exception as e:
            logger.warning(f"Embedding failed for query {i}: {e}")
            skipped += 1
        if (i + 1) % 20 == 0:
            logger.info(f"  Embedded {i+1}/{total}...")

    if skipped:
        logger.warning(f"Skipped {skipped} queries due to NaN or embedding errors")

    # Fail loudly here rather than returning an empty (0,) matrix that would
    # crash later inside sklearn with a confusing shape error.
    if not vectors:
        raise RuntimeError("All embeddings failed — check the Ollama URL and that bge-m3 is pulled")

    return np.array(vectors), kept_labels
|
||||||
|
|
||||||
|
|
||||||
|
# ── Training ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def train_model(X: np.ndarray, y: list[str]) -> LogisticRegression:
    """Fit the challenger LogisticRegression on the full training split."""
    # fit() returns the estimator itself, so train-and-return is one expression.
    return LogisticRegression(max_iter=1000, C=1.0, random_state=42).fit(X, y)
|
||||||
|
|
||||||
|
|
||||||
|
def cross_validate(X: np.ndarray, y: list[str]) -> float:
    """Return mean stratified k-fold CV accuracy for the challenger config.

    The fold count adapts to the rarest class (capped at 5). Returns 0.0 when
    any class has fewer than 2 examples, since stratified CV is impossible.
    """
    rarest_class_count = min(Counter(y).values())
    n_splits = min(5, rarest_class_count)
    if n_splits < 2:
        logger.warning("Too few examples per class for cross-validation")
        return 0.0

    estimator = LogisticRegression(max_iter=1000, C=1.0, random_state=42)
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(estimator, X, y, cv=folds, scoring="accuracy")
    logger.info(f"CV accuracy: {scores.mean():.3f} ± {scores.std():.3f} (folds: {scores.round(3).tolist()})")
    return float(scores.mean())
|
||||||
|
|
||||||
|
|
||||||
|
# ── Champion evaluation ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def evaluate_champion(champion_path: Path, X_held: np.ndarray, y_held: list[str]) -> float:
    """Score the deployed champion model on the held-out split.

    Returns 0.0 when no champion exists yet, which makes any challenger win
    the promotion comparison unconditionally.
    """
    if not champion_path.exists():
        logger.info("No champion model found — challenger will be deployed unconditionally")
        return 0.0

    bundle = joblib.load(champion_path)
    predictions = bundle["clf"].predict(X_held)
    accuracy = accuracy_score(y_held, predictions)
    logger.info(f"Champion accuracy on held-out set: {accuracy:.3f} (trained on {bundle.get('n_train', '?')} examples)")
    return accuracy
|
||||||
|
|
||||||
|
|
||||||
|
# ── Archive ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def archive_exports(export_files: list[Path], archive_dir: Path) -> None:
    """Move processed export files into the archive directory (created if needed)."""
    archive_dir.mkdir(parents=True, exist_ok=True)
    for export_file in export_files:
        shutil.move(str(export_file), str(archive_dir / export_file.name))
        logger.info(f"Archived {export_file.name} → {archive_dir}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
    """Run one champion/challenger retraining cycle (ADR-0010).

    Steps: load seed + production exports, embed with bge-m3, hold out a
    stratified split, cross-validate and train a challenger, compare it to
    the deployed champion on the same held-out set, promote on a win (ties
    go to the challenger, per the `>=` comparison), then archive the
    processed export files. Aborts without touching the champion when there
    is too little data or the challenger's CV accuracy is below --min-cv.
    """
    parser = argparse.ArgumentParser(description="Champion/Challenger classifier retraining (ADR-0010)")
    parser.add_argument("--ollama", default=OLLAMA_URL)
    parser.add_argument("--force", action="store_true", help="Deploy challenger without comparing to champion")
    parser.add_argument("--dry-run", action="store_true", help="Evaluate only — do not deploy or archive")
    parser.add_argument("--min-cv", type=float, default=MIN_CV_ACCURACY)
    args = parser.parse_args()

    # Timestamp reused in both log lines and the champion backup filename.
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    logger.info(f"=== Retraining pipeline started {ts} ===")

    # ── 1. Load data ───────────────────────────────────────────────────────────
    queries, labels, export_files = load_all_data(EXPORT_DIR, SEED_DATASET)

    # NOTE(review): this early return exits with code 0, so a caller cannot
    # distinguish an aborted run from a successful one — confirm intended.
    if len(queries) < 20:
        logger.error("Too few examples (< 20). Aborting.")
        return

    # ── 2. Embed ───────────────────────────────────────────────────────────────
    logger.info(f"Embedding {len(queries)} queries via bge-m3 at {args.ollama}...")
    # embed_queries may drop rows (failed/NaN embeddings); `labels` is rebound
    # to the surviving subset so it stays row-aligned with X.
    X, labels = embed_queries(queries, labels, args.ollama)
    logger.info(f"Embedding matrix: {X.shape}")

    # ── 3. Stratified train / held-out split ───────────────────────────────────
    # Fixed random_state keeps the held-out set reproducible across runs.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=HELD_OUT_RATIO, random_state=42)
    train_idx, held_idx = next(sss.split(X, labels))
    X_train, X_held = X[train_idx], X[held_idx]
    y_train = [labels[i] for i in train_idx]
    y_held = [labels[i] for i in held_idx]
    logger.info(f"Train: {len(y_train)} | Held-out: {len(y_held)}")

    # ── 4. Cross-validate challenger ───────────────────────────────────────────
    logger.info("Cross-validating challenger...")
    cv_acc = cross_validate(X_train, y_train)

    # Quality gate: a challenger below --min-cv is never even compared.
    if cv_acc < args.min_cv:
        logger.error(f"Challenger CV accuracy {cv_acc:.3f} below minimum {args.min_cv:.2f}. Aborting.")
        return

    # ── 5. Train challenger on full train split ────────────────────────────────
    challenger_clf = train_model(X_train, y_train)
    challenger_acc = accuracy_score(y_held, challenger_clf.predict(X_held))
    logger.info(f"Challenger accuracy on held-out: {challenger_acc:.3f}")

    # ── 6. Evaluate champion on same held-out set ──────────────────────────────
    # Returns 0.0 when no champion exists, so the challenger wins by default.
    champion_acc = evaluate_champion(CHAMPION_PATH, X_held, y_held)

    # ── 7. Promotion decision ──────────────────────────────────────────────────
    promote = args.force or (challenger_acc >= champion_acc)

    if promote:
        reason = "forced" if args.force else f"challenger ({challenger_acc:.3f}) >= champion ({champion_acc:.3f})"
        logger.info(f"PROMOTING challenger — {reason}")

        if not args.dry_run:
            # Back up champion before overwriting
            if CHAMPION_PATH.exists():
                backup = CHAMPION_PATH.parent / f"classifier_model_backup_{ts}.pkl"
                shutil.copy(str(CHAMPION_PATH), str(backup))
                logger.info(f"Champion backed up → {backup.name}")

            # The bundle layout must match what graph.py/evaluate_champion
            # expect: "clf" plus metadata keys logged at engine startup.
            joblib.dump(
                {
                    "clf": challenger_clf,
                    "classes": challenger_clf.classes_.tolist(),
                    "n_train": len(y_train),
                    "cv_mean": round(cv_acc, 4),
                    "held_acc": round(challenger_acc, 4),
                    "champion_acc": round(champion_acc, 4),
                    "retrained_at": ts,
                },
                CHAMPION_PATH,
            )
            logger.info(f"New champion saved → {CHAMPION_PATH}")
            logger.info("Restart brunix-assistance-engine to load the new model.")
        else:
            logger.info("[dry-run] Promotion skipped")
    else:
        logger.warning(
            f"KEEPING champion — challenger ({challenger_acc:.3f}) < champion ({champion_acc:.3f}). "
            "Consider adding more labeled data."
        )

    # ── 8. Archive processed exports ──────────────────────────────────────────
    # NOTE(review): exports are archived even when the challenger lost, which
    # removes them from the next run's merge (load_all_data only globs
    # unarchived files) — confirm this data loss for losing runs is intended.
    if not args.dry_run and export_files:
        archive_exports(export_files, EXPORT_ARCHIVE)

    logger.info(f"=== Retraining pipeline finished {datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')} ===")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,95 @@
|
||||||
|
{"query": "What is addVar in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How do I declare a variable in AVAP Cloud?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What does the pipeline node do?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "Explain the difference between addVar and addObject", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is a virtual API in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How does the routing engine work in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is the purpose of the command node?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How do I handle errors in an AVAP flow?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is the difference between a dataset and a variable?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How does AVAP handle authentication in API calls?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What are the available data types in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "Explain how the loop node works", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is addHeader used for?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How do I connect to an external REST API from AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is the AVAP Cloud execution model?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How does conditional branching work in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is the difference between addRow and addObject?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How do I define a reusable function in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What does the return node do in a pipeline?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How does AVAP handle JSON transformations?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is the catalog in AVAP Cloud?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How do I paginate results in an AVAP API?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What logging options are available in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "How are environment variables managed in AVAP Cloud?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "What is a connector in AVAP?", "type": "RETRIEVAL"}
|
||||||
|
{"query": "Write an API endpoint in AVAP that returns a list of users", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate AVAP code to call an external REST API and return the result", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create a pipeline that reads from a dataset and filters by field", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write a loop in AVAP that iterates over a JSON array", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate code to authenticate with OAuth2 and call a protected endpoint", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write an AVAP function that transforms a JSON response", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create an API endpoint that accepts POST and validates input", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate AVAP code to merge two datasets", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write a pipeline that handles errors and returns a custom message", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create a reusable AVAP function that formats dates", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write code to call a database query node and return paginated results", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate an AVAP endpoint that proxies another API", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write a pipeline that conditionally routes requests based on a field value", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create AVAP code to parse and validate a JWT token", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate code for a webhook handler in AVAP", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write an AVAP pipeline that aggregates data from multiple API calls", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create a function that maps one JSON schema to another", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Write code to implement rate limiting in an AVAP API", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Generate an AVAP endpoint with custom response headers", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Create a pipeline that retries a failed API call up to 3 times", "type": "CODE_GENERATION"}
|
||||||
|
{"query": "Can you explain that again more simply?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "What did you mean by that last part?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you summarize what you just said?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "I didn't understand the second point", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you give me a shorter version?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "What was the example you mentioned before?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you be more specific about that?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Repeat the last answer but focus on the part about variables", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "That's not what I asked, can you try again?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "OK but what does that mean in practice?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you expand on that?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "I still don't get it, try a different explanation", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "What was the difference you mentioned earlier?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you give me an analogy?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "That makes sense, and what about the second case?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Are you sure about that?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Say it in one sentence", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Put that in simpler terms", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "What was the first thing you said about pipelines?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "OK, now compare both options you described", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you elaborate on the error handling part?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Go back to what you said about connectors", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Give me a bullet point summary of your previous answer", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "I need the same explanation but focused on performance", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "Can you rephrase that as a step-by-step guide?", "type": "CONVERSATIONAL"}
|
||||||
|
{"query": "You have a project usage percentage of 20%. Provide an insight.", "type": "PLATFORM"}
|
||||||
|
{"query": "Your account has consumed 3000 API calls this month", "type": "PLATFORM"}
|
||||||
|
{"query": "You are a direct and concise assistant. Your quota is at 80%.", "type": "PLATFORM"}
|
||||||
|
{"query": "Your subscription plan allows 10000 requests per day", "type": "PLATFORM"}
|
||||||
|
{"query": "Your account usage is at 95% of the monthly limit", "type": "PLATFORM"}
|
||||||
|
{"query": "You have used 45% of your available API quota", "type": "PLATFORM"}
|
||||||
|
{"query": "Your current plan is Basic with 5000 API calls remaining", "type": "PLATFORM"}
|
||||||
|
{"query": "Your project has exceeded 90% of its allocated resources", "type": "PLATFORM"}
|
||||||
|
{"query": "How many API calls do I have left this month?", "type": "PLATFORM"}
|
||||||
|
{"query": "What is my current plan and its limits?", "type": "PLATFORM"}
|
||||||
|
{"query": "Show me my account usage statistics", "type": "PLATFORM"}
|
||||||
|
{"query": "How much of my quota have I consumed?", "type": "PLATFORM"}
|
||||||
|
{"query": "When does my subscription renew?", "type": "PLATFORM"}
|
||||||
|
{"query": "What are my billing details?", "type": "PLATFORM"}
|
||||||
|
{"query": "How many projects can I create on my current plan?", "type": "PLATFORM"}
|
||||||
|
{"query": "Am I close to my API limit?", "type": "PLATFORM"}
|
||||||
|
{"query": "What happens if I exceed my quota?", "type": "PLATFORM"}
|
||||||
|
{"query": "Can I upgrade my current subscription?", "type": "PLATFORM"}
|
||||||
|
{"query": "Your free trial expires in 3 days", "type": "PLATFORM"}
|
||||||
|
{"query": "Your account is currently suspended due to overuse", "type": "PLATFORM"}
|
||||||
|
{"query": "You have 150 API calls remaining in your current billing cycle", "type": "PLATFORM"}
|
||||||
|
{"query": "Your consumption this week is 2.5x higher than last week", "type": "PLATFORM"}
|
||||||
|
{"query": "What is my current monthly spend on AVAP Cloud?", "type": "PLATFORM"}
|
||||||
|
{"query": "How do I add more API capacity to my plan?", "type": "PLATFORM"}
|
||||||
|
{"query": "Your project usage percentage is critically high at 98%", "type": "PLATFORM"}
|
||||||
|
|
@ -0,0 +1,204 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Phase 2 — Train Layer 2 embedding classifier (ADR-0008).
|
||||||
|
|
||||||
|
Reads labeled (query, type) pairs from a JSONL file, embeds them with bge-m3
|
||||||
|
via Ollama, trains a LogisticRegression classifier, and serializes the model
|
||||||
|
with joblib.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python train_classifier.py
|
||||||
|
python train_classifier.py --data path/to/dataset.jsonl --output /data/classifier_model.pkl
|
||||||
|
|
||||||
|
The output file is loaded at engine startup by graph.py (_load_layer2_model).
|
||||||
|
Any JSONL file produced by classifier_export.py is compatible as additional
|
||||||
|
training data — merge with the seed dataset before retraining.
|
||||||
|
|
||||||
|
Requirements (add to requirements.txt if not present):
|
||||||
|
scikit-learn
|
||||||
|
joblib
|
||||||
|
numpy
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import joblib
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.metrics import classification_report
|
||||||
|
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: str) -> tuple[list[str], list[str]]:
    """Load labeled (query, type) pairs from a JSONL dataset file.

    Each line must be a JSON object with non-empty string fields "query"
    and "type". Blank lines, malformed JSON, non-object records, and
    records whose fields are missing, empty, or not strings are skipped
    with a warning instead of aborting the run.

    Args:
        path: Filesystem path to the JSONL file.

    Returns:
        (queries, labels) — two aligned lists of equal length.
    """
    queries, labels = [], []
    skipped = 0
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f" [WARN] line {i} skipped — JSON error: {e}", file=sys.stderr)
                skipped += 1
                continue
            # Guard against non-dict records and non-string field values
            # (e.g. a numeric "type") — the original called .strip() on
            # whatever was stored and crashed with AttributeError.
            if not isinstance(rec, dict):
                skipped += 1
                continue
            q = rec.get("query")
            t = rec.get("type")
            if not isinstance(q, str) or not isinstance(t, str):
                skipped += 1
                continue
            q, t = q.strip(), t.strip()
            if not q or not t:
                skipped += 1
                continue
            queries.append(q)
            labels.append(t)
    if skipped:
        # Summary goes to stderr, consistent with the per-line warnings above.
        print(f" [WARN] {skipped} records skipped (missing query/type or invalid JSON)", file=sys.stderr)
    return queries, labels
|
||||||
|
|
||||||
|
|
||||||
|
def embed_queries(queries: list[str], labels: list[str], base_url: str) -> tuple[np.ndarray, list[str]]:
    """Embed queries with bge-m3 via Ollama, one at a time.

    bge-m3 occasionally produces NaN vectors for certain inputs (known Ollama bug).
    Embedding one by one lets us detect and skip those queries instead of failing
    the entire batch.

    Args:
        queries: Raw query strings to embed.
        labels: Class labels aligned 1:1 with ``queries``.
        base_url: Base URL of the Ollama server.

    Returns:
        (vectors, kept_labels) — labels aligned to the returned vectors.
        Queries that failed to embed or produced NaN vectors are excluded
        from both.
    """
    try:
        from langchain_ollama import OllamaEmbeddings
    except ImportError:
        print("[ERROR] langchain-ollama not installed. Run: pip install langchain-ollama", file=sys.stderr)
        sys.exit(1)

    emb = OllamaEmbeddings(model="bge-m3", base_url=base_url)
    vectors: list[list[float]] = []
    kept_labels: list[str] = []
    skipped = 0
    total = len(queries)

    for i, (query, label) in enumerate(zip(queries, labels)):
        try:
            vec = emb.embed_query(query)
        except Exception as e:
            print(f" [WARN] query {i+1} embedding failed ({e}), skipped: '{query[:60]}'", file=sys.stderr)
            skipped += 1
        else:
            if any(v != v for v in vec):  # NaN check (NaN != NaN)
                print(f" [WARN] query {i+1} produced NaN vector, skipped: '{query[:60]}'", file=sys.stderr)
                skipped += 1
            else:
                vectors.append(vec)
                kept_labels.append(label)

        # Progress is reported unconditionally. The original placed this after
        # the `continue` statements, so every skipped query also skipped the
        # progress line and the final "Embedded N/N" could never print when the
        # last query was dropped.
        if (i + 1) % 10 == 0 or (i + 1) == total:
            print(f" Embedded {i+1}/{total}...", end="\r")

    print()
    if skipped:
        print(f" [WARN] {skipped} queries skipped due to NaN or embedding errors", file=sys.stderr)

    return np.array(vectors), kept_labels
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: load the dataset, embed, cross-validate, and save.

    Pipeline:
      1. Load labeled (query, type) pairs from JSONL.
      2. Embed every query with bge-m3 via Ollama (NaN vectors dropped).
      3. Train LogisticRegression with stratified k-fold cross-validation;
         abort (exit 1) if mean CV accuracy is below --min-cv-accuracy.
      4. Fit the final model on the full dataset and serialize it with
         joblib to --output (loaded at engine startup by graph.py).
    """
    parser = argparse.ArgumentParser(description="Train Layer 2 classifier for ADR-0008")
    parser.add_argument(
        "--data",
        default=str(Path(__file__).parent / "seed_classifier_dataset.jsonl"),
        help="Path to labeled JSONL dataset",
    )
    parser.add_argument(
        "--output",
        default=os.getenv("CLASSIFIER_MODEL_PATH", "/data/classifier_model.pkl"),
        help="Output path for serialized model",
    )
    parser.add_argument(
        "--ollama",
        default=os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434"),
        help="Ollama base URL",
    )
    parser.add_argument(
        "--min-cv-accuracy",
        type=float,
        default=0.90,
        help="Minimum cross-validation accuracy to proceed with saving (default: 0.90)",
    )
    args = parser.parse_args()

    # ── Load data ──────────────────────────────────────────────────────────────
    print(f"\n[1/4] Loading data from {args.data}")
    queries, labels = load_data(args.data)
    print(f" {len(queries)} examples loaded")

    dist = Counter(labels)
    print(f" Distribution: {dict(dist)}")

    if len(queries) < 20:
        print("[ERROR] Too few examples (< 20). Add more to the dataset.", file=sys.stderr)
        sys.exit(1)

    if min(dist.values()) < 2:
        print("[ERROR] At least one class has fewer than 2 examples. Add more data.", file=sys.stderr)
        sys.exit(1)

    # ── Embed ──────────────────────────────────────────────────────────────────
    print(f"\n[2/4] Embedding with bge-m3 via {args.ollama}")
    vectors, labels = embed_queries(queries, labels, args.ollama)
    print(f" Embedding matrix: {vectors.shape} ({len(labels)} examples kept)")

    # Recompute the per-class counts from the *kept* labels: embedding may have
    # dropped queries (NaN vectors / embed errors), and StratifiedKFold requires
    # every class to have at least n_splits members. The original computed
    # n_splits before embedding, which could crash the CV step after skips.
    dist = Counter(labels)
    n_splits = min(5, min(dist.values()))
    if n_splits < 2:
        print("[ERROR] After embedding, at least one class has fewer than 2 examples. Add more data.", file=sys.stderr)
        sys.exit(1)

    # ── Train + cross-validate ─────────────────────────────────────────────────
    print(f"\n[3/4] Training LogisticRegression (C=1.0) with {n_splits}-fold CV")
    clf = LogisticRegression(max_iter=1000, C=1.0, random_state=42)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    scores = cross_val_score(clf, vectors, labels, cv=cv, scoring="accuracy")
    cv_mean = scores.mean()
    cv_std = scores.std()
    print(f" CV accuracy: {cv_mean:.3f} ± {cv_std:.3f} (folds: {scores.round(3).tolist()})")

    # Per-class report via cross-validated predictions
    y_pred = cross_val_predict(clf, vectors, labels, cv=cv)
    print("\n Per-class report:")
    report = classification_report(labels, y_pred, zero_division=0)
    for line in report.splitlines():
        print(f" {line}")

    if cv_mean < args.min_cv_accuracy:
        print(
            f"\n[FAIL] CV accuracy {cv_mean:.3f} is below threshold {args.min_cv_accuracy:.2f}. "
            "Add more examples to the dataset before deploying.",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"\n CV accuracy {cv_mean:.3f} ≥ {args.min_cv_accuracy:.2f} — proceeding to save.")

    # ── Fit final model on full dataset ────────────────────────────────────────
    clf.fit(vectors, labels)

    # ── Save ───────────────────────────────────────────────────────────────────
    print(f"\n[4/4] Saving model to {args.output}")
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(
        {
            "clf": clf,
            "classes": clf.classes_.tolist(),
            # n_train counts the examples actually used for training; queries
            # dropped during embedding are excluded (the original reported the
            # pre-embedding count, over-stating the training-set size).
            "n_train": len(labels),
            "cv_mean": round(cv_mean, 4),
            "cv_std": round(cv_std, 4),
        },
        out_path,
    )
    print(f" Model saved → {out_path}")
    print(f"\nDone. Classes: {clf.classes_.tolist()}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the full load → embed → train → save pipeline.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue