[FEATURE] Adaptive query routing: PLATFORM type, model specialization, intent history classifier
- Add PLATFORM query type that bypasses RAG and uses a lighter model - Introduce OLLAMA_MODEL_NAME_CONVERSATIONAL env var to route CONVERSATIONAL and PLATFORM queries to a separate (smaller) Ollama model - Replace raw message history in classifier with compact intent history (classify_history) to eliminate anchoring bias in small models - Add <history_rule> and <platform_priority_rule> to classifier prompt so the model evaluates each message independently while still resolving ambiguous references from prior turns - Add fast-path detection for known platform-injected prompt prefixes - Add PLATFORM_PROMPT for account/metrics/usage responses - Persist classify_history in classify_history_store alongside session_store - Document decisions in ADR-0008 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c886cc9811
commit
d7baccd8f0
|
|
@ -11,14 +11,16 @@ from prompts import (
|
|||
CODE_GENERATION_PROMPT,
|
||||
CONVERSATIONAL_PROMPT,
|
||||
GENERATE_PROMPT,
|
||||
PLATFORM_PROMPT,
|
||||
REFORMULATE_PROMPT,
|
||||
)
|
||||
|
||||
from state import AgentState
|
||||
from state import AgentState, ClassifyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
session_store: dict[str, list] = defaultdict(list)
|
||||
classify_history_store: dict[str, list] = defaultdict(list)
|
||||
|
||||
|
||||
def format_context(docs):
|
||||
|
|
@ -212,6 +214,19 @@ def _build_generation_prompt(template_prompt: SystemMessage, context: str,
|
|||
return SystemMessage(content=base)
|
||||
|
||||
|
||||
def _format_intent_history(classify_history: list) -> str:
|
||||
if not classify_history:
|
||||
return "(no prior turns)"
|
||||
lines = [f"[{e['type']}] \"{e['topic']}\"" for e in classify_history[-6:]]
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _is_platform_query(question: str) -> bool:
|
||||
"""Fast-path: skip LLM classifier for known platform prompt prefixes."""
|
||||
q = question.strip().lower()
|
||||
return "you are a direct and concise assistant" in q
|
||||
|
||||
|
||||
def _parse_query_type(raw: str) -> tuple[str, bool]:
|
||||
parts = raw.strip().upper().split()
|
||||
query_type = "RETRIEVAL"
|
||||
|
|
@ -222,11 +237,15 @@ def _parse_query_type(raw: str) -> tuple[str, bool]:
|
|||
query_type = "CODE_GENERATION"
|
||||
elif first.startswith("CONVERSATIONAL"):
|
||||
query_type = "CONVERSATIONAL"
|
||||
elif first.startswith("PLATFORM"):
|
||||
query_type = "PLATFORM"
|
||||
if len(parts) > 1 and parts[1] == "EDITOR":
|
||||
use_editor = True
|
||||
return query_type, use_editor
|
||||
|
||||
def build_graph(llm, embeddings, es_client, index_name):
|
||||
def build_graph(llm, embeddings, es_client, index_name, llm_conversational=None):
|
||||
|
||||
_llm_conv = llm_conversational or llm
|
||||
|
||||
def _persist(state: AgentState, response: BaseMessage):
|
||||
session_id = state.get("session_id", "")
|
||||
|
|
@ -239,17 +258,34 @@ def build_graph(llm, embeddings, es_client, index_name):
|
|||
question = getattr(user_msg, "content",
|
||||
user_msg.get("content", "")
|
||||
if isinstance(user_msg, dict) else "")
|
||||
history_msgs = messages[:-1]
|
||||
selected_text = state.get("selected_text", "")
|
||||
classify_history = state.get("classify_history") or []
|
||||
|
||||
history_text = format_history_for_classify(history_msgs) if history_msgs else "(no history)"
|
||||
prompt_content = _build_classify_prompt(question, history_text, selected_text)
|
||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
||||
|
||||
if _is_platform_query(question):
|
||||
logger.info(f"[classify] platform prefix detected -> PLATFORM")
|
||||
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
||||
return {
|
||||
"query_type": "PLATFORM",
|
||||
"use_editor_context": False,
|
||||
"classify_history": classify_history + [entry],
|
||||
}
|
||||
|
||||
intent_history_text = _format_intent_history(classify_history)
|
||||
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
||||
|
||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||
raw = resp.content.strip().upper()
|
||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
||||
logger.info(f"[classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
||||
return {"query_type": query_type, "use_editor_context": use_editor_ctx}
|
||||
|
||||
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
||||
return {
|
||||
"query_type": query_type,
|
||||
"use_editor_context": use_editor_ctx,
|
||||
"classify_history": classify_history + [entry],
|
||||
}
|
||||
|
||||
def reformulate(state: AgentState) -> AgentState:
|
||||
user_msg = state["messages"][-1]
|
||||
|
|
@ -323,11 +359,25 @@ def build_graph(llm, embeddings, es_client, index_name):
|
|||
))
|
||||
else:
|
||||
enriched = CONVERSATIONAL_PROMPT
|
||||
resp = llm.invoke([enriched] + state["messages"])
|
||||
resp = _llm_conv.invoke([enriched] + state["messages"])
|
||||
logger.info("[conversational] from conversation")
|
||||
_persist(state, resp)
|
||||
return {"messages": [resp]}
|
||||
|
||||
def respond_platform(state):
    """Answer account/usage/platform queries with the lighter model, no RAG.

    When the platform has injected account data via ``extra_context``, it
    is appended to PLATFORM_PROMPT inside an <extra_context> tag so the
    model treats it as the primary data source.
    """
    extra_context = state.get("extra_context", "")
    prompt = PLATFORM_PROMPT
    if extra_context:
        enriched_content = (
            PLATFORM_PROMPT.content
            + f"\n\n<extra_context>\n{extra_context}\n</extra_context>"
        )
        prompt = SystemMessage(content=enriched_content)
    resp = _llm_conv.invoke([prompt] + state["messages"])
    logger.info("[platform] platform/account query")
    _persist(state, resp)
    return {"messages": [resp]}
|
||||
|
||||
def route_by_type(state):
    """Conditional-edge selector: return the classified query type.

    Falls back to RETRIEVAL only when the key is absent entirely; an
    explicitly stored value (even an empty string) is returned as-is.
    """
    default_route = "RETRIEVAL"
    return state.get("query_type", default_route)
|
||||
|
||||
|
|
@ -343,6 +393,7 @@ def build_graph(llm, embeddings, es_client, index_name):
|
|||
graph_builder.add_node("generate", generate)
|
||||
graph_builder.add_node("generate_code", generate_code)
|
||||
graph_builder.add_node("respond_conversational", respond_conversational)
|
||||
graph_builder.add_node("respond_platform", respond_platform)
|
||||
|
||||
graph_builder.set_entry_point("classify")
|
||||
|
||||
|
|
@ -353,6 +404,7 @@ def build_graph(llm, embeddings, es_client, index_name):
|
|||
"RETRIEVAL": "reformulate",
|
||||
"CODE_GENERATION": "reformulate",
|
||||
"CONVERSATIONAL": "respond_conversational",
|
||||
"PLATFORM": "respond_platform",
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -370,6 +422,7 @@ def build_graph(llm, embeddings, es_client, index_name):
|
|||
graph_builder.add_edge("generate", END)
|
||||
graph_builder.add_edge("generate_code", END)
|
||||
graph_builder.add_edge("respond_conversational", END)
|
||||
graph_builder.add_edge("respond_platform", END)
|
||||
|
||||
return graph_builder.compile()
|
||||
|
||||
|
|
@ -385,14 +438,32 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
|||
history_msgs = messages[:-1]
|
||||
selected_text = state.get("selected_text", "")
|
||||
|
||||
history_text = format_history_for_classify(history_msgs) if history_msgs else "(no history)"
|
||||
prompt_content = _build_classify_prompt(question, history_text, selected_text)
|
||||
if _is_platform_query(question):
|
||||
logger.info(f"[prepare/classify] platform prefix detected -> PLATFORM")
|
||||
classify_history = state.get("classify_history") or []
|
||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
||||
entry: ClassifyEntry = {"type": "PLATFORM", "topic": topic_snippet}
|
||||
return {
|
||||
"query_type": "PLATFORM",
|
||||
"use_editor_context": False,
|
||||
"classify_history": classify_history + [entry],
|
||||
}
|
||||
|
||||
classify_history = state.get("classify_history") or []
|
||||
topic_snippet = question.strip()[:60].replace("\n", " ")
|
||||
intent_history_text = _format_intent_history(classify_history)
|
||||
prompt_content = _build_classify_prompt(question, intent_history_text, selected_text)
|
||||
|
||||
resp = llm.invoke([SystemMessage(content=prompt_content)])
|
||||
raw = resp.content.strip().upper()
|
||||
query_type, use_editor_ctx = _parse_query_type(raw)
|
||||
logger.info(f"[prepare/classify] selected={bool(selected_text)} raw='{raw}' -> {query_type} editor={use_editor_ctx}")
|
||||
return {"query_type": query_type, "use_editor_context": use_editor_ctx}
|
||||
entry: ClassifyEntry = {"type": query_type, "topic": topic_snippet}
|
||||
return {
|
||||
"query_type": query_type,
|
||||
"use_editor_context": use_editor_ctx,
|
||||
"classify_history": classify_history + [entry],
|
||||
}
|
||||
|
||||
def reformulate(state: AgentState) -> AgentState:
|
||||
user_msg = state["messages"][-1]
|
||||
|
|
@ -450,6 +521,7 @@ def build_prepare_graph(llm, embeddings, es_client, index_name):
|
|||
"RETRIEVAL": "reformulate",
|
||||
"CODE_GENERATION": "reformulate",
|
||||
"CONVERSATIONAL": "skip_retrieve",
|
||||
"PLATFORM": "skip_retrieve",
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -479,6 +551,17 @@ def build_final_messages(state: AgentState) -> list:
|
|||
enriched = CONVERSATIONAL_PROMPT
|
||||
return [enriched] + messages
|
||||
|
||||
if query_type == "PLATFORM":
|
||||
extra_context = state.get("extra_context", "")
|
||||
if extra_context:
|
||||
enriched = SystemMessage(content=(
|
||||
PLATFORM_PROMPT.content +
|
||||
f"\n\n<extra_context>\n{extra_context}\n</extra_context>"
|
||||
))
|
||||
else:
|
||||
enriched = PLATFORM_PROMPT
|
||||
return [enriched] + messages
|
||||
|
||||
use_editor = state.get("use_editor_context", False)
|
||||
if query_type == "CODE_GENERATION":
|
||||
prompt = _build_generation_prompt(
|
||||
|
|
|
|||
|
|
@ -3,30 +3,44 @@ from langchain_core.messages import SystemMessage
|
|||
|
||||
CLASSIFY_PROMPT_TEMPLATE = (
|
||||
"<role>\n"
|
||||
"You are a query classifier for an AVAP language assistant. "
|
||||
"Your only job is to classify the user message into one of three categories "
|
||||
"and determine whether the user is explicitly asking about the editor code.\n"
|
||||
"You are an intent classifier. Classify the CURRENT message based solely on "
|
||||
"its own content and purpose. Do not assume the new message continues the "
|
||||
"previous topic — each message must be evaluated independently.\n"
|
||||
"</role>\n\n"
|
||||
|
||||
"<history_rule>\n"
|
||||
"The conversation history shows the intent of previous turns. "
|
||||
"Use it ONLY to resolve ambiguous references in the current message "
|
||||
"(e.g. 'this', 'esto', 'that function', 'lo anterior'). "
|
||||
"Do NOT use it to predict or bias the category of the current message. "
|
||||
"A new message can belong to a completely different category than the previous ones.\n"
|
||||
"</history_rule>\n\n"
|
||||
|
||||
"<platform_priority_rule>\n"
|
||||
"If the current message contains usage percentages, account metrics, consumption "
|
||||
"figures, quota data, subscription details, or billing information — classify it "
|
||||
"as PLATFORM regardless of any other signal, including conversation history.\n"
|
||||
"</platform_priority_rule>\n\n"
|
||||
|
||||
"<categories>\n"
|
||||
"RETRIEVAL — the user is asking about AVAP concepts, documentation, syntax rules, "
|
||||
"or how something works. They want an explanation, not code.\n"
|
||||
"RETRIEVAL — purpose: understand AVAP language documentation, syntax, or behavior.\n"
|
||||
"Examples: 'What is addVar?', 'How does registerEndpoint work?', "
|
||||
"'What is the difference between if() modes?'\n\n"
|
||||
|
||||
"CODE_GENERATION — the user is asking to generate, write, create, build, or show "
|
||||
"an example of an AVAP script, function, API, or code snippet. "
|
||||
"They want working code as output.\n"
|
||||
"CODE_GENERATION — purpose: produce working AVAP code.\n"
|
||||
"Examples: 'Write an API that returns hello world', "
|
||||
"'Generate a function that queries the DB', "
|
||||
"'Show me how to create an endpoint', "
|
||||
"'dame un ejemplo de codigo', 'escribeme un script', "
|
||||
"'dime como seria un API', 'genera un API', 'como haria'\n\n"
|
||||
"'Generate a function that queries the DB', 'dame un ejemplo de codigo'\n\n"
|
||||
|
||||
"CONVERSATIONAL — the user is following up on the previous answer. "
|
||||
"They want a reformulation, summary, or elaboration of what was already said.\n"
|
||||
"Examples: 'can you explain that?', 'en menos palabras', "
|
||||
"'describe it in your own words', 'what did you mean?'\n"
|
||||
"CONVERSATIONAL — purpose: reformulate or continue what was already discussed.\n"
|
||||
"Examples: 'can you explain that?', 'en menos palabras', 'what did you mean?'\n\n"
|
||||
|
||||
"PLATFORM — purpose: obtain information or insight about the user's account, "
|
||||
"usage, limits, metrics, consumption, quota, billing, or platform status. "
|
||||
"This includes messages where platform data is embedded and analysis is requested.\n"
|
||||
"Examples: 'what plan am I on?', 'analyze my account limits and consumption', "
|
||||
"'You have a project usage percentage of 20%, provide an insight', "
|
||||
"'Your quota is 80% used, give a recommendation', "
|
||||
"'cuantas llamadas llevo este mes', 'estado de mi cuenta'\n"
|
||||
"</categories>\n\n"
|
||||
|
||||
"<editor_rule>\n"
|
||||
|
|
@ -38,17 +52,15 @@ CLASSIFY_PROMPT_TEMPLATE = (
|
|||
"'explain this', 'what does this do', 'que hace esto', "
|
||||
"'como mejoro esto', 'el codigo del editor', 'lo que tengo aqui', "
|
||||
"'this selection', 'lo seleccionado', or similar.\n"
|
||||
"Answer NO_EDITOR in all other cases — including general AVAP questions, "
|
||||
"code generation requests, and conversational follow-ups that do not "
|
||||
"refer to specific editor code.\n"
|
||||
"Answer NO_EDITOR in all other cases.\n"
|
||||
"</editor_rule>\n\n"
|
||||
|
||||
"<output_rule>\n"
|
||||
"Your entire response must be exactly two words separated by a single space.\n"
|
||||
"First word: RETRIEVAL, CODE_GENERATION, or CONVERSATIONAL.\n"
|
||||
"First word: RETRIEVAL, CODE_GENERATION, CONVERSATIONAL, or PLATFORM.\n"
|
||||
"Second word: EDITOR or NO_EDITOR.\n"
|
||||
"Valid examples: 'RETRIEVAL NO_EDITOR', 'CODE_GENERATION EDITOR', "
|
||||
"'CONVERSATIONAL NO_EDITOR'.\n"
|
||||
"'CONVERSATIONAL NO_EDITOR', 'PLATFORM NO_EDITOR'.\n"
|
||||
"No other output. No punctuation. No explanation.\n"
|
||||
"</output_rule>\n\n"
|
||||
|
||||
|
|
@ -246,6 +258,28 @@ CONVERSATIONAL_PROMPT = SystemMessage(
|
|||
)
|
||||
|
||||
|
||||
# System prompt for PLATFORM queries (account, subscription, usage metrics).
# These queries bypass RAG entirely; any account data arrives pre-injected
# by the caller inside an <extra_context> tag appended to this content.
PLATFORM_PROMPT = SystemMessage(
    content=(
        "<role>\n"
        "You are a helpful AVAP platform assistant. "
        "You help users understand their account, subscription, usage metrics, and platform status.\n"
        "</role>\n\n"

        "<task>\n"
        "Answer the user's question about the platform using the information in <extra_context> "
        "if available. If the information is not available, say so clearly and suggest where "
        "they can find it (e.g. the platform dashboard or support).\n"
        "</task>\n\n"

        "<rules>\n"
        "- Use <extra_context> as the primary source for account/metrics data.\n"
        "- Keep the same language the user is using.\n"
        "- Be concise and direct.\n"
        "- Do not invent account data.\n"
        "</rules>"
    )
)
|
||||
|
||||
GENERATE_PROMPT = SystemMessage(
|
||||
content=(
|
||||
"<role>\n"
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from langchain_core.messages import AIMessage
|
|||
|
||||
from utils.llm_factory import create_chat_model
|
||||
from utils.emb_factory import create_embedding_model
|
||||
from graph import build_graph, build_prepare_graph, build_final_messages, session_store
|
||||
from graph import build_graph, build_prepare_graph, build_final_messages, session_store, classify_history_store
|
||||
|
||||
from evaluate import run_evaluation
|
||||
|
||||
|
|
@ -39,6 +39,18 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
validate_model_on_init=True,
|
||||
)
|
||||
|
||||
conv_model = os.getenv("OLLAMA_MODEL_NAME_CONVERSATIONAL")
|
||||
if conv_model:
|
||||
self.llm_conversational = create_chat_model(
|
||||
provider="ollama",
|
||||
model=conv_model,
|
||||
base_url=os.getenv("OLLAMA_URL"),
|
||||
temperature=0,
|
||||
)
|
||||
logger.info(f"[ENGINE] Conversational model: {conv_model}")
|
||||
else:
|
||||
self.llm_conversational = self.llm
|
||||
|
||||
self.embeddings = create_embedding_model(
|
||||
provider="ollama",
|
||||
model=os.getenv("OLLAMA_EMB_MODEL_NAME"),
|
||||
|
|
@ -65,6 +77,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
embeddings = self.embeddings,
|
||||
es_client = self.es_client,
|
||||
index_name = self.index_name,
|
||||
llm_conversational = self.llm_conversational,
|
||||
)
|
||||
|
||||
self.prepare_graph = build_prepare_graph(
|
||||
|
|
@ -110,6 +123,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
|
||||
try:
|
||||
history = list(session_store.get(session_id, []))
|
||||
classify_history = list(classify_history_store.get(session_id, []))
|
||||
logger.info(f"[AskAgent] conversation: {len(history)} previous messages.")
|
||||
|
||||
initial_state = {
|
||||
|
|
@ -118,6 +132,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
"reformulated_query": "",
|
||||
"context": "",
|
||||
"query_type": "",
|
||||
"classify_history": classify_history,
|
||||
|
||||
"editor_content": editor_content,
|
||||
"selected_text": selected_text,
|
||||
|
|
@ -131,6 +146,9 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
result_text = getattr(last_msg, "content", str(last_msg)) \
|
||||
if last_msg else ""
|
||||
|
||||
if session_id:
|
||||
classify_history_store[session_id] = final_state.get("classify_history", classify_history)
|
||||
|
||||
logger.info(f"[AskAgent] query_type={final_state.get('query_type')} "
|
||||
f"answer='{result_text[:100]}'")
|
||||
|
||||
|
|
@ -181,6 +199,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
|
||||
try:
|
||||
history = list(session_store.get(session_id, []))
|
||||
classify_history = list(classify_history_store.get(session_id, []))
|
||||
logger.info(f"[AskAgentStream] conversation: {len(history)} previous messages.")
|
||||
|
||||
initial_state = {
|
||||
|
|
@ -189,6 +208,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
"reformulated_query": "",
|
||||
"context": "",
|
||||
"query_type": "",
|
||||
"classify_history": classify_history,
|
||||
|
||||
"editor_content": editor_content,
|
||||
"selected_text": selected_text,
|
||||
|
|
@ -205,7 +225,10 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
final_messages = build_final_messages(prepared)
|
||||
full_response = []
|
||||
|
||||
for chunk in self.llm.stream(final_messages):
|
||||
query_type = prepared.get("query_type", "RETRIEVAL")
|
||||
active_llm = self.llm_conversational if query_type in ("CONVERSATIONAL", "PLATFORM") else self.llm
|
||||
|
||||
for chunk in active_llm.stream(final_messages):
|
||||
token = chunk.content
|
||||
if token:
|
||||
full_response.append(token)
|
||||
|
|
@ -219,6 +242,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
session_store[session_id] = (
|
||||
list(prepared["messages"]) + [AIMessage(content=complete_text)]
|
||||
)
|
||||
classify_history_store[session_id] = prepared.get("classify_history", classify_history)
|
||||
|
||||
logger.info(
|
||||
f"[AskAgentStream] done — "
|
||||
|
|
|
|||
|
|
@ -3,6 +3,11 @@ from typing import TypedDict, Annotated
|
|||
from langgraph.graph.message import add_messages
|
||||
|
||||
|
||||
class ClassifyEntry(TypedDict):
    """One past classifier decision: intent type plus a short topic snippet.

    A list of these forms the compact intent history handed to the
    classifier instead of raw conversation messages.
    """

    # RETRIEVAL | CODE_GENERATION | CONVERSATIONAL | PLATFORM
    type: str
    # brief subject extracted from the question (~60 chars)
    topic: str
|
||||
|
||||
|
||||
class AgentState(TypedDict):
|
||||
# -- CORE
|
||||
messages: Annotated[list, add_messages]
|
||||
|
|
@ -10,6 +15,10 @@ class AgentState(TypedDict):
|
|||
context: str
|
||||
query_type: str
|
||||
session_id: str
|
||||
# -- CLASSIFIER INTENT HISTORY
|
||||
# Compact trace of past decisions. Gives the classifier the conversation
|
||||
# thread without passing raw message content (which biases small models).
|
||||
classify_history: list[ClassifyEntry]
|
||||
# -- OPEN AI API
|
||||
editor_content: str
|
||||
selected_text: str
|
||||
|
|
|
|||
|
|
@ -22,6 +22,13 @@ class OllamaChatFactory(BaseProviderFactory):
|
|||
return ChatOllama(model=model, **kwargs)
|
||||
|
||||
|
||||
class AnthropicChatFactory(BaseProviderFactory):
    """Provider factory producing Anthropic chat models."""

    def create(self, model: str, **kwargs: Any):
        # Deferred import so langchain_anthropic is only required when
        # this provider is actually selected.
        from langchain_anthropic import ChatAnthropic

        return ChatAnthropic(model=model, **kwargs)
|
||||
|
||||
|
||||
class BedrockChatFactory(BaseProviderFactory):
|
||||
def create(self, model: str, **kwargs: Any):
|
||||
from langchain_aws import ChatBedrockConverse
|
||||
|
|
|
|||
|
|
@ -0,0 +1,98 @@
|
|||
# ADR-0008: Adaptive Query Routing with Intent History and Model Specialization
|
||||
|
||||
**Date:** 2026-04-09
|
||||
**Status:** Accepted
|
||||
**Deciders:** Rafael Ruiz (CTO)
|
||||
**Related ADRs:** ADR-0002 (Two-Phase Streaming), ADR-0003 (Hybrid Retrieval RRF)
|
||||
|
||||
---
|
||||
|
||||
## Context
|
||||
|
||||
The assistance engine previously used a single Ollama model (`qwen3:1.7b`) for all query types and a single LLM-based classifier that received raw conversation history. Two problems emerged in production:
|
||||
|
||||
### Problem 1 — Model oversizing for lightweight queries
|
||||
|
||||
Platform queries (account status, usage metrics, subscription data) and conversational follow-ups do not require retrieval or a large model. Running `qwen3:1.7b` for a one-sentence platform insight wastes resources and adds latency.
|
||||
|
||||
### Problem 2 — Classifier bias from raw message history
|
||||
|
||||
When the classifier received raw conversation messages as history, a small model (1.7B parameters) exhibited **anchoring bias**: it would classify new messages as the same type as recent messages, regardless of the actual content of the new query. This caused platform queries (`"You have a project usage percentage of 20%, provide a recommendation"`) to be misclassified as `RETRIEVAL` or `CODE_GENERATION` during sessions that had previously handled AVAP language questions.
|
||||
|
||||
Root cause: passing full message content to a small classifier is too noisy. The model uses conversation topic as a proxy for intent type.
|
||||
|
||||
---
|
||||
|
||||
## Decision
|
||||
|
||||
### 1. New query type: `PLATFORM`
|
||||
|
||||
A fourth classification category is introduced alongside `RETRIEVAL`, `CODE_GENERATION`, and `CONVERSATIONAL`:
|
||||
|
||||
| Type | Purpose | RAG | Model |
|
||||
|---|---|---|---|
|
||||
| `RETRIEVAL` | AVAP language documentation | Yes | `OLLAMA_MODEL_NAME` |
|
||||
| `CODE_GENERATION` | Produce working AVAP code | Yes | `OLLAMA_MODEL_NAME` |
|
||||
| `CONVERSATIONAL` | Rephrase / continue prior answer | No | `OLLAMA_MODEL_NAME_CONVERSATIONAL` |
|
||||
| `PLATFORM` | Account, metrics, usage, billing | No | `OLLAMA_MODEL_NAME_CONVERSATIONAL` |
|
||||
|
||||
`PLATFORM` queries skip RAG entirely and are served with a dedicated `PLATFORM_PROMPT` that instructs the model to use `extra_context` (where user account data is injected) as primary source.
|
||||
|
||||
### 2. Model specialization via environment variables
|
||||
|
||||
Two model slots are configured independently:
|
||||
|
||||
```
|
||||
OLLAMA_MODEL_NAME=qwen3:1.7b # RETRIEVAL + CODE_GENERATION
|
||||
OLLAMA_MODEL_NAME_CONVERSATIONAL=qwen3:0.6b # CONVERSATIONAL + PLATFORM
|
||||
```
|
||||
|
||||
If `OLLAMA_MODEL_NAME_CONVERSATIONAL` is not set, both slots fall back to `OLLAMA_MODEL_NAME` (backward compatible).
|
||||
|
||||
### 3. Intent history instead of raw message history for classification
|
||||
|
||||
The classifier no longer receives raw conversation messages. Instead, a compact **intent history** (`classify_history`) is maintained per session:
|
||||
|
||||
```
|
||||
[RETRIEVAL] "What is addVar in AVAP?"
|
||||
[CODE_GENERATION] "Write an API endpoint that retur"
|
||||
[PLATFORM] "You have a project usage percentag"
|
||||
```
|
||||
|
||||
Each entry stores only the `type` and a 60-character topic snippet. This gives the classifier the conversational thread (useful for resolving ambiguous references like "this", "esto", "lo anterior") without the topical noise that causes anchoring bias.
|
||||
|
||||
`classify_history` is persisted in `classify_history_store` (parallel to `session_store`) and passed in `AgentState` across turns.
|
||||
|
||||
### 4. Classifier prompt redesign
|
||||
|
||||
The prompt now includes:
|
||||
|
||||
- **`<history_rule>`** — explicit instruction: use history only to resolve ambiguous references, not to predict the category of the new message
|
||||
- **`<platform_priority_rule>`** — hard override: if the message contains usage percentages, account metrics, quota data, or billing information, classify as `PLATFORM` regardless of history
|
||||
- **`<step1_purpose>`** replaced by inline role instruction that each message must be evaluated independently
|
||||
|
||||
### 5. Fast-path for known platform prefixes
|
||||
|
||||
Queries containing `"you are a direct and concise assistant"` (a system-injected prefix used by the platform) are classified as `PLATFORM` deterministically without invoking the LLM classifier. This is justified because this prefix is controlled by the platform itself, not by user input, so deterministic detection is both correct and cheaper.
|
||||
|
||||
---
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
- Platform and conversational queries are served by a smaller, faster model
|
||||
- Classifier bias from conversation history is eliminated while preserving the ability to resolve ambiguous references
|
||||
- `PLATFORM` queries never hit Elasticsearch, reducing unnecessary retrieval load
|
||||
- The system is more predictable: platform-injected prompts are classified in O(1) without an LLM call
|
||||
|
||||
### Negative / Trade-offs
|
||||
|
||||
- `classify_history` adds a small amount of state per session — the stored list grows with the session, though only the last 6 entries are rendered into the classifier prompt
|
||||
- Two model slots mean two warm-up calls at startup if models differ
|
||||
- The `qwen3:1.7b` classifier can still misclassify edge cases where no platform signals are present in the text — this is inherent to using a 1.7B model for semantic classification
|
||||
|
||||
### Open questions
|
||||
|
||||
- Whether the classifier should be upgraded to a more capable model in the future (at the cost of latency/resources)
|
||||
- Whether `PLATFORM` should eventually split into sub-types (e.g. `PLATFORM_METRICS` vs `PLATFORM_BILLING`) as the platform data schema grows
|
||||
Loading…
Reference in New Issue