Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev

This commit is contained in:
acano 2026-03-03 10:30:01 +01:00
commit 203ba4a45c
3 changed files with 1721 additions and 1 deletions

View File

@ -0,0 +1,868 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9ea4c7c5",
"metadata": {},
"source": [
"# n00 Dual Index Ingestion v1\n",
"\n",
"This notebook implements a dual-index ingestion strategy for RAG:\n",
"- **Docs index** for conceptual documentation chunks\n",
"- **Code index** for code chunks with parent/child hierarchy\n",
"\n",
"It is designed to support query decomposition when there is no exact solution and the model must synthesize ideas from both docs and code."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f7602ef",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import uuid\n",
"from dataclasses import dataclass\n",
"from datetime import datetime, timezone\n",
"from pathlib import Path\n",
"from typing import Any, Dict, Iterable, List, Optional, Tuple\n",
"\n",
"import torch\n",
"import torch.nn.functional as F\n",
"from elasticsearch import Elasticsearch\n",
"from elasticsearch.helpers import bulk\n",
"from loguru import logger\n",
"from transformers import AutoConfig, AutoModel, AutoTokenizer\n",
"from src.config import PROJ_ROOT, RAW_DIR, OLLAMA_EMB_MODEL_NAME, ELASTICSEARCH_LOCAL_URL\n",
"\n",
"\n",
"DOCS_INDEX = os.getenv(\"ELASTICSEARCH_DOCS_INDEX\", \"avap_docs_v1\")\n",
"CODE_INDEX = os.getenv(\"ELASTICSEARCH_CODE_INDEX\", \"avap_code_v1\")\n",
"ES_URL = ELASTICSEARCH_LOCAL_URL\n",
"RAW_DOCS_DIR = RAW_DIR / \"docs\"\n",
"RAW_CODE_DIR = RAW_DIR / \"code\"\n",
"\n",
"config = AutoConfig.from_pretrained(OLLAMA_EMB_MODEL_NAME)\n",
"EMBEDDING_DIM = int(config.hidden_size)\n",
"logger.info(f\"Embedding model: {OLLAMA_EMB_MODEL_NAME}\")\n",
"logger.info(f\"Embedding dim: {EMBEDDING_DIM}\")\n",
"logger.info(f\"Raw docs dir: {RAW_DOCS_DIR}\")\n",
"logger.info(f\"Raw code dir: {RAW_CODE_DIR}\")"
]
},
{
"cell_type": "markdown",
"id": "7dd24f58",
"metadata": {},
"source": [
"## Domain models"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aa1acca",
"metadata": {},
"outputs": [],
"source": [
"@dataclass(frozen=True)\n",
"class DocChunk:\n",
" chunk_id: str\n",
" doc_id: str\n",
" text: str\n",
" source: str\n",
" metadata: Dict[str, Any]\n",
"\n",
"\n",
"@dataclass(frozen=True)\n",
"class CodeChunk:\n",
" chunk_id: str\n",
" doc_id: str\n",
" text: str\n",
" source: str\n",
" metadata: Dict[str, Any]\n",
" parent_id: Optional[str]\n",
" chunk_level: str\n",
" language: str\n",
" start_line: int\n",
" end_line: int\n",
" chunk_role: str"
]
},
{
"cell_type": "markdown",
"id": "ac97ae4e",
"metadata": {},
"source": [
"## Utilities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a975c8ee",
"metadata": {},
"outputs": [],
"source": [
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def clean_text(text: str) -> str:
    """Collapse all whitespace runs (incl. non-breaking spaces) into single spaces."""
    without_nbsp = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()


def token_window_chunking(
    text: str,
    tokenizer: AutoTokenizer,
    chunk_size: int = 1000,
    overlap: int = 150,
) -> List[str]:
    """Split *text* into overlapping windows of at most *chunk_size* tokens.

    Consecutive windows share *overlap* tokens so context at chunk
    boundaries is not lost. Blank pieces are dropped.

    Raises:
        ValueError: if chunk_size is not strictly larger than overlap
            (the window would never advance).
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")

    ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(ids)
    pieces: List[str] = []
    cursor = 0

    while cursor < total:
        window_end = min(cursor + chunk_size, total)
        pieces.append(tokenizer.decode(ids[cursor:window_end], skip_special_tokens=True))
        if window_end >= total:
            break
        cursor = window_end - overlap

    return [piece for piece in pieces if piece.strip()]
]
},
{
"cell_type": "markdown",
"id": "cfe2bfa9",
"metadata": {},
"source": [
"## Code chunking with parent/child hierarchy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33acb0ac",
"metadata": {},
"outputs": [],
"source": [
"def _line_windows(\n",
" lines: List[str],\n",
" max_lines: int,\n",
" overlap: int\n",
") -> List[Tuple[int, int, str]]:\n",
" if max_lines <= overlap:\n",
" raise ValueError(\"max_lines must be greater than overlap\")\n",
"\n",
" windows: List[Tuple[int, int, str]] = []\n",
" start = 0\n",
"\n",
" while start < len(lines):\n",
" end = min(start + max_lines, len(lines))\n",
" text = \"\\n\".join(lines[start:end]).strip()\n",
" if text:\n",
" windows.append((start + 1, end, text))\n",
"\n",
" if end == len(lines):\n",
" break\n",
"\n",
" start = end - overlap\n",
"\n",
" return windows\n",
"\n",
"\n",
def infer_chunk_role(text: str) -> str:
    """Heuristically classify a code chunk by its dominant role.

    Returns one of: "error" (exception handling), "init" (imports),
    "io" (printing/logging), or "logic" (everything else), checked in
    that order of precedence.

    The patterns for "error" and "init" are anchored to the start of a
    line: the previous bare substring checks tagged prose such as
    "read from the buffer" as an import ("from " matched anywhere),
    which skewed chunk_role statistics for documentation-heavy examples.
    """
    lowered = text.lower()
    # A try:/except statement at the start of a line signals error handling.
    if re.search(r"(?m)^\s*(?:try\s*:|except\b)", lowered):
        return "error"
    # Only real import statements count, not the word "from"/"import" in prose.
    if re.search(r"(?m)^\s*(?:import\s+\w|from\s+\S+\s+import\b)", lowered):
        return "init"
    if "print(" in lowered or "logger" in lowered:
        return "io"
    return "logic"
"\n",
"\n",
"def build_code_chunks(\n",
" code_text: str,\n",
" source: str,\n",
" metadata: Dict[str, Any],\n",
" language: str = \"unknown\",\n",
" parent_max_lines: int = 80,\n",
" parent_overlap: int = 12,\n",
" child_max_lines: int = 20,\n",
" child_overlap: int = 4,\n",
") -> List[CodeChunk]:\n",
" doc_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
" lines = code_text.splitlines()\n",
"\n",
" parent_windows = _line_windows(lines, parent_max_lines, parent_overlap)\n",
" chunks: List[CodeChunk] = []\n",
"\n",
" for parent_idx, (start_line, end_line, parent_text) in enumerate(parent_windows):\n",
" parent_id = f\"{doc_id}:p{parent_idx}\"\n",
" parent_chunk = CodeChunk(\n",
" chunk_id=parent_id,\n",
" doc_id=doc_id,\n",
" text=parent_text,\n",
" source=source,\n",
" metadata={\n",
" **metadata,\n",
" \"doc_id\": doc_id,\n",
" \"source_type\": \"code\",\n",
" \"updated_at\": now_iso(),\n",
" \"is_parent\": True,\n",
" },\n",
" parent_id=None,\n",
" chunk_level=\"parent\",\n",
" language=language,\n",
" start_line=start_line,\n",
" end_line=end_line,\n",
" chunk_role=\"logic\",\n",
" )\n",
" chunks.append(parent_chunk)\n",
"\n",
" parent_lines = parent_text.splitlines()\n",
" child_windows = _line_windows(parent_lines, child_max_lines, child_overlap)\n",
"\n",
" for child_idx, (child_start, child_end, child_text) in enumerate(child_windows):\n",
" global_start = start_line + child_start - 1\n",
" global_end = start_line + child_end - 1\n",
" child_id = f\"{parent_id}:c{child_idx}\"\n",
"\n",
" child_chunk = CodeChunk(\n",
" chunk_id=child_id,\n",
" doc_id=doc_id,\n",
" text=child_text,\n",
" source=source,\n",
" metadata={\n",
" **metadata,\n",
" \"doc_id\": doc_id,\n",
" \"source_type\": \"code\",\n",
" \"updated_at\": now_iso(),\n",
" \"is_parent\": False,\n",
" },\n",
" parent_id=parent_id,\n",
" chunk_level=\"child\",\n",
" language=language,\n",
" start_line=global_start,\n",
" end_line=global_end,\n",
" chunk_role=infer_chunk_role(child_text),\n",
" )\n",
" chunks.append(child_chunk)\n",
"\n",
" return chunks"
]
},
{
"cell_type": "markdown",
"id": "27583e2b",
"metadata": {},
"source": [
"## Build documentation chunks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c452926",
"metadata": {},
"outputs": [],
"source": [
"def build_doc_chunks(\n",
" doc_text: str,\n",
" source: str,\n",
" metadata: Dict[str, Any],\n",
" tokenizer: AutoTokenizer,\n",
" chunk_size: int = 1000,\n",
" overlap: int = 150,\n",
") -> List[DocChunk]:\n",
" doc_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
" cleaned = clean_text(doc_text)\n",
" parts = token_window_chunking(\n",
" cleaned,\n",
" tokenizer=tokenizer,\n",
" chunk_size=chunk_size,\n",
" overlap=overlap,\n",
" )\n",
"\n",
" chunks: List[DocChunk] = []\n",
" for idx, part in enumerate(parts):\n",
" chunk_id = f\"{doc_id}:d{idx}\"\n",
" chunks.append(\n",
" DocChunk(\n",
" chunk_id=chunk_id,\n",
" doc_id=doc_id,\n",
" text=part,\n",
" source=source,\n",
" metadata={\n",
" **metadata,\n",
" \"doc_id\": doc_id,\n",
" \"source_type\": \"doc\",\n",
" \"updated_at\": now_iso(),\n",
" },\n",
" )\n",
" )\n",
"\n",
" return chunks"
]
},
{
"cell_type": "markdown",
"id": "0a4ee80e",
"metadata": {},
"source": [
"## Embedding utilities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be0e5adb",
"metadata": {},
"outputs": [],
"source": [
"_EMBED_CACHE: Dict[str, Any] = {}\n",
"\n",
"\n",
"def get_embedder(model_name: str = OLLAMA_EMB_MODEL_NAME) -> Tuple[AutoTokenizer, AutoModel, str]:\n",
" if model_name in _EMBED_CACHE:\n",
" return _EMBED_CACHE[model_name]\n",
"\n",
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
" tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)\n",
" model = AutoModel.from_pretrained(model_name).to(device)\n",
" model.eval()\n",
"\n",
" _EMBED_CACHE[model_name] = (tokenizer, model, device)\n",
" return tokenizer, model, device\n",
"\n",
"\n",
"def embed_texts(texts: List[str], batch_size: int = 32) -> List[List[float]]:\n",
" tokenizer, model, device = get_embedder()\n",
" vectors: List[List[float]] = []\n",
"\n",
" for idx in range(0, len(texts), batch_size):\n",
" batch = texts[idx: idx + batch_size]\n",
" with torch.no_grad():\n",
" encoded = tokenizer(\n",
" batch,\n",
" padding=True,\n",
" truncation=True,\n",
" return_tensors=\"pt\",\n",
" ).to(device)\n",
" output = model(**encoded)\n",
" mask = encoded[\"attention_mask\"].unsqueeze(-1)\n",
" pooled = (output.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)\n",
" normalized = F.normalize(pooled, p=2, dim=1)\n",
" vectors.extend(normalized.cpu().tolist())\n",
"\n",
" return vectors"
]
},
{
"cell_type": "markdown",
"id": "674b0561",
"metadata": {},
"source": [
"## Elasticsearch mappings (separated indices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11040f31",
"metadata": {},
"outputs": [],
"source": [
"DOCS_MAPPING = {\n",
" \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
" \"mappings\": {\n",
" \"properties\": {\n",
" \"doc_id\": {\"type\": \"keyword\"},\n",
" \"chunk_id\": {\"type\": \"keyword\"},\n",
" \"text\": {\"type\": \"text\"},\n",
" \"source\": {\"type\": \"keyword\"},\n",
" \"source_type\": {\"type\": \"keyword\"},\n",
" \"domain\": {\"type\": \"keyword\"},\n",
" \"version\": {\"type\": \"keyword\"},\n",
" \"tags\": {\"type\": \"keyword\"},\n",
" \"difficulty\": {\"type\": \"keyword\"},\n",
" \"updated_at\": {\"type\": \"date\"},\n",
" \"section_title\": {\"type\": \"keyword\"},\n",
" \"topic_path\": {\"type\": \"keyword\"},\n",
" \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
" \"embedding\": {\n",
" \"type\": \"dense_vector\",\n",
" \"dims\": EMBEDDING_DIM,\n",
" \"index\": True,\n",
" \"similarity\": \"cosine\",\n",
" },\n",
" }\n",
" },\n",
"}\n",
"\n",
"\n",
"CODE_MAPPING = {\n",
" \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
" \"mappings\": {\n",
" \"properties\": {\n",
" \"doc_id\": {\"type\": \"keyword\"},\n",
" \"chunk_id\": {\"type\": \"keyword\"},\n",
" \"example_id\": {\"type\": \"keyword\"},\n",
" \"example_title\": {\"type\": \"keyword\"},\n",
" \"parent_id\": {\"type\": \"keyword\"},\n",
" \"chunk_level\": {\"type\": \"keyword\"},\n",
" \"chunk_role\": {\"type\": \"keyword\"},\n",
" \"language\": {\"type\": \"keyword\"},\n",
" \"symbol_name\": {\"type\": \"keyword\"},\n",
" \"intent\": {\"type\": \"keyword\"},\n",
" \"dependencies\": {\"type\": \"keyword\"},\n",
" \"start_line\": {\"type\": \"integer\"},\n",
" \"end_line\": {\"type\": \"integer\"},\n",
" \"repo\": {\"type\": \"keyword\"},\n",
" \"file_path\": {\"type\": \"keyword\"},\n",
" \"source\": {\"type\": \"keyword\"},\n",
" \"source_type\": {\"type\": \"keyword\"},\n",
" \"domain\": {\"type\": \"keyword\"},\n",
" \"version\": {\"type\": \"keyword\"},\n",
" \"tags\": {\"type\": \"keyword\"},\n",
" \"difficulty\": {\"type\": \"keyword\"},\n",
" \"updated_at\": {\"type\": \"date\"},\n",
" \"text\": {\"type\": \"text\"},\n",
" \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
" \"embedding\": {\n",
" \"type\": \"dense_vector\",\n",
" \"dims\": EMBEDDING_DIM,\n",
" \"index\": True,\n",
" \"similarity\": \"cosine\",\n",
" },\n",
" }\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f78db43",
"metadata": {},
"outputs": [],
"source": [
def get_es_client() -> Elasticsearch:
    """Build an Elasticsearch client with retry-friendly timeouts."""
    return Elasticsearch(
        ES_URL,
        request_timeout=90,
        max_retries=5,
        retry_on_timeout=True,
    )


def recreate_index(es: Elasticsearch, index_name: str, mapping: Dict[str, Any]) -> None:
    """Drop *index_name* if it exists, then create it fresh from *mapping*.

    Destructive: any previously indexed documents in *index_name* are lost.

    Args:
        es: connected Elasticsearch client.
        index_name: target index name.
        mapping: dict with optional "settings" and "mappings" keys
            (DOCS_MAPPING / CODE_MAPPING above).
    """
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        logger.info(f"Deleted existing index: {index_name}")

    # The 8.x Python client deprecates the generic `body=` parameter; pass
    # the typed keyword arguments instead (consistent with the
    # `es.options(...)` usage elsewhere in this notebook).
    es.indices.create(
        index=index_name,
        settings=mapping.get("settings"),
        mappings=mapping.get("mappings"),
    )
    logger.info(f"Created index: {index_name}")
]
},
{
"cell_type": "markdown",
"id": "8a66d191",
"metadata": {},
"source": [
"## Indexing functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5db6e05e",
"metadata": {},
"outputs": [],
"source": [
"def index_doc_chunks(\n",
" es: Elasticsearch,\n",
" index_name: str,\n",
" chunks: List[DocChunk],\n",
" batch_size: int = 32,\n",
") -> None:\n",
" vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)\n",
"\n",
" def actions() -> Iterable[Dict[str, Any]]:\n",
" for chunk, vector in zip(chunks, vectors):\n",
" yield {\n",
" \"_op_type\": \"index\",\n",
" \"_index\": index_name,\n",
" \"_id\": chunk.chunk_id,\n",
" \"_source\": {\n",
" \"doc_id\": chunk.doc_id,\n",
" \"chunk_id\": chunk.chunk_id,\n",
" \"text\": chunk.text,\n",
" \"source\": chunk.source,\n",
" \"source_type\": \"doc\",\n",
" \"domain\": chunk.metadata.get(\"domain\"),\n",
" \"version\": chunk.metadata.get(\"version\"),\n",
" \"tags\": chunk.metadata.get(\"tags\", []),\n",
" \"difficulty\": chunk.metadata.get(\"difficulty\"),\n",
" \"updated_at\": chunk.metadata.get(\"updated_at\"),\n",
" \"section_title\": chunk.metadata.get(\"section_title\"),\n",
" \"topic_path\": chunk.metadata.get(\"topic_path\"),\n",
" \"metadata\": chunk.metadata,\n",
" \"embedding\": vector,\n",
" },\n",
" }\n",
"\n",
" bulk(es.options(request_timeout=180), actions())\n",
" logger.info(f\"Indexed {len(chunks)} doc chunks into {index_name}\")\n",
"\n",
"\n",
"def index_code_chunks(\n",
" es: Elasticsearch,\n",
" index_name: str,\n",
" chunks: List[CodeChunk],\n",
" batch_size: int = 32,\n",
") -> None:\n",
" vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)\n",
"\n",
" def actions() -> Iterable[Dict[str, Any]]:\n",
" for chunk, vector in zip(chunks, vectors):\n",
" yield {\n",
" \"_op_type\": \"index\",\n",
" \"_index\": index_name,\n",
" \"_id\": chunk.chunk_id,\n",
" \"_source\": {\n",
" \"doc_id\": chunk.doc_id,\n",
" \"chunk_id\": chunk.chunk_id,\n",
" \"example_id\": chunk.metadata.get(\"example_id\"),\n",
" \"example_title\": chunk.metadata.get(\"example_title\"),\n",
" \"parent_id\": chunk.parent_id,\n",
" \"chunk_level\": chunk.chunk_level,\n",
" \"chunk_role\": chunk.chunk_role,\n",
" \"language\": chunk.language,\n",
" \"symbol_name\": chunk.metadata.get(\"symbol_name\"),\n",
" \"intent\": chunk.metadata.get(\"intent\"),\n",
" \"dependencies\": chunk.metadata.get(\"dependencies\", []),\n",
" \"start_line\": chunk.start_line,\n",
" \"end_line\": chunk.end_line,\n",
" \"repo\": chunk.metadata.get(\"repo\"),\n",
" \"file_path\": chunk.metadata.get(\"file_path\"),\n",
" \"source\": chunk.source,\n",
" \"source_type\": \"code\",\n",
" \"domain\": chunk.metadata.get(\"domain\"),\n",
" \"version\": chunk.metadata.get(\"version\"),\n",
" \"tags\": chunk.metadata.get(\"tags\", []),\n",
" \"difficulty\": chunk.metadata.get(\"difficulty\"),\n",
" \"updated_at\": chunk.metadata.get(\"updated_at\"),\n",
" \"text\": chunk.text,\n",
" \"metadata\": chunk.metadata,\n",
" \"embedding\": vector,\n",
" },\n",
" }\n",
"\n",
" bulk(es.options(request_timeout=180), actions())\n",
" logger.info(f\"Indexed {len(chunks)} code chunks into {index_name}\")"
]
},
{
"cell_type": "markdown",
"id": "db3a45d4",
"metadata": {},
"source": [
"## Ingest from data/raw/docs and data/raw/code"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0e3b3e4",
"metadata": {},
"outputs": [],
"source": [
"def _natural_sort_key(path: Path) -> Tuple[int, str]:\n",
" match = re.match(r\"^(\\d+)_\", path.stem)\n",
" if match:\n",
" return int(match.group(1)), path.stem.lower()\n",
" return 10**9, path.stem.lower()\n",
"\n",
"\n",
def discover_raw_sources(
    docs_dir: Path,
    code_dir: Path,
) -> Tuple[List[Path], List[Path]]:
    """Locate the raw .txt documentation and code files to ingest.

    Both directories must exist and each must contain at least one .txt
    file. Files are returned in natural (numeric-prefix) order.

    Raises:
        FileNotFoundError: if either directory is missing.
        ValueError: if either directory contains no .txt files.
    """
    if not docs_dir.exists():
        raise FileNotFoundError(f"Docs directory not found: {docs_dir}")
    if not code_dir.exists():
        raise FileNotFoundError(f"Code directory not found: {code_dir}")

    docs_files = sorted(
        (candidate for candidate in docs_dir.glob("*.txt") if candidate.is_file()),
        key=_natural_sort_key,
    )
    code_files = sorted(
        (candidate for candidate in code_dir.glob("*.txt") if candidate.is_file()),
        key=_natural_sort_key,
    )

    if not docs_files:
        raise ValueError(f"No .txt documentation files found in {docs_dir}")
    if not code_files:
        raise ValueError(f"No .txt code files found in {code_dir}")

    return docs_files, code_files
"\n",
"\n",
def infer_example_title(prefix_text: str, fallback_title: str) -> str:
    """Derive a human-readable example title from text preceding a code fence.

    Scans up to the last eight non-blank lines nearest the fence (closest
    first), strips markdown decoration (headers, bullets, numbering, bold
    markers), and returns the first usable line truncated to 120
    characters. Lines that are just the literal "code snippet" are
    skipped. Falls back to *fallback_title* when nothing usable is found.
    """
    non_blank = [raw.strip() for raw in prefix_text.splitlines() if raw.strip()]

    for candidate in reversed(non_blank[-8:]):
        if candidate.lower() == "code snippet":
            continue
        title = re.sub(r"^[#\-\s>*]*", "", candidate)
        title = re.sub(r"^\d+[\.)]\s*", "", title)
        title = title.replace("**", "").strip()
        if title and title.lower() != "code snippet":
            return title[:120]

    return fallback_title
"\n",
"\n",
"def extract_code_examples(file_text: str, file_stem: str) -> List[Dict[str, str]]:\n",
" pattern = re.compile(\n",
" r\"```(?P<lang>[a-zA-Z0-9_+-]*)\\n(?P<code>.*?)```\",\n",
" re.DOTALL,\n",
" )\n",
" examples: List[Dict[str, str]] = []\n",
"\n",
" for idx, match in enumerate(pattern.finditer(file_text), start=1):\n",
" code = match.group(\"code\").strip()\n",
" if not code:\n",
" continue\n",
" fallback_title = f\"{file_stem} - Example {idx:03d}\"\n",
" prefix = file_text[max(0, match.start() - 600): match.start()]\n",
" title = infer_example_title(prefix, fallback_title)\n",
" language = match.group(\"lang\").strip() or \"avap\"\n",
" examples.append({\n",
" \"example_index\": idx,\n",
" \"title\": title,\n",
" \"language\": language,\n",
" \"code\": code,\n",
" })\n",
"\n",
" if examples:\n",
" return examples\n",
"\n",
" plain_text = file_text.strip()\n",
" if not plain_text:\n",
" return []\n",
"\n",
" return [\n",
" {\n",
" \"example_index\": 1,\n",
" \"title\": f\"{file_stem} - Example 001\",\n",
" \"language\": \"avap\",\n",
" \"code\": plain_text,\n",
" }\n",
" ]\n",
"\n",
"\n",
"def build_doc_chunks_from_raw(\n",
" docs_files: List[Path],\n",
" tokenizer: AutoTokenizer,\n",
" chunk_size: int = 900,\n",
" overlap: int = 120,\n",
") -> List[DocChunk]:\n",
" all_chunks: List[DocChunk] = []\n",
"\n",
" for idx, path in enumerate(docs_files, start=1):\n",
" title = path.stem.replace(\"_\", \" \")\n",
" text = path.read_text(encoding=\"utf-8\")\n",
"\n",
" chunks = build_doc_chunks(\n",
" doc_text=text,\n",
" source=str(path.relative_to(PROJ_ROOT)),\n",
" metadata={\n",
" \"doc_id\": f\"doc-{idx:03d}\",\n",
" \"domain\": \"avap\",\n",
" \"version\": \"v1\",\n",
" \"tags\": [\"documentation\"],\n",
" \"difficulty\": \"mixed\",\n",
" \"section_title\": title,\n",
" \"topic_path\": f\"docs/{path.stem}\",\n",
" },\n",
" tokenizer=tokenizer,\n",
" chunk_size=chunk_size,\n",
" overlap=overlap,\n",
" )\n",
" all_chunks.extend(chunks)\n",
"\n",
" return all_chunks\n",
"\n",
"\n",
"def build_code_chunks_from_raw(\n",
" code_files: List[Path],\n",
" child_max_lines: int = 14,\n",
" child_overlap: int = 3,\n",
") -> List[CodeChunk]:\n",
" all_chunks: List[CodeChunk] = []\n",
" example_counter = 0\n",
"\n",
" for code_file in code_files:\n",
" file_text = code_file.read_text(encoding=\"utf-8\")\n",
" examples = extract_code_examples(file_text, code_file.stem)\n",
"\n",
" for example in examples:\n",
" example_counter += 1\n",
" code_text = example[\"code\"]\n",
" line_count = max(1, len(code_text.splitlines()))\n",
" child_window = min(child_max_lines, line_count)\n",
" overlap_window = min(child_overlap, max(0, child_window - 1))\n",
"\n",
" code_chunks = build_code_chunks(\n",
" code_text=code_text,\n",
" source=str(code_file.relative_to(PROJ_ROOT)),\n",
" metadata={\n",
" \"doc_id\": f\"code-{example_counter:03d}\",\n",
" \"example_id\": f\"example-{example_counter:03d}\",\n",
" \"example_title\": example[\"title\"],\n",
" \"repo\": \"BRUNIX-AI/assistance-engine\",\n",
" \"file_path\": str(code_file.relative_to(PROJ_ROOT)),\n",
" \"symbol_name\": example[\"title\"],\n",
" \"intent\": \"avap code example\",\n",
" \"dependencies\": [],\n",
" \"domain\": \"avap\",\n",
" \"version\": \"v1\",\n",
" \"tags\": [\"code-example\"],\n",
" \"difficulty\": \"mixed\",\n",
" },\n",
" language=example[\"language\"],\n",
" parent_max_lines=line_count,\n",
" parent_overlap=0,\n",
" child_max_lines=child_window,\n",
" child_overlap=overlap_window,\n",
" )\n",
" all_chunks.extend(code_chunks)\n",
"\n",
" return all_chunks\n",
"\n",
"\n",
"tokenizer, _, _ = get_embedder()\n",
"docs_files, code_files = discover_raw_sources(RAW_DOCS_DIR, RAW_CODE_DIR)\n",
"\n",
"doc_chunks = build_doc_chunks_from_raw(docs_files, tokenizer=tokenizer)\n",
"code_chunks = build_code_chunks_from_raw(code_files)\n",
"\n",
"print(f\"Documentation files discovered: {len(docs_files)}\")\n",
"print(f\"Code files discovered: {len(code_files)}\")\n",
"print(f\"Doc chunks built: {len(doc_chunks)}\")\n",
"print(f\"Code chunks built (parent + child): {len(code_chunks)}\")\n",
"print(f\"Parent chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'parent')}\")\n",
"print(f\"Child chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'child')}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "573ead0e",
"metadata": {},
"outputs": [],
"source": [
"es = get_es_client()\n",
"recreate_index(es, DOCS_INDEX, DOCS_MAPPING)\n",
"recreate_index(es, CODE_INDEX, CODE_MAPPING)\n",
"\n",
"index_doc_chunks(es, DOCS_INDEX, doc_chunks, batch_size=16)\n",
"index_code_chunks(es, CODE_INDEX, code_chunks, batch_size=16)\n",
"\n",
"es.indices.refresh(index=DOCS_INDEX)\n",
"es.indices.refresh(index=CODE_INDEX)\n",
"\n",
"print(\"Dual index ingestion completed from data/raw.\")\n",
"print(f\"Docs index: {DOCS_INDEX}\")\n",
"print(f\"Code index: {CODE_INDEX}\")\n",
"print(f\"Indexed doc chunks: {len(doc_chunks)}\")\n",
"print(f\"Indexed code chunks: {len(code_chunks)}\")"
]
},
{
"cell_type": "markdown",
"id": "35f6cdee",
"metadata": {},
"source": [
"## Optional: hybrid retrieval across both indices"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "968a8727",
"metadata": {},
"outputs": [],
"source": [
def vector_search(
    es: Elasticsearch,
    index_name: str,
    query: str,
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """Dense-vector search over one index via a cosine script_score query.

    Embeds *query* with the shared embedder and scores every document with
    cosineSimilarity(query_vector, 'embedding') + 1.0 (shifted so scores
    stay non-negative). Returns the raw Elasticsearch hit dicts for the
    top_k results.

    Args:
        es: connected Elasticsearch client.
        index_name: index to search (docs or code).
        query: natural-language query text.
        top_k: number of hits to return.
    """
    query_vec = embed_texts([query], batch_size=1)[0]

    # The 8.x Python client deprecates the generic `body=` parameter; pass
    # `size=` and `query=` directly (consistent with the `es.options(...)`
    # usage elsewhere in this notebook).
    result = es.search(
        index=index_name,
        size=top_k,
        query={
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {"query_vector": query_vec},
                },
            }
        },
    )
    return result["hits"]["hits"]
"\n",
"\n",
"query = \"How do I handle normalization and edge cases in vector similarity?\"\n",
"doc_hits = vector_search(es, DOCS_INDEX, query, top_k=3)\n",
"code_hits = vector_search(es, CODE_INDEX, query, top_k=5)\n",
"\n",
"print(\"Top docs:\")\n",
"for hit in doc_hits:\n",
" print(\"-\", hit[\"_source\"].get(\"section_title\"), \"|\", hit[\"_score\"])\n",
"\n",
"print(\"\\nTop code chunks:\")\n",
"for hit in code_hits:\n",
" source = hit[\"_source\"]\n",
" print(\n",
" \"-\",\n",
" source.get(\"chunk_id\"),\n",
" \"| level=\",\n",
" source.get(\"chunk_level\"),\n",
" \"| parent=\",\n",
" source.get(\"parent_id\"),\n",
" \"| score=\",\n",
" hit.get(\"_score\"),\n",
" )"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'win32'",