Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev
This commit is contained in:
commit
203ba4a45c
|
|
@ -0,0 +1,868 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9ea4c7c5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# n00 Dual Index Ingestion v1\n",
|
||||
"\n",
|
||||
"This notebook implements a dual-index ingestion strategy for RAG:\n",
|
||||
"- **Docs index** for conceptual documentation chunks\n",
|
||||
"- **Code index** for code chunks with parent/child hierarchy\n",
|
||||
"\n",
|
||||
"It is designed to support query decomposition when there is no exact solution and the model must synthesize ideas from both docs and code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4f7602ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
import os
import re
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import torch
import torch.nn.functional as F
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from loguru import logger
from transformers import AutoConfig, AutoModel, AutoTokenizer
from src.config import PROJ_ROOT, RAW_DIR, OLLAMA_EMB_MODEL_NAME, ELASTICSEARCH_LOCAL_URL


# Index names are overridable via environment so dev/prod can point at
# different indices without code changes.
DOCS_INDEX = os.getenv("ELASTICSEARCH_DOCS_INDEX", "avap_docs_v1")
CODE_INDEX = os.getenv("ELASTICSEARCH_CODE_INDEX", "avap_code_v1")
ES_URL = ELASTICSEARCH_LOCAL_URL
# Raw ingestion inputs: data/raw/docs/*.txt and data/raw/code/*.txt.
RAW_DOCS_DIR = RAW_DIR / "docs"
RAW_CODE_DIR = RAW_DIR / "code"

# The dense_vector dims in both index mappings must match the embedding
# model's hidden size, so read it from the model config up front.
config = AutoConfig.from_pretrained(OLLAMA_EMB_MODEL_NAME)
EMBEDDING_DIM = int(config.hidden_size)
logger.info(f"Embedding model: {OLLAMA_EMB_MODEL_NAME}")
logger.info(f"Embedding dim: {EMBEDDING_DIM}")
logger.info(f"Raw docs dir: {RAW_DOCS_DIR}")
logger.info(f"Raw code dir: {RAW_CODE_DIR}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7dd24f58",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Domain models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7aa1acca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@dataclass(frozen=True)
class DocChunk:
    """One embeddable chunk of conceptual documentation (docs index)."""

    chunk_id: str  # unique id, format "{doc_id}:d{idx}"
    doc_id: str  # id of the source document this chunk came from
    text: str  # chunk text that gets embedded and indexed
    source: str  # repo-relative path of the source file
    metadata: Dict[str, Any]  # domain/version/tags/... copied into the ES document


@dataclass(frozen=True)
class CodeChunk:
    """One embeddable code chunk (code index), part of a parent/child hierarchy."""

    chunk_id: str  # "{doc_id}:p{i}" for parents, "{parent_id}:c{j}" for children
    doc_id: str  # id of the source code example
    text: str  # chunk text that gets embedded and indexed
    source: str  # repo-relative path of the source file
    metadata: Dict[str, Any]  # example/repo/tags/... copied into the ES document
    parent_id: Optional[str]  # None for parent chunks; parent's chunk_id for children
    chunk_level: str  # "parent" or "child"
    language: str  # language tag from the code fence (falls back to "avap")
    start_line: int  # 1-based start line within the code example
    end_line: int  # 1-based inclusive end line
    chunk_role: str  # heuristic role: "error" / "init" / "io" / "logic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ac97ae4e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Utilities"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a975c8ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 timestamp string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||
"\n",
|
||||
"\n",
|
||||
def clean_text(text: str) -> str:
    """Collapse all whitespace runs to single spaces and trim the ends.

    Non-breaking spaces (U+00A0) are converted to ordinary spaces first so
    the whitespace regex collapses them too.
    """
    without_nbsp = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
|
||||
"\n",
|
||||
"\n",
|
||||
def token_window_chunking(
    text: str,
    tokenizer: AutoTokenizer,
    chunk_size: int = 1000,
    overlap: int = 150,
) -> List[str]:
    """Split *text* into overlapping windows of at most *chunk_size* tokens.

    The text is tokenized once; consecutive windows share *overlap* tokens so
    sentences straddling a boundary appear in both chunks. Windows that decode
    to blank strings are dropped.

    Raises:
        ValueError: if chunk_size <= overlap (the window would never advance).
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    pieces: List[str] = []
    cursor = 0

    while cursor < total:
        stop = min(cursor + chunk_size, total)
        pieces.append(tokenizer.decode(token_ids[cursor:stop], skip_special_tokens=True))
        if stop == total:
            break
        # Step back by the overlap so adjacent windows share context.
        cursor = stop - overlap

    return [piece for piece in pieces if piece.strip()]
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cfe2bfa9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Code chunking with parent/child hierarchy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "33acb0ac",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _line_windows(\n",
|
||||
" lines: List[str],\n",
|
||||
" max_lines: int,\n",
|
||||
" overlap: int\n",
|
||||
") -> List[Tuple[int, int, str]]:\n",
|
||||
" if max_lines <= overlap:\n",
|
||||
" raise ValueError(\"max_lines must be greater than overlap\")\n",
|
||||
"\n",
|
||||
" windows: List[Tuple[int, int, str]] = []\n",
|
||||
" start = 0\n",
|
||||
"\n",
|
||||
" while start < len(lines):\n",
|
||||
" end = min(start + max_lines, len(lines))\n",
|
||||
" text = \"\\n\".join(lines[start:end]).strip()\n",
|
||||
" if text:\n",
|
||||
" windows.append((start + 1, end, text))\n",
|
||||
"\n",
|
||||
" if end == len(lines):\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" start = end - overlap\n",
|
||||
"\n",
|
||||
" return windows\n",
|
||||
"\n",
|
||||
"\n",
|
||||
def infer_chunk_role(text: str) -> str:
    """Heuristically label a code chunk by its dominant concern.

    Precedence: error handling > imports/initialisation > I/O > generic logic.
    Matching is on substrings of the lowercased text, so it is deliberately
    coarse (e.g. any "except" or "logger" mention counts).
    """
    lowered = text.lower()
    role_markers = (
        ("error", ("try:", "except")),
        ("init", ("import ", "from ")),
        ("io", ("print(", "logger")),
    )
    for role, needles in role_markers:
        if any(needle in lowered for needle in needles):
            return role
    return "logic"
|
||||
"\n",
|
||||
"\n",
|
||||
def build_code_chunks(
    code_text: str,
    source: str,
    metadata: Dict[str, Any],
    language: str = "unknown",
    parent_max_lines: int = 80,
    parent_overlap: int = 12,
    child_max_lines: int = 20,
    child_overlap: int = 4,
) -> List[CodeChunk]:
    """Split *code_text* into a flat list of parent and child CodeChunk records.

    Parents are large overlapping line windows of the whole example; each
    parent is further split into smaller overlapping child windows. Children
    carry their parent's chunk_id so retrieval can expand a child hit into
    its surrounding context.

    Returns parents and children interleaved (each parent followed by its
    children).
    """
    # Reuse the caller-supplied doc_id when present so ids are stable
    # across re-runs; otherwise mint a random one.
    doc_id = str(metadata.get("doc_id") or uuid.uuid4())
    lines = code_text.splitlines()

    parent_windows = _line_windows(lines, parent_max_lines, parent_overlap)
    chunks: List[CodeChunk] = []

    for parent_idx, (start_line, end_line, parent_text) in enumerate(parent_windows):
        parent_id = f"{doc_id}:p{parent_idx}"
        parent_chunk = CodeChunk(
            chunk_id=parent_id,
            doc_id=doc_id,
            text=parent_text,
            source=source,
            metadata={
                **metadata,
                "doc_id": doc_id,
                "source_type": "code",
                "updated_at": now_iso(),
                "is_parent": True,
            },
            parent_id=None,  # parents are roots of the hierarchy
            chunk_level="parent",
            language=language,
            start_line=start_line,
            end_line=end_line,
            chunk_role="logic",  # parents get a fixed role; only children are classified
        )
        chunks.append(parent_chunk)

        # Children are windows over the parent's own (already stripped) text.
        # NOTE(review): _line_windows strips the parent text, so if leading
        # blank lines were stripped the child's global line numbers below can
        # be offset from the original file — confirm if exact lines matter.
        parent_lines = parent_text.splitlines()
        child_windows = _line_windows(parent_lines, child_max_lines, child_overlap)

        for child_idx, (child_start, child_end, child_text) in enumerate(child_windows):
            # Translate child-relative line numbers back to example-global ones.
            global_start = start_line + child_start - 1
            global_end = start_line + child_end - 1
            child_id = f"{parent_id}:c{child_idx}"

            child_chunk = CodeChunk(
                chunk_id=child_id,
                doc_id=doc_id,
                text=child_text,
                source=source,
                metadata={
                    **metadata,
                    "doc_id": doc_id,
                    "source_type": "code",
                    "updated_at": now_iso(),
                    "is_parent": False,
                },
                parent_id=parent_id,
                chunk_level="child",
                language=language,
                start_line=global_start,
                end_line=global_end,
                chunk_role=infer_chunk_role(child_text),
            )
            chunks.append(child_chunk)

    return chunks
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "27583e2b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Build documentation chunks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5c452926",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def build_doc_chunks(
    doc_text: str,
    source: str,
    metadata: Dict[str, Any],
    tokenizer: AutoTokenizer,
    chunk_size: int = 1000,
    overlap: int = 150,
) -> List[DocChunk]:
    """Clean and chunk a documentation text into DocChunk records.

    The text is whitespace-normalised, split into overlapping token windows,
    and each window becomes a DocChunk whose metadata is the caller's
    metadata augmented with doc_id, source_type and an updated_at timestamp.
    """
    # Keep caller-provided doc_id stable; mint a random one otherwise.
    doc_id = str(metadata.get("doc_id") or uuid.uuid4())
    parts = token_window_chunking(
        clean_text(doc_text),
        tokenizer=tokenizer,
        chunk_size=chunk_size,
        overlap=overlap,
    )

    return [
        DocChunk(
            chunk_id=f"{doc_id}:d{idx}",
            doc_id=doc_id,
            text=part,
            source=source,
            metadata={
                **metadata,
                "doc_id": doc_id,
                "source_type": "doc",
                "updated_at": now_iso(),
            },
        )
        for idx, part in enumerate(parts)
    ]
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0a4ee80e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Embedding utilities"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "be0e5adb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Process-wide cache: model name -> (tokenizer, model, device), so repeated
# calls reuse the already-loaded weights instead of reloading from disk.
_EMBED_CACHE: Dict[str, Any] = {}


def get_embedder(model_name: str = OLLAMA_EMB_MODEL_NAME) -> Tuple[AutoTokenizer, AutoModel, str]:
    """Load (or fetch from cache) the tokenizer/model pair used for embeddings.

    Returns (tokenizer, model, device) where device is "cuda" when available,
    otherwise "cpu". The model is moved to that device and put in eval mode.
    """
    if model_name in _EMBED_CACHE:
        return _EMBED_CACHE[model_name]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()  # inference only; disables dropout etc.

    _EMBED_CACHE[model_name] = (tokenizer, model, device)
    return tokenizer, model, device
|
||||
"\n",
|
||||
"\n",
|
||||
"def embed_texts(texts: List[str], batch_size: int = 32) -> List[List[float]]:\n",
|
||||
" tokenizer, model, device = get_embedder()\n",
|
||||
" vectors: List[List[float]] = []\n",
|
||||
"\n",
|
||||
" for idx in range(0, len(texts), batch_size):\n",
|
||||
" batch = texts[idx: idx + batch_size]\n",
|
||||
" with torch.no_grad():\n",
|
||||
" encoded = tokenizer(\n",
|
||||
" batch,\n",
|
||||
" padding=True,\n",
|
||||
" truncation=True,\n",
|
||||
" return_tensors=\"pt\",\n",
|
||||
" ).to(device)\n",
|
||||
" output = model(**encoded)\n",
|
||||
" mask = encoded[\"attention_mask\"].unsqueeze(-1)\n",
|
||||
" pooled = (output.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)\n",
|
||||
" normalized = F.normalize(pooled, p=2, dim=1)\n",
|
||||
" vectors.extend(normalized.cpu().tolist())\n",
|
||||
"\n",
|
||||
" return vectors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "674b0561",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Elasticsearch mappings (separated indices)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "11040f31",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Mapping for the documentation index: flat keyword metadata for filtering,
# full-text "text" for BM25, and a dense_vector sized to the embedding model
# for cosine kNN / script_score search.
DOCS_MAPPING = {
    "settings": {"index": {"number_of_shards": 1}},  # single-node local cluster
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "text": {"type": "text"},
            "source": {"type": "keyword"},
            "source_type": {"type": "keyword"},
            "domain": {"type": "keyword"},
            "version": {"type": "keyword"},
            "tags": {"type": "keyword"},
            "difficulty": {"type": "keyword"},
            "updated_at": {"type": "date"},
            "section_title": {"type": "keyword"},
            "topic_path": {"type": "keyword"},
            # Raw metadata object kept alongside the flattened fields.
            "metadata": {"type": "object", "enabled": True},
            "embedding": {
                "type": "dense_vector",
                "dims": EMBEDDING_DIM,  # must match the embedding model's hidden size
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}


# Mapping for the code index: same shape as DOCS_MAPPING plus the
# parent/child hierarchy fields (parent_id, chunk_level, chunk_role) and
# code-specific provenance (repo, file_path, start/end line, language).
CODE_MAPPING = {
    "settings": {"index": {"number_of_shards": 1}},
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "example_id": {"type": "keyword"},
            "example_title": {"type": "keyword"},
            "parent_id": {"type": "keyword"},
            "chunk_level": {"type": "keyword"},
            "chunk_role": {"type": "keyword"},
            "language": {"type": "keyword"},
            "symbol_name": {"type": "keyword"},
            "intent": {"type": "keyword"},
            "dependencies": {"type": "keyword"},
            "start_line": {"type": "integer"},
            "end_line": {"type": "integer"},
            "repo": {"type": "keyword"},
            "file_path": {"type": "keyword"},
            "source": {"type": "keyword"},
            "source_type": {"type": "keyword"},
            "domain": {"type": "keyword"},
            "version": {"type": "keyword"},
            "tags": {"type": "keyword"},
            "difficulty": {"type": "keyword"},
            "updated_at": {"type": "date"},
            "text": {"type": "text"},
            "metadata": {"type": "object", "enabled": True},
            "embedding": {
                "type": "dense_vector",
                "dims": EMBEDDING_DIM,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8f78db43",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def get_es_client() -> Elasticsearch:
    """Build an Elasticsearch client for the local cluster.

    Generous timeout and retries because bulk indexing with embeddings can
    produce slow individual requests.
    """
    return Elasticsearch(
        ES_URL,
        request_timeout=90,
        max_retries=5,
        retry_on_timeout=True,
    )


def recreate_index(es: Elasticsearch, index_name: str, mapping: Dict[str, Any]) -> None:
    """Drop *index_name* if it exists, then create it fresh with *mapping*.

    Destructive: any previously indexed documents in the index are lost.
    """
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        logger.info(f"Deleted existing index: {index_name}")

    es.indices.create(index=index_name, body=mapping)
    logger.info(f"Created index: {index_name}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a66d191",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Indexing functions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5db6e05e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def index_doc_chunks(
    es: Elasticsearch,
    index_name: str,
    chunks: List[DocChunk],
    batch_size: int = 32,
) -> None:
    """Embed and bulk-index documentation chunks into *index_name*.

    Embeddings for all chunk texts are computed up front; each chunk becomes
    one ES document whose _id is the chunk_id, so re-running overwrites in
    place instead of duplicating.
    """
    vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)

    def actions() -> Iterable[Dict[str, Any]]:
        # Lazy generator of bulk "index" actions, one per (chunk, vector) pair.
        for chunk, vector in zip(chunks, vectors):
            yield {
                "_op_type": "index",
                "_index": index_name,
                "_id": chunk.chunk_id,
                "_source": {
                    "doc_id": chunk.doc_id,
                    "chunk_id": chunk.chunk_id,
                    "text": chunk.text,
                    "source": chunk.source,
                    "source_type": "doc",
                    # Metadata is flattened into top-level filterable fields
                    # and also stored verbatim under "metadata".
                    "domain": chunk.metadata.get("domain"),
                    "version": chunk.metadata.get("version"),
                    "tags": chunk.metadata.get("tags", []),
                    "difficulty": chunk.metadata.get("difficulty"),
                    "updated_at": chunk.metadata.get("updated_at"),
                    "section_title": chunk.metadata.get("section_title"),
                    "topic_path": chunk.metadata.get("topic_path"),
                    "metadata": chunk.metadata,
                    "embedding": vector,
                },
            }

    # Extended timeout for the bulk request itself.
    bulk(es.options(request_timeout=180), actions())
    logger.info(f"Indexed {len(chunks)} doc chunks into {index_name}")
|
||||
"\n",
|
||||
"\n",
|
||||
def index_code_chunks(
    es: Elasticsearch,
    index_name: str,
    chunks: List[CodeChunk],
    batch_size: int = 32,
) -> None:
    """Embed and bulk-index code chunks (parents and children) into *index_name*.

    Mirrors index_doc_chunks but flattens the code-specific fields
    (parent/child hierarchy, language, line span, provenance) into the
    document. _id is the chunk_id so re-runs overwrite in place.
    """
    vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)

    def actions() -> Iterable[Dict[str, Any]]:
        # Lazy generator of bulk "index" actions, one per (chunk, vector) pair.
        for chunk, vector in zip(chunks, vectors):
            yield {
                "_op_type": "index",
                "_index": index_name,
                "_id": chunk.chunk_id,
                "_source": {
                    "doc_id": chunk.doc_id,
                    "chunk_id": chunk.chunk_id,
                    "example_id": chunk.metadata.get("example_id"),
                    "example_title": chunk.metadata.get("example_title"),
                    # Hierarchy fields: children point to their parent chunk.
                    "parent_id": chunk.parent_id,
                    "chunk_level": chunk.chunk_level,
                    "chunk_role": chunk.chunk_role,
                    "language": chunk.language,
                    "symbol_name": chunk.metadata.get("symbol_name"),
                    "intent": chunk.metadata.get("intent"),
                    "dependencies": chunk.metadata.get("dependencies", []),
                    "start_line": chunk.start_line,
                    "end_line": chunk.end_line,
                    "repo": chunk.metadata.get("repo"),
                    "file_path": chunk.metadata.get("file_path"),
                    "source": chunk.source,
                    "source_type": "code",
                    "domain": chunk.metadata.get("domain"),
                    "version": chunk.metadata.get("version"),
                    "tags": chunk.metadata.get("tags", []),
                    "difficulty": chunk.metadata.get("difficulty"),
                    "updated_at": chunk.metadata.get("updated_at"),
                    "text": chunk.text,
                    "metadata": chunk.metadata,
                    "embedding": vector,
                },
            }

    bulk(es.options(request_timeout=180), actions())
    logger.info(f"Indexed {len(chunks)} code chunks into {index_name}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "db3a45d4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ingest from data/raw/docs and data/raw/code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0e3b3e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _natural_sort_key(path: Path) -> Tuple[int, str]:\n",
|
||||
" match = re.match(r\"^(\\d+)_\", path.stem)\n",
|
||||
" if match:\n",
|
||||
" return int(match.group(1)), path.stem.lower()\n",
|
||||
" return 10**9, path.stem.lower()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
def discover_raw_sources(
    docs_dir: Path,
    code_dir: Path,
) -> Tuple[List[Path], List[Path]]:
    """Locate the raw .txt documentation and code files for ingestion.

    Both directories must exist and each must contain at least one .txt
    file; files are returned sorted by their numeric filename prefix.

    Raises:
        FileNotFoundError: if either directory is missing.
        ValueError: if either directory has no .txt files.
    """
    if not docs_dir.exists():
        raise FileNotFoundError(f"Docs directory not found: {docs_dir}")
    if not code_dir.exists():
        raise FileNotFoundError(f"Code directory not found: {code_dir}")

    def _sorted_txt(directory: Path) -> List[Path]:
        # Plain files only; ordering follows the numeric "NN_" prefix.
        return sorted(
            (entry for entry in directory.glob("*.txt") if entry.is_file()),
            key=_natural_sort_key,
        )

    docs_files = _sorted_txt(docs_dir)
    code_files = _sorted_txt(code_dir)

    if not docs_files:
        raise ValueError(f"No .txt documentation files found in {docs_dir}")
    if not code_files:
        raise ValueError(f"No .txt code files found in {code_dir}")

    return docs_files, code_files
|
||||
"\n",
|
||||
"\n",
|
||||
"def infer_example_title(prefix_text: str, fallback_title: str) -> str:\n",
|
||||
" candidate_lines = [line.strip() for line in prefix_text.splitlines() if line.strip()]\n",
|
||||
" for line in reversed(candidate_lines[-8:]):\n",
|
||||
" lowered = line.lower()\n",
|
||||
" if lowered == \"code snippet\":\n",
|
||||
" continue\n",
|
||||
" cleaned = re.sub(r\"^[#\\-\\s>*]*\", \"\", line)\n",
|
||||
" cleaned = re.sub(r\"^\\d+[\\.)]\\s*\", \"\", cleaned)\n",
|
||||
" cleaned = cleaned.replace(\"**\", \"\").strip()\n",
|
||||
" if cleaned and cleaned.lower() != \"code snippet\":\n",
|
||||
" return cleaned[:120]\n",
|
||||
" return fallback_title\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_code_examples(file_text: str, file_stem: str) -> List[Dict[str, str]]:\n",
|
||||
" pattern = re.compile(\n",
|
||||
" r\"```(?P<lang>[a-zA-Z0-9_+-]*)\\n(?P<code>.*?)```\",\n",
|
||||
" re.DOTALL,\n",
|
||||
" )\n",
|
||||
" examples: List[Dict[str, str]] = []\n",
|
||||
"\n",
|
||||
" for idx, match in enumerate(pattern.finditer(file_text), start=1):\n",
|
||||
" code = match.group(\"code\").strip()\n",
|
||||
" if not code:\n",
|
||||
" continue\n",
|
||||
" fallback_title = f\"{file_stem} - Example {idx:03d}\"\n",
|
||||
" prefix = file_text[max(0, match.start() - 600): match.start()]\n",
|
||||
" title = infer_example_title(prefix, fallback_title)\n",
|
||||
" language = match.group(\"lang\").strip() or \"avap\"\n",
|
||||
" examples.append({\n",
|
||||
" \"example_index\": idx,\n",
|
||||
" \"title\": title,\n",
|
||||
" \"language\": language,\n",
|
||||
" \"code\": code,\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" if examples:\n",
|
||||
" return examples\n",
|
||||
"\n",
|
||||
" plain_text = file_text.strip()\n",
|
||||
" if not plain_text:\n",
|
||||
" return []\n",
|
||||
"\n",
|
||||
" return [\n",
|
||||
" {\n",
|
||||
" \"example_index\": 1,\n",
|
||||
" \"title\": f\"{file_stem} - Example 001\",\n",
|
||||
" \"language\": \"avap\",\n",
|
||||
" \"code\": plain_text,\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
def build_doc_chunks_from_raw(
    docs_files: List[Path],
    tokenizer: AutoTokenizer,
    chunk_size: int = 900,
    overlap: int = 120,
) -> List[DocChunk]:
    """Read each raw docs .txt file and chunk it into DocChunk records.

    doc_id is positional ("doc-001", "doc-002", ...), so re-running with the
    same file ordering yields stable ids. The section title is derived from
    the filename stem with underscores replaced by spaces.
    """
    all_chunks: List[DocChunk] = []

    for idx, path in enumerate(docs_files, start=1):
        title = path.stem.replace("_", " ")
        text = path.read_text(encoding="utf-8")

        chunks = build_doc_chunks(
            doc_text=text,
            source=str(path.relative_to(PROJ_ROOT)),
            metadata={
                "doc_id": f"doc-{idx:03d}",
                "domain": "avap",
                "version": "v1",
                "tags": ["documentation"],
                "difficulty": "mixed",
                "section_title": title,
                "topic_path": f"docs/{path.stem}",
            },
            tokenizer=tokenizer,
            chunk_size=chunk_size,
            overlap=overlap,
        )
        all_chunks.extend(chunks)

    return all_chunks
|
||||
"\n",
|
||||
"\n",
|
||||
def build_code_chunks_from_raw(
    code_files: List[Path],
    child_max_lines: int = 14,
    child_overlap: int = 3,
) -> List[CodeChunk]:
    """Extract code examples from raw .txt files and chunk them for indexing.

    Each fenced example becomes its own doc ("code-NNN", numbered globally
    across all files) with exactly one parent chunk covering the whole
    example (parent_max_lines = the example's line count, overlap 0) plus
    overlapping child chunks. Window sizes are clamped to the example length
    so _line_windows' max_lines > overlap invariant always holds.
    """
    all_chunks: List[CodeChunk] = []
    example_counter = 0  # global example number across all input files

    for code_file in code_files:
        file_text = code_file.read_text(encoding="utf-8")
        examples = extract_code_examples(file_text, code_file.stem)

        for example in examples:
            example_counter += 1
            code_text = example["code"]
            line_count = max(1, len(code_text.splitlines()))
            # Clamp child window/overlap for very short examples.
            child_window = min(child_max_lines, line_count)
            overlap_window = min(child_overlap, max(0, child_window - 1))

            code_chunks = build_code_chunks(
                code_text=code_text,
                source=str(code_file.relative_to(PROJ_ROOT)),
                metadata={
                    "doc_id": f"code-{example_counter:03d}",
                    "example_id": f"example-{example_counter:03d}",
                    "example_title": example["title"],
                    "repo": "BRUNIX-AI/assistance-engine",
                    "file_path": str(code_file.relative_to(PROJ_ROOT)),
                    "symbol_name": example["title"],
                    "intent": "avap code example",
                    "dependencies": [],
                    "domain": "avap",
                    "version": "v1",
                    "tags": ["code-example"],
                    "difficulty": "mixed",
                },
                language=example["language"],
                parent_max_lines=line_count,  # one parent spanning the whole example
                parent_overlap=0,
                child_max_lines=child_window,
                child_overlap=overlap_window,
            )
            all_chunks.extend(code_chunks)

    return all_chunks
|
||||
"\n",
|
||||
"\n",
|
||||
# Build all chunks from data/raw. Only the tokenizer is needed here; the
# model and device from get_embedder are ignored (embedding happens later,
# at indexing time).
tokenizer, _, _ = get_embedder()
docs_files, code_files = discover_raw_sources(RAW_DOCS_DIR, RAW_CODE_DIR)

doc_chunks = build_doc_chunks_from_raw(docs_files, tokenizer=tokenizer)
code_chunks = build_code_chunks_from_raw(code_files)

print(f"Documentation files discovered: {len(docs_files)}")
print(f"Code files discovered: {len(code_files)}")
print(f"Doc chunks built: {len(doc_chunks)}")
print(f"Code chunks built (parent + child): {len(code_chunks)}")
print(f"Parent chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'parent')}")
print(f"Child chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'child')}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "573ead0e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Ingest: recreate both indices from scratch, then embed and bulk-index.
# Destructive — recreate_index drops any existing index of the same name.
es = get_es_client()
recreate_index(es, DOCS_INDEX, DOCS_MAPPING)
recreate_index(es, CODE_INDEX, CODE_MAPPING)

index_doc_chunks(es, DOCS_INDEX, doc_chunks, batch_size=16)
index_code_chunks(es, CODE_INDEX, code_chunks, batch_size=16)

# Refresh so the freshly indexed documents are immediately searchable.
es.indices.refresh(index=DOCS_INDEX)
es.indices.refresh(index=CODE_INDEX)

print("Dual index ingestion completed from data/raw.")
print(f"Docs index: {DOCS_INDEX}")
print(f"Code index: {CODE_INDEX}")
print(f"Indexed doc chunks: {len(doc_chunks)}")
print(f"Indexed code chunks: {len(code_chunks)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35f6cdee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Optional: hybrid retrieval across both indices"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "968a8727",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def vector_search(
    es: Elasticsearch,
    index_name: str,
    query: str,
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """Run a dense-vector search over *index_name* and return the raw ES hits.

    The query is embedded with the shared embedder, then scored against every
    document via script_score cosineSimilarity (+1.0 keeps scores
    non-negative, as Elasticsearch requires).
    """
    query_vec = embed_texts([query], batch_size=1)[0]

    score_script = {
        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
        "params": {"query_vector": query_vec},
    }
    search_body = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": score_script,
            }
        },
    }

    response = es.search(index=index_name, body=search_body)
    return response["hits"]["hits"]
|
||||
"\n",
|
||||
"\n",
|
||||
"query = \"How do I handle normalization and edge cases in vector similarity?\"\n",
|
||||
"doc_hits = vector_search(es, DOCS_INDEX, query, top_k=3)\n",
|
||||
"code_hits = vector_search(es, CODE_INDEX, query, top_k=5)\n",
|
||||
"\n",
|
||||
"print(\"Top docs:\")\n",
|
||||
"for hit in doc_hits:\n",
|
||||
" print(\"-\", hit[\"_source\"].get(\"section_title\"), \"|\", hit[\"_score\"])\n",
|
||||
"\n",
|
||||
"print(\"\\nTop code chunks:\")\n",
|
||||
"for hit in code_hits:\n",
|
||||
" source = hit[\"_source\"]\n",
|
||||
" print(\n",
|
||||
" \"-\",\n",
|
||||
" source.get(\"chunk_id\"),\n",
|
||||
" \"| level=\",\n",
|
||||
" source.get(\"chunk_level\"),\n",
|
||||
" \"| parent=\",\n",
|
||||
" source.get(\"parent_id\"),\n",
|
||||
" \"| score=\",\n",
|
||||
" hit.get(\"_score\"),\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue