added lark to notebook

2026-03-03 09:39:09 +01:00 · 2026-03-03 09:39:09 +01:00 · 8297ae204c
parent 5a666079a4
commit 8297ae204c
4 changed files with 1724 additions and 1 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -30,6 +30,7 @@ dev = [
    "evidently>=0.7.20",
    "jupyter>=1.1.1",
    "langfuse<=3",
    "lark>=1.3.1",
    "markdown>=3.10.2",
    "mteb>=2.8.8",
    "polars>=1.38.1",
--- a/scratches/pseco/ingestion/n00
+++ b/scratches/pseco/ingestion/n00
@ -0,0 +1,868 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9ea4c7c5",
   "metadata": {},
   "source": [
    "# n00 Dual Index Ingestion v1\n",
    "\n",
    "This notebook implements a dual-index ingestion strategy for RAG:\n",
    "- **Docs index** for conceptual documentation chunks\n",
    "- **Code index** for code chunks with parent/child hierarchy\n",
    "\n",
    "It is designed to support query decomposition when there is no exact solution and the model must synthesize ideas from both docs and code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f7602ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import uuid\n",
    "from dataclasses import dataclass\n",
    "from datetime import datetime, timezone\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, Iterable, List, Optional, Tuple\n",
    "\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from elasticsearch import Elasticsearch\n",
    "from elasticsearch.helpers import bulk\n",
    "from loguru import logger\n",
    "from transformers import AutoConfig, AutoModel, AutoTokenizer\n",
    "from src.config import PROJ_ROOT, RAW_DIR, OLLAMA_EMB_MODEL_NAME, ELASTICSEARCH_LOCAL_URL\n",
    "\n",
    "\n",
    "# Index names default to the v1 indices and can be overridden via environment variables.\n",
    "DOCS_INDEX = os.getenv(\"ELASTICSEARCH_DOCS_INDEX\", \"avap_docs_v1\")\n",
    "CODE_INDEX = os.getenv(\"ELASTICSEARCH_CODE_INDEX\", \"avap_code_v1\")\n",
    "ES_URL = ELASTICSEARCH_LOCAL_URL\n",
    "# Raw inputs are read from the docs/ and code/ subfolders of RAW_DIR.\n",
    "RAW_DOCS_DIR = RAW_DIR / \"docs\"\n",
    "RAW_CODE_DIR = RAW_DIR / \"code\"\n",
    "\n",
    "# The embedding width is read from the model config; it sizes the dense_vector\n",
    "# fields declared in DOCS_MAPPING and CODE_MAPPING.\n",
    "config = AutoConfig.from_pretrained(OLLAMA_EMB_MODEL_NAME)\n",
    "EMBEDDING_DIM = int(config.hidden_size)\n",
    "logger.info(f\"Embedding model: {OLLAMA_EMB_MODEL_NAME}\")\n",
    "logger.info(f\"Embedding dim: {EMBEDDING_DIM}\")\n",
    "logger.info(f\"Raw docs dir: {RAW_DOCS_DIR}\")\n",
    "logger.info(f\"Raw code dir: {RAW_CODE_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7dd24f58",
   "metadata": {},
   "source": [
    "## Domain models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7aa1acca",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass(frozen=True)\n",
    "class DocChunk:\n",
    "    \"\"\"Immutable record for one documentation chunk.\n",
    "\n",
    "    Attributes:\n",
    "        chunk_id: Unique chunk identifier; build_doc_chunks uses ``<doc_id>:d<n>``.\n",
    "        doc_id: Identifier of the document the chunk was cut from.\n",
    "        text: Chunk text.\n",
    "        source: Origin label of the document.\n",
    "        metadata: Free-form metadata stored alongside the chunk.\n",
    "    \"\"\"\n",
    "    chunk_id: str\n",
    "    doc_id: str\n",
    "    text: str\n",
    "    source: str\n",
    "    metadata: Dict[str, Any]\n",
    "\n",
    "\n",
    "@dataclass(frozen=True)\n",
    "class CodeChunk:\n",
    "    \"\"\"Immutable record for one code chunk in the parent/child hierarchy.\n",
    "\n",
    "    Attributes:\n",
    "        chunk_id: Unique chunk identifier; build_code_chunks uses\n",
    "            ``<doc_id>:p<i>`` for parents and ``<doc_id>:p<i>:c<j>`` for children.\n",
    "        doc_id: Identifier of the code example the chunk was cut from.\n",
    "        text: Chunk text.\n",
    "        source: Origin label of the code.\n",
    "        metadata: Free-form metadata stored alongside the chunk.\n",
    "        parent_id: ``chunk_id`` of the enclosing parent; ``None`` on parent chunks.\n",
    "        chunk_level: ``\"parent\"`` or ``\"child\"`` as set by build_code_chunks.\n",
    "        language: Language label supplied by the caller.\n",
    "        start_line: First line of the chunk (1-based) within the source text.\n",
    "        end_line: Last line of the chunk (inclusive) within the source text.\n",
    "        chunk_role: ``\"logic\"`` on parents; the infer_chunk_role result on children.\n",
    "    \"\"\"\n",
    "    chunk_id: str\n",
    "    doc_id: str\n",
    "    text: str\n",
    "    source: str\n",
    "    metadata: Dict[str, Any]\n",
    "    parent_id: Optional[str]\n",
    "    chunk_level: str\n",
    "    language: str\n",
    "    start_line: int\n",
    "    end_line: int\n",
    "    chunk_role: str"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac97ae4e",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a975c8ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def now_iso() -> str:\n",
    "    \"\"\"Return the current UTC time as an ISO 8601 string.\"\"\"\n",
    "    current_utc = datetime.now(tz=timezone.utc)\n",
    "    return current_utc.isoformat()\n",
    "\n",
    "\n",
    "def clean_text(text: str) -> str:\n",
    "    \"\"\"Replace non-breaking spaces, collapse whitespace runs to one space, and trim.\"\"\"\n",
    "    without_nbsp = text.replace(\"\\u00a0\", \" \")\n",
    "    return re.sub(r\"\\s+\", \" \", without_nbsp).strip()\n",
    "\n",
    "\n",
    "def token_window_chunking(\n",
    "    text: str,\n",
    "    tokenizer: AutoTokenizer,\n",
    "    chunk_size: int = 1000,\n",
    "    overlap: int = 150,\n",
    ") -> List[str]:\n",
    "    \"\"\"Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.\n",
    "\n",
    "    Consecutive windows share ``overlap`` tokens. Each window is decoded back\n",
    "    to text, and windows that decode to whitespace only are dropped.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``chunk_size`` is not greater than ``overlap``.\n",
    "    \"\"\"\n",
    "    if chunk_size <= overlap:\n",
    "        raise ValueError(\"chunk_size must be greater than overlap\")\n",
    "\n",
    "    ids = tokenizer.encode(text, add_special_tokens=False)\n",
    "    total = len(ids)\n",
    "    stride = chunk_size - overlap\n",
    "    decoded: List[str] = []\n",
    "\n",
    "    for begin in range(0, total, stride):\n",
    "        stop = min(begin + chunk_size, total)\n",
    "        decoded.append(tokenizer.decode(ids[begin:stop], skip_special_tokens=True))\n",
    "        if stop == total:\n",
    "            # This window already reaches the end of the token stream.\n",
    "            break\n",
    "\n",
    "    return [piece for piece in decoded if piece.strip()]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cfe2bfa9",
   "metadata": {},
   "source": [
    "## Code chunking with parent/child hierarchy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33acb0ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _line_windows(\n",
    "    lines: List[str],\n",
    "    max_lines: int,\n",
    "    overlap: int\n",
    ") -> List[Tuple[int, int, str]]:\n",
    "    \"\"\"Slide a window of ``max_lines`` lines over ``lines``.\n",
    "\n",
    "    Returns ``(first_line, last_line, text)`` tuples with 1-based, inclusive\n",
    "    line numbers. ``text`` is the newline-joined window after stripping;\n",
    "    windows that are empty after stripping are left out.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``max_lines`` is not greater than ``overlap``.\n",
    "    \"\"\"\n",
    "    if max_lines <= overlap:\n",
    "        raise ValueError(\"max_lines must be greater than overlap\")\n",
    "\n",
    "    total = len(lines)\n",
    "    stride = max_lines - overlap\n",
    "    result: List[Tuple[int, int, str]] = []\n",
    "\n",
    "    for begin in range(0, total, stride):\n",
    "        stop = min(begin + max_lines, total)\n",
    "        joined = \"\\n\".join(lines[begin:stop]).strip()\n",
    "        if joined:\n",
    "            result.append((begin + 1, stop, joined))\n",
    "        if stop == total:\n",
    "            # This window already reaches the last line.\n",
    "            break\n",
    "\n",
    "    return result\n",
    "\n",
    "\n",
    "def infer_chunk_role(text: str) -> str:\n",
    "    \"\"\"Classify a code snippet by case-insensitive substring markers.\n",
    "\n",
    "    Rules are tried in order: error handling, then imports, then I/O.\n",
    "    Anything that matches none of them is ``\"logic\"``.\n",
    "    \"\"\"\n",
    "    haystack = text.lower()\n",
    "    ordered_rules = (\n",
    "        (\"error\", (\"try:\", \"except\")),\n",
    "        (\"init\", (\"import \", \"from \")),\n",
    "        (\"io\", (\"print(\", \"logger\")),\n",
    "    )\n",
    "    for role, markers in ordered_rules:\n",
    "        if any(marker in haystack for marker in markers):\n",
    "            return role\n",
    "    return \"logic\"\n",
    "\n",
    "\n",
    "def build_code_chunks(\n",
    "    code_text: str,\n",
    "    source: str,\n",
    "    metadata: Dict[str, Any],\n",
    "    language: str = \"unknown\",\n",
    "    parent_max_lines: int = 80,\n",
    "    parent_overlap: int = 12,\n",
    "    child_max_lines: int = 20,\n",
    "    child_overlap: int = 4,\n",
    ") -> List[CodeChunk]:\n",
    "    \"\"\"Split code into parent windows and, inside each parent, child windows.\n",
    "\n",
    "    Each parent chunk is followed in the result by its children. Chunk ids are\n",
    "    ``<doc_id>:p<i>`` for parents and ``<doc_id>:p<i>:c<j>`` for children;\n",
    "    ``doc_id`` comes from ``metadata[\"doc_id\"]`` or, if missing, a fresh UUID4.\n",
    "\n",
    "    Args:\n",
    "        code_text: Source code to split.\n",
    "        source: Origin label stored on every chunk.\n",
    "        metadata: Base metadata copied into every chunk.\n",
    "        language: Language label stored on every chunk.\n",
    "        parent_max_lines: Maximum number of lines per parent window.\n",
    "        parent_overlap: Lines shared by consecutive parent windows.\n",
    "        child_max_lines: Maximum number of lines per child window.\n",
    "        child_overlap: Lines shared by consecutive child windows.\n",
    "\n",
    "    Returns:\n",
    "        Parent and child chunks whose ``start_line`` / ``end_line`` are\n",
    "        1-based, inclusive positions within ``code_text``.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a window size is not greater than its overlap.\n",
    "    \"\"\"\n",
    "    doc_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
    "    lines = code_text.splitlines()\n",
    "\n",
    "    parent_windows = _line_windows(lines, parent_max_lines, parent_overlap)\n",
    "    chunks: List[CodeChunk] = []\n",
    "\n",
    "    for parent_idx, (start_line, end_line, parent_text) in enumerate(parent_windows):\n",
    "        parent_id = f\"{doc_id}:p{parent_idx}\"\n",
    "        parent_chunk = CodeChunk(\n",
    "            chunk_id=parent_id,\n",
    "            doc_id=doc_id,\n",
    "            text=parent_text,\n",
    "            source=source,\n",
    "            metadata={\n",
    "                **metadata,\n",
    "                \"doc_id\": doc_id,\n",
    "                \"source_type\": \"code\",\n",
    "                \"updated_at\": now_iso(),\n",
    "                \"is_parent\": True,\n",
    "            },\n",
    "            parent_id=None,\n",
    "            chunk_level=\"parent\",\n",
    "            language=language,\n",
    "            start_line=start_line,\n",
    "            end_line=end_line,\n",
    "            chunk_role=\"logic\",\n",
    "        )\n",
    "        chunks.append(parent_chunk)\n",
    "\n",
    "        # Cut children from the parent's original line span, not from\n",
    "        # parent_text: that text is stripped, so leading blank lines would be\n",
    "        # missing and every child line number would be reported too low.\n",
    "        parent_lines = lines[start_line - 1:end_line]\n",
    "        child_windows = _line_windows(parent_lines, child_max_lines, child_overlap)\n",
    "\n",
    "        for child_idx, (child_start, child_end, child_text) in enumerate(child_windows):\n",
    "            # Child positions are relative to the parent window; shift them\n",
    "            # back to positions within code_text.\n",
    "            global_start = start_line + child_start - 1\n",
    "            global_end = start_line + child_end - 1\n",
    "            child_id = f\"{parent_id}:c{child_idx}\"\n",
    "\n",
    "            child_chunk = CodeChunk(\n",
    "                chunk_id=child_id,\n",
    "                doc_id=doc_id,\n",
    "                text=child_text,\n",
    "                source=source,\n",
    "                metadata={\n",
    "                    **metadata,\n",
    "                    \"doc_id\": doc_id,\n",
    "                    \"source_type\": \"code\",\n",
    "                    \"updated_at\": now_iso(),\n",
    "                    \"is_parent\": False,\n",
    "                },\n",
    "                parent_id=parent_id,\n",
    "                chunk_level=\"child\",\n",
    "                language=language,\n",
    "                start_line=global_start,\n",
    "                end_line=global_end,\n",
    "                chunk_role=infer_chunk_role(child_text),\n",
    "            )\n",
    "            chunks.append(child_chunk)\n",
    "\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27583e2b",
   "metadata": {},
   "source": [
    "## Build documentation chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c452926",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_doc_chunks(\n",
    "    doc_text: str,\n",
    "    source: str,\n",
    "    metadata: Dict[str, Any],\n",
    "    tokenizer: AutoTokenizer,\n",
    "    chunk_size: int = 1000,\n",
    "    overlap: int = 150,\n",
    ") -> List[DocChunk]:\n",
    "    \"\"\"Clean a document and cut it into token-window ``DocChunk`` records.\n",
    "\n",
    "    Chunk ids are ``<doc_id>:d<n>``, where ``doc_id`` comes from\n",
    "    ``metadata[\"doc_id\"]`` or, if missing, a fresh UUID4. Each chunk gets a\n",
    "    copy of ``metadata`` extended with ``doc_id``, ``source_type`` and\n",
    "    ``updated_at``.\n",
    "    \"\"\"\n",
    "    resolved_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
    "    pieces = token_window_chunking(\n",
    "        clean_text(doc_text),\n",
    "        tokenizer=tokenizer,\n",
    "        chunk_size=chunk_size,\n",
    "        overlap=overlap,\n",
    "    )\n",
    "\n",
    "    return [\n",
    "        DocChunk(\n",
    "            chunk_id=f\"{resolved_id}:d{position}\",\n",
    "            doc_id=resolved_id,\n",
    "            text=piece,\n",
    "            source=source,\n",
    "            metadata={\n",
    "                **metadata,\n",
    "                \"doc_id\": resolved_id,\n",
    "                \"source_type\": \"doc\",\n",
    "                \"updated_at\": now_iso(),\n",
    "            },\n",
    "        )\n",
    "        for position, piece in enumerate(pieces)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a4ee80e",
   "metadata": {},
   "source": [
    "## Embedding utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be0e5adb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loaded (tokenizer, model, device) triples keyed by model name.\n",
    "_EMBED_CACHE: Dict[str, Any] = {}\n",
    "\n",
    "\n",
    "def get_embedder(model_name: str = OLLAMA_EMB_MODEL_NAME) -> Tuple[AutoTokenizer, AutoModel, str]:\n",
    "    \"\"\"Return ``(tokenizer, model, device)`` for ``model_name``, loading it once.\n",
    "\n",
    "    The model is moved to CUDA when available (CPU otherwise) and switched to\n",
    "    eval mode. Later calls with the same name reuse the cached triple.\n",
    "    \"\"\"\n",
    "    cached = _EMBED_CACHE.get(model_name)\n",
    "    if cached is not None:\n",
    "        return cached\n",
    "\n",
    "    target_device = \"cpu\"\n",
    "    if torch.cuda.is_available():\n",
    "        target_device = \"cuda\"\n",
    "\n",
    "    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)\n",
    "    loaded_model = AutoModel.from_pretrained(model_name).to(target_device)\n",
    "    loaded_model.eval()\n",
    "\n",
    "    bundle = (loaded_tokenizer, loaded_model, target_device)\n",
    "    _EMBED_CACHE[model_name] = bundle\n",
    "    return bundle\n",
    "\n",
    "\n",
    "def embed_texts(texts: List[str], batch_size: int = 32) -> List[List[float]]:\n",
    "    \"\"\"Embed ``texts`` in batches and return one unit-length vector per text.\n",
    "\n",
    "    Token states are averaged using the attention mask as weights, then\n",
    "    L2-normalised. Inputs are padded per batch and truncated by the tokenizer.\n",
    "    \"\"\"\n",
    "    tokenizer, model, device = get_embedder()\n",
    "    embeddings: List[List[float]] = []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for offset in range(0, len(texts), batch_size):\n",
    "            inputs = tokenizer(\n",
    "                texts[offset: offset + batch_size],\n",
    "                padding=True,\n",
    "                truncation=True,\n",
    "                return_tensors=\"pt\",\n",
    "            ).to(device)\n",
    "            hidden = model(**inputs).last_hidden_state\n",
    "            # Extra trailing axis so the mask broadcasts over the hidden dimension.\n",
    "            token_mask = inputs[\"attention_mask\"].unsqueeze(-1)\n",
    "            summed = (hidden * token_mask).sum(1)\n",
    "            # Clamp guards against division by zero for an all-zero mask.\n",
    "            counts = token_mask.sum(1).clamp(min=1e-9)\n",
    "            unit_vectors = F.normalize(summed / counts, p=2, dim=1)\n",
    "            embeddings.extend(unit_vectors.cpu().tolist())\n",
    "\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "674b0561",
   "metadata": {},
   "source": [
    "## Elasticsearch mappings (separate indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11040f31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index definition for documentation chunks: keyword/text/date fields, the raw\n",
    "# metadata object, and a cosine-similarity dense_vector sized by EMBEDDING_DIM.\n",
    "DOCS_MAPPING = {\n",
    "    \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"doc_id\": {\"type\": \"keyword\"},\n",
    "            \"chunk_id\": {\"type\": \"keyword\"},\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"source\": {\"type\": \"keyword\"},\n",
    "            \"source_type\": {\"type\": \"keyword\"},\n",
    "            \"domain\": {\"type\": \"keyword\"},\n",
    "            \"version\": {\"type\": \"keyword\"},\n",
    "            \"tags\": {\"type\": \"keyword\"},\n",
    "            \"difficulty\": {\"type\": \"keyword\"},\n",
    "            \"updated_at\": {\"type\": \"date\"},\n",
    "            \"section_title\": {\"type\": \"keyword\"},\n",
    "            \"topic_path\": {\"type\": \"keyword\"},\n",
    "            \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
    "            \"embedding\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": EMBEDDING_DIM,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\",\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "}\n",
    "\n",
    "\n",
    "# Index definition for code chunks: the hierarchy fields (parent_id, chunk_level,\n",
    "# chunk_role), line positions, provenance fields, the raw metadata object, and a\n",
    "# cosine-similarity dense_vector sized by EMBEDDING_DIM.\n",
    "CODE_MAPPING = {\n",
    "    \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"doc_id\": {\"type\": \"keyword\"},\n",
    "            \"chunk_id\": {\"type\": \"keyword\"},\n",
    "            \"example_id\": {\"type\": \"keyword\"},\n",
    "            \"example_title\": {\"type\": \"keyword\"},\n",
    "            \"parent_id\": {\"type\": \"keyword\"},\n",
    "            \"chunk_level\": {\"type\": \"keyword\"},\n",
    "            \"chunk_role\": {\"type\": \"keyword\"},\n",
    "            \"language\": {\"type\": \"keyword\"},\n",
    "            \"symbol_name\": {\"type\": \"keyword\"},\n",
    "            \"intent\": {\"type\": \"keyword\"},\n",
    "            \"dependencies\": {\"type\": \"keyword\"},\n",
    "            \"start_line\": {\"type\": \"integer\"},\n",
    "            \"end_line\": {\"type\": \"integer\"},\n",
    "            \"repo\": {\"type\": \"keyword\"},\n",
    "            \"file_path\": {\"type\": \"keyword\"},\n",
    "            \"source\": {\"type\": \"keyword\"},\n",
    "            \"source_type\": {\"type\": \"keyword\"},\n",
    "            \"domain\": {\"type\": \"keyword\"},\n",
    "            \"version\": {\"type\": \"keyword\"},\n",
    "            \"tags\": {\"type\": \"keyword\"},\n",
    "            \"difficulty\": {\"type\": \"keyword\"},\n",
    "            \"updated_at\": {\"type\": \"date\"},\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
    "            \"embedding\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": EMBEDDING_DIM,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\",\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f78db43",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_es_client() -> Elasticsearch:\n",
    "    \"\"\"Create an Elasticsearch client for ``ES_URL``.\n",
    "\n",
    "    Uses a 90-second request timeout and up to five retries, retrying on\n",
    "    timeouts as well.\n",
    "    \"\"\"\n",
    "    client_options = {\n",
    "        \"request_timeout\": 90,\n",
    "        \"max_retries\": 5,\n",
    "        \"retry_on_timeout\": True,\n",
    "    }\n",
    "    return Elasticsearch(ES_URL, **client_options)\n",
    "\n",
    "\n",
    "def recreate_index(es: Elasticsearch, index_name: str, mapping: Dict[str, Any]) -> None:\n",
    "    \"\"\"Drop ``index_name`` if it exists, then create it from ``mapping``.\n",
    "\n",
    "    Destructive: any documents already stored in the index are lost.\n",
    "    \"\"\"\n",
    "    already_exists = es.indices.exists(index=index_name)\n",
    "    if already_exists:\n",
    "        es.indices.delete(index=index_name)\n",
    "        logger.info(f\"Deleted existing index: {index_name}\")\n",
    "\n",
    "    es.indices.create(index=index_name, body=mapping)\n",
    "    logger.info(f\"Created index: {index_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a66d191",
   "metadata": {},
   "source": [
    "## Indexing functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5db6e05e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def index_doc_chunks(\n",
    "    es: Elasticsearch,\n",
    "    index_name: str,\n",
    "    chunks: List[DocChunk],\n",
    "    batch_size: int = 32,\n",
    ") -> None:\n",
    "    \"\"\"Embed documentation chunks and bulk-index them into ``index_name``.\n",
    "\n",
    "    ``batch_size`` is the embedding batch size. Every chunk is written under\n",
    "    its ``chunk_id`` as document id, with selected metadata keys promoted to\n",
    "    top-level fields next to the full ``metadata`` dict and the embedding.\n",
    "    \"\"\"\n",
    "    texts = [item.text for item in chunks]\n",
    "    embeddings = embed_texts(texts, batch_size=batch_size)\n",
    "\n",
    "    def to_action(item: DocChunk, embedding: List[float]) -> Dict[str, Any]:\n",
    "        meta = item.metadata\n",
    "        document = {\n",
    "            \"doc_id\": item.doc_id,\n",
    "            \"chunk_id\": item.chunk_id,\n",
    "            \"text\": item.text,\n",
    "            \"source\": item.source,\n",
    "            \"source_type\": \"doc\",\n",
    "            \"domain\": meta.get(\"domain\"),\n",
    "            \"version\": meta.get(\"version\"),\n",
    "            \"tags\": meta.get(\"tags\", []),\n",
    "            \"difficulty\": meta.get(\"difficulty\"),\n",
    "            \"updated_at\": meta.get(\"updated_at\"),\n",
    "            \"section_title\": meta.get(\"section_title\"),\n",
    "            \"topic_path\": meta.get(\"topic_path\"),\n",
    "            \"metadata\": meta,\n",
    "            \"embedding\": embedding,\n",
    "        }\n",
    "        return {\n",
    "            \"_op_type\": \"index\",\n",
    "            \"_index\": index_name,\n",
    "            \"_id\": item.chunk_id,\n",
    "            \"_source\": document,\n",
    "        }\n",
    "\n",
    "    # Bulk requests get a longer timeout than the client default.\n",
    "    client = es.options(request_timeout=180)\n",
    "    bulk(client, (to_action(item, embedding) for item, embedding in zip(chunks, embeddings)))\n",
    "    logger.info(f\"Indexed {len(chunks)} doc chunks into {index_name}\")\n",
    "\n",
    "\n",
    "def index_code_chunks(\n",
    "    es: Elasticsearch,\n",
    "    index_name: str,\n",
    "    chunks: List[CodeChunk],\n",
    "    batch_size: int = 32,\n",
    ") -> None:\n",
    "    \"\"\"Embed code chunks and bulk-index them into ``index_name``.\n",
    "\n",
    "    ``batch_size`` is the embedding batch size. Parent and child chunks are\n",
    "    written alike, each under its ``chunk_id`` as document id, with hierarchy\n",
    "    and position fields next to the full ``metadata`` dict and the embedding.\n",
    "    \"\"\"\n",
    "    texts = [item.text for item in chunks]\n",
    "    embeddings = embed_texts(texts, batch_size=batch_size)\n",
    "\n",
    "    def to_action(item: CodeChunk, embedding: List[float]) -> Dict[str, Any]:\n",
    "        meta = item.metadata\n",
    "        document = {\n",
    "            \"doc_id\": item.doc_id,\n",
    "            \"chunk_id\": item.chunk_id,\n",
    "            \"example_id\": meta.get(\"example_id\"),\n",
    "            \"example_title\": meta.get(\"example_title\"),\n",
    "            \"parent_id\": item.parent_id,\n",
    "            \"chunk_level\": item.chunk_level,\n",
    "            \"chunk_role\": item.chunk_role,\n",
    "            \"language\": item.language,\n",
    "            \"symbol_name\": meta.get(\"symbol_name\"),\n",
    "            \"intent\": meta.get(\"intent\"),\n",
    "            \"dependencies\": meta.get(\"dependencies\", []),\n",
    "            \"start_line\": item.start_line,\n",
    "            \"end_line\": item.end_line,\n",
    "            \"repo\": meta.get(\"repo\"),\n",
    "            \"file_path\": meta.get(\"file_path\"),\n",
    "            \"source\": item.source,\n",
    "            \"source_type\": \"code\",\n",
    "            \"domain\": meta.get(\"domain\"),\n",
    "            \"version\": meta.get(\"version\"),\n",
    "            \"tags\": meta.get(\"tags\", []),\n",
    "            \"difficulty\": meta.get(\"difficulty\"),\n",
    "            \"updated_at\": meta.get(\"updated_at\"),\n",
    "            \"text\": item.text,\n",
    "            \"metadata\": meta,\n",
    "            \"embedding\": embedding,\n",
    "        }\n",
    "        return {\n",
    "            \"_op_type\": \"index\",\n",
    "            \"_index\": index_name,\n",
    "            \"_id\": item.chunk_id,\n",
    "            \"_source\": document,\n",
    "        }\n",
    "\n",
    "    # Bulk requests get a longer timeout than the client default.\n",
    "    client = es.options(request_timeout=180)\n",
    "    bulk(client, (to_action(item, embedding) for item, embedding in zip(chunks, embeddings)))\n",
    "    logger.info(f\"Indexed {len(chunks)} code chunks into {index_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db3a45d4",
   "metadata": {},
   "source": [
    "## Ingest from data/raw/docs and data/raw/code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0e3b3e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _natural_sort_key(path: Path) -> Tuple[int, str]:\n",
    "    \"\"\"Sort key that orders files by a leading ``<digits>_`` prefix.\n",
    "\n",
    "    Files without such a prefix get rank ``10**9`` and so sort last; ties are\n",
    "    broken by the lowercased stem.\n",
    "    \"\"\"\n",
    "    stem = path.stem\n",
    "    numeric_prefix = re.match(r\"^(\\d+)_\", stem)\n",
    "    rank = int(numeric_prefix.group(1)) if numeric_prefix else 10**9\n",
    "    return rank, stem.lower()\n",
    "\n",
    "\n",
    "def discover_raw_sources(\n",
    "    docs_dir: Path,\n",
    "    code_dir: Path,\n",
    ") -> Tuple[List[Path], List[Path]]:\n",
    "    \"\"\"List the ``.txt`` files directly inside the docs and code folders.\n",
    "\n",
    "    Returns:\n",
    "        ``(docs_files, code_files)``, each ordered by ``_natural_sort_key``.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: If either folder does not exist.\n",
    "        ValueError: If either folder holds no ``.txt`` file.\n",
    "    \"\"\"\n",
    "\n",
    "    def sorted_txt_files(folder: Path) -> List[Path]:\n",
    "        found = [entry for entry in folder.glob(\"*.txt\") if entry.is_file()]\n",
    "        found.sort(key=_natural_sort_key)\n",
    "        return found\n",
    "\n",
    "    if not docs_dir.exists():\n",
    "        raise FileNotFoundError(f\"Docs directory not found: {docs_dir}\")\n",
    "    if not code_dir.exists():\n",
    "        raise FileNotFoundError(f\"Code directory not found: {code_dir}\")\n",
    "\n",
    "    docs_files = sorted_txt_files(docs_dir)\n",
    "    code_files = sorted_txt_files(code_dir)\n",
    "\n",
    "    if not docs_files:\n",
    "        raise ValueError(f\"No .txt documentation files found in {docs_dir}\")\n",
    "    if not code_files:\n",
    "        raise ValueError(f\"No .txt code files found in {code_dir}\")\n",
    "\n",
    "    return docs_files, code_files\n",
    "\n",
    "\n",
    "def infer_example_title(prefix_text: str, fallback_title: str) -> str:\n",
    "    \"\"\"Pick a title for a code example from the text that precedes it.\n",
    "\n",
    "    The last eight non-blank lines are scanned from the bottom up. Lines equal\n",
    "    to \"code snippet\" (case-insensitive) are skipped; leading markdown markers,\n",
    "    a leading list number and ``**`` are removed. The first non-empty result is\n",
    "    returned, cut to 120 characters; otherwise ``fallback_title``.\n",
    "    \"\"\"\n",
    "    stripped_lines = (raw.strip() for raw in prefix_text.splitlines())\n",
    "    nearby = [entry for entry in stripped_lines if entry][-8:]\n",
    "\n",
    "    for entry in nearby[::-1]:\n",
    "        if entry.lower() == \"code snippet\":\n",
    "            continue\n",
    "        title = re.sub(r\"^[#\\-\\s>*]*\", \"\", entry)\n",
    "        title = re.sub(r\"^\\d+[\\.)]\\s*\", \"\", title)\n",
    "        title = title.replace(\"**\", \"\").strip()\n",
    "        if title and title.lower() != \"code snippet\":\n",
    "            return title[:120]\n",
    "\n",
    "    return fallback_title\n",
    "\n",
    "\n",
    "def extract_code_examples(file_text: str, file_stem: str) -> List[Dict[str, Any]]:\n",
    "    \"\"\"Extract code examples from a raw file.\n",
    "\n",
    "    Every non-empty fenced block (three backticks) becomes one example. If the\n",
    "    file yields no fenced example, its whole stripped text is returned as a\n",
    "    single example; an empty file gives an empty list.\n",
    "\n",
    "    Args:\n",
    "        file_text: Full text of the raw file.\n",
    "        file_stem: File name without extension, used in fallback titles.\n",
    "\n",
    "    Returns:\n",
    "        Dicts with ``example_index`` (int, 1-based position of the fenced block\n",
    "        in the file), ``title``, ``language`` (fence tag, or ``\"avap\"`` when the\n",
    "        fence has none) and ``code``.\n",
    "    \"\"\"\n",
    "    # Allow trailing spaces/tabs and a CR after the language tag. Without this\n",
    "    # such an opening fence is not recognised, and the closing fence can then\n",
    "    # be misread as the start of a block.\n",
    "    pattern = re.compile(\n",
    "        r\"```(?P<lang>[a-zA-Z0-9_+-]*)[ \\t]*\\r?\\n(?P<code>.*?)```\",\n",
    "        re.DOTALL,\n",
    "    )\n",
    "    examples: List[Dict[str, Any]] = []\n",
    "\n",
    "    for idx, match in enumerate(pattern.finditer(file_text), start=1):\n",
    "        code = match.group(\"code\").strip()\n",
    "        if not code:\n",
    "            continue\n",
    "        fallback_title = f\"{file_stem} - Example {idx:03d}\"\n",
    "        # Only the 600 characters before the fence are searched for a title.\n",
    "        prefix = file_text[max(0, match.start() - 600): match.start()]\n",
    "        title = infer_example_title(prefix, fallback_title)\n",
    "        language = match.group(\"lang\").strip() or \"avap\"\n",
    "        examples.append({\n",
    "            \"example_index\": idx,\n",
    "            \"title\": title,\n",
    "            \"language\": language,\n",
    "            \"code\": code,\n",
    "        })\n",
    "\n",
    "    if examples:\n",
    "        return examples\n",
    "\n",
    "    plain_text = file_text.strip()\n",
    "    if not plain_text:\n",
    "        return []\n",
    "\n",
    "    return [\n",
    "        {\n",
    "            \"example_index\": 1,\n",
    "            \"title\": f\"{file_stem} - Example 001\",\n",
    "            \"language\": \"avap\",\n",
    "            \"code\": plain_text,\n",
    "        }\n",
    "    ]\n",
    "\n",
    "\n",
    "def build_doc_chunks_from_raw(\n",
    "    docs_files: List[Path],\n",
    "    tokenizer: AutoTokenizer,\n",
    "    chunk_size: int = 900,\n",
    "    overlap: int = 120,\n",
    ") -> List[DocChunk]:\n",
    "    \"\"\"Read every documentation file and chunk it with ``build_doc_chunks``.\n",
    "\n",
    "    Files get ids ``doc-001``, ``doc-002``, ... in the order given. The\n",
    "    section title is the file stem with underscores turned into spaces, and\n",
    "    ``source`` is the path relative to ``PROJ_ROOT``.\n",
    "    \"\"\"\n",
    "    collected: List[DocChunk] = []\n",
    "\n",
    "    for position, doc_path in enumerate(docs_files, start=1):\n",
    "        doc_metadata = {\n",
    "            \"doc_id\": f\"doc-{position:03d}\",\n",
    "            \"domain\": \"avap\",\n",
    "            \"version\": \"v1\",\n",
    "            \"tags\": [\"documentation\"],\n",
    "            \"difficulty\": \"mixed\",\n",
    "            \"section_title\": doc_path.stem.replace(\"_\", \" \"),\n",
    "            \"topic_path\": f\"docs/{doc_path.stem}\",\n",
    "        }\n",
    "        collected += build_doc_chunks(\n",
    "            doc_text=doc_path.read_text(encoding=\"utf-8\"),\n",
    "            source=str(doc_path.relative_to(PROJ_ROOT)),\n",
    "            metadata=doc_metadata,\n",
    "            tokenizer=tokenizer,\n",
    "            chunk_size=chunk_size,\n",
    "            overlap=overlap,\n",
    "        )\n",
    "\n",
    "    return collected\n",
    "\n",
    "\n",
    "def build_code_chunks_from_raw(\n",
    "    code_files: List[Path],\n",
    "    child_max_lines: int = 14,\n",
    "    child_overlap: int = 3,\n",
    ") -> List[CodeChunk]:\n",
    "    \"\"\"Read every code file, extract its examples and chunk each of them.\n",
    "\n",
    "    Examples are numbered across all files (``code-001``, ``code-002``, ...).\n",
    "    The parent window is set to the whole example with no overlap; child\n",
    "    windows use ``child_max_lines`` / ``child_overlap``, clamped to the\n",
    "    example length.\n",
    "    \"\"\"\n",
    "    collected: List[CodeChunk] = []\n",
    "    example_counter = 0\n",
    "\n",
    "    for code_file in code_files:\n",
    "        raw_text = code_file.read_text(encoding=\"utf-8\")\n",
    "\n",
    "        for example in extract_code_examples(raw_text, code_file.stem):\n",
    "            example_counter += 1\n",
    "            snippet = example[\"code\"]\n",
    "            total_lines = max(1, len(snippet.splitlines()))\n",
    "            # Clamp the child window to the example length and keep the\n",
    "            # overlap strictly below the window size.\n",
    "            window = min(child_max_lines, total_lines)\n",
    "            window_overlap = min(child_overlap, max(0, window - 1))\n",
    "\n",
    "            example_metadata = {\n",
    "                \"doc_id\": f\"code-{example_counter:03d}\",\n",
    "                \"example_id\": f\"example-{example_counter:03d}\",\n",
    "                \"example_title\": example[\"title\"],\n",
    "                \"repo\": \"BRUNIX-AI/assistance-engine\",\n",
    "                \"file_path\": str(code_file.relative_to(PROJ_ROOT)),\n",
    "                \"symbol_name\": example[\"title\"],\n",
    "                \"intent\": \"avap code example\",\n",
    "                \"dependencies\": [],\n",
    "                \"domain\": \"avap\",\n",
    "                \"version\": \"v1\",\n",
    "                \"tags\": [\"code-example\"],\n",
    "                \"difficulty\": \"mixed\",\n",
    "            }\n",
    "            collected.extend(\n",
    "                build_code_chunks(\n",
    "                    code_text=snippet,\n",
    "                    source=str(code_file.relative_to(PROJ_ROOT)),\n",
    "                    metadata=example_metadata,\n",
    "                    language=example[\"language\"],\n",
    "                    parent_max_lines=total_lines,\n",
    "                    parent_overlap=0,\n",
    "                    child_max_lines=window,\n",
    "                    child_overlap=window_overlap,\n",
    "                )\n",
    "            )\n",
    "\n",
    "    return collected\n",
    "\n",
    "\n",
    "# Load the tokenizer, discover the raw files and build every chunk in memory.\n",
    "tokenizer, _, _ = get_embedder()\n",
    "docs_files, code_files = discover_raw_sources(RAW_DOCS_DIR, RAW_CODE_DIR)\n",
    "\n",
    "doc_chunks = build_doc_chunks_from_raw(docs_files, tokenizer=tokenizer)\n",
    "code_chunks = build_code_chunks_from_raw(code_files)\n",
    "\n",
    "# Summary of what was discovered and built.\n",
    "print(f\"Documentation files discovered: {len(docs_files)}\")\n",
    "print(f\"Code files discovered: {len(code_files)}\")\n",
    "print(f\"Doc chunks built: {len(doc_chunks)}\")\n",
    "print(f\"Code chunks built (parent + child): {len(code_chunks)}\")\n",
    "print(f\"Parent chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'parent')}\")\n",
    "print(f\"Child chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'child')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "573ead0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Destructive step: both indices are deleted (if present) and recreated before ingestion.\n",
    "es = get_es_client()\n",
    "recreate_index(es, DOCS_INDEX, DOCS_MAPPING)\n",
    "recreate_index(es, CODE_INDEX, CODE_MAPPING)\n",
    "\n",
    "# Embed and bulk-index the chunks built in the previous cell.\n",
    "index_doc_chunks(es, DOCS_INDEX, doc_chunks, batch_size=16)\n",
    "index_code_chunks(es, CODE_INDEX, code_chunks, batch_size=16)\n",
    "\n",
    "# Refresh both indices once ingestion has finished.\n",
    "es.indices.refresh(index=DOCS_INDEX)\n",
    "es.indices.refresh(index=CODE_INDEX)\n",
    "\n",
    "print(\"Dual index ingestion completed from data/raw.\")\n",
    "print(f\"Docs index: {DOCS_INDEX}\")\n",
    "print(f\"Code index: {CODE_INDEX}\")\n",
    "print(f\"Indexed doc chunks: {len(doc_chunks)}\")\n",
    "print(f\"Indexed code chunks: {len(code_chunks)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "35f6cdee",
   "metadata": {},
   "source": [
    "## Optional: vector retrieval across both indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "968a8727",
   "metadata": {},
   "outputs": [],
   "source": [
    "def vector_search(\n",
    "    es: Elasticsearch,\n",
    "    index_name: str,\n",
    "    query: str,\n",
    "    top_k: int = 5,\n",
    ") -> List[Dict[str, Any]]:\n",
    "    \"\"\"Return the ``top_k`` hits of ``index_name`` ranked by cosine similarity.\n",
    "\n",
    "    The query is embedded with ``embed_texts`` and every document is scored\n",
    "    with a ``script_score`` query over ``match_all``; the script adds 1.0 to\n",
    "    the cosine similarity. The raw hit dicts from the response are returned.\n",
    "    \"\"\"\n",
    "    query_vector = embed_texts([query], batch_size=1)[0]\n",
    "\n",
    "    scoring_script = {\n",
    "        \"source\": \"cosineSimilarity(params.query_vector, 'embedding') + 1.0\",\n",
    "        \"params\": {\"query_vector\": query_vector},\n",
    "    }\n",
    "    body = {\n",
    "        \"size\": top_k,\n",
    "        \"query\": {\n",
    "            \"script_score\": {\n",
    "                \"query\": {\"match_all\": {}},\n",
    "                \"script\": scoring_script,\n",
    "            }\n",
    "        },\n",
    "    }\n",
    "\n",
    "    response = es.search(index=index_name, body=body)\n",
    "    return response[\"hits\"][\"hits\"]\n",
    "\n",
    "\n",
    "# Smoke test: run the same query against both indices and print the top hits.\n",
    "query = \"How do I handle normalization and edge cases in vector similarity?\"\n",
    "doc_hits = vector_search(es, DOCS_INDEX, query, top_k=3)\n",
    "code_hits = vector_search(es, CODE_INDEX, query, top_k=5)\n",
    "\n",
    "print(\"Top docs:\")\n",
    "for hit in doc_hits:\n",
    "    print(\"-\", hit[\"_source\"].get(\"section_title\"), \"|\", hit[\"_score\"])\n",
    "\n",
    "# For code hits, also show where each chunk sits in the parent/child hierarchy.\n",
    "print(\"\\nTop code chunks:\")\n",
    "for hit in code_hits:\n",
    "    source = hit[\"_source\"]\n",
    "    print(\n",
    "        \"-\",\n",
    "        source.get(\"chunk_id\"),\n",
    "        \"| level=\",\n",
    "        source.get(\"chunk_level\"),\n",
    "        \"| parent=\",\n",
    "        source.get(\"parent_id\"),\n",
    "        \"| score=\",\n",
    "        hit.get(\"_score\"),\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/scratches/pseco/ingestion/n00
+++ b/scratches/pseco/ingestion/n00
--- a/uv.lock
+++ b/uv.lock
@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.11"
 resolution-markers = [
    "python_full_version >= '3.14' and sys_platform == 'win32'",
@ -293,6 +293,7 @@ dev = [
    { name = "evidently" },
    { name = "jupyter" },
    { name = "langfuse" },
    { name = "lark" },
    { name = "markdown" },
    { name = "mteb" },
    { name = "polars" },
@ -330,6 +331,7 @@ dev = [
    { name = "evidently", specifier = ">=0.7.20" },
    { name = "jupyter", specifier = ">=1.1.1" },
    { name = "langfuse", specifier = "<=3" },
    { name = "lark", specifier = ">=1.3.1" },
    { name = "markdown", specifier = ">=3.10.2" },
    { name = "mteb", specifier = ">=2.8.8" },
    { name = "polars", specifier = ">=1.38.1" },