working on dual index

This commit is contained in:
pseco 2026-03-03 12:01:03 +01:00
parent c2e43c030a
commit 9575af3ff0
3 changed files with 823 additions and 403 deletions

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,572 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"id": "0a8abbfa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import re\n",
"import uuid\n",
"from pathlib import Path\n",
"\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_ollama import OllamaEmbeddings\n",
"from transformers import AutoConfig\n",
"from elasticsearch import Elasticsearch\n",
"import nltk\n",
"\n",
"from lark import Lark, Transformer\n",
"from src.config import PROJ_ROOT, DATA_DIR\n",
"from datetime import datetime\n",
"nltk.download(\"punkt\", quiet=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5c9d292b",
"metadata": {},
"outputs": [],
"source": [
"ELASTICSEARCH_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"ELASTICSEARCH_DOCS_INDEX = os.getenv(\"ELASTICSEARCH_DOCS_INDEX\")\n",
"ELASTICSEARCH_CODE_INDEX = os.getenv(\"ELASTICSEARCH_CODE_INDEX\")\n",
"HF_EMB_MODEL_NAME = os.getenv(\"HF_EMB_MODEL_NAME\")\n",
"OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n",
"OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
"OLLAMA_MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
"OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
"\n",
"config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n",
"embedding_dim = config.hidden_size"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0e1cd9b9",
"metadata": {},
"outputs": [],
"source": [
"grammar = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v3.txt\").read_text(\n",
" encoding=\"utf-8\"\n",
")\n",
"code = (DATA_DIR / \"raw\" / \"code\" / \"Code_Snippets_v1.txt\").read_text(\n",
" encoding=\"utf-8\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "baa779f3",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ca43bd67",
"metadata": {},
"outputs": [],
"source": [
"class BnfToDict(Transformer):\n",
" def NONTERMINAL(self, token):\n",
" return str(token)[1:-1].strip()\n",
"\n",
" def QUOTED(self, token):\n",
" return str(token)[1:-1]\n",
"\n",
" def IDENT(self, token):\n",
" return str(token)\n",
"\n",
" def SYMBOL(self, token):\n",
" return str(token)\n",
"\n",
" def alternative(self, items):\n",
" return [str(item) for item in items]\n",
"\n",
" def expansion(self, items):\n",
" return items\n",
"\n",
" def rule(self, items):\n",
" lhs = items[0]\n",
" rhs = items[1]\n",
" return {\"lhs\": lhs, \"alternatives\": rhs}\n",
"\n",
" def start(self, items):\n",
" return items\n",
"\n",
"\n",
"def load_bnf_rules_for_rag(grammar, code_path: Path) -> list[dict]:\n",
" parser = Lark(grammar, parser=\"lalr\")\n",
" transformer = BnfToDict()\n",
" rules: list[dict] = []\n",
" skipped: list[str] = []\n",
"\n",
" for raw_line in code_path.read_text(encoding=\"utf-8\").splitlines():\n",
" line = raw_line.strip()\n",
" if not line or line.startswith(\"#\") or \"::=\" not in line:\n",
" continue\n",
" try:\n",
" tree = parser.parse(line)\n",
" parsed = transformer.transform(tree)\n",
" rules.extend(parsed)\n",
" except Exception:\n",
" skipped.append(line)\n",
"\n",
" print(f\"Parsed rules: {len(rules)}\")\n",
" print(f\"Skipped lines: {len(skipped)}\")\n",
" return rules"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7969500e",
"metadata": {},
"outputs": [],
"source": [
"def bnf_rules_to_documents(rules: list[dict]) -> list[Document]:\n",
"    \"\"\"Render each parsed BNF rule dict as a langchain Document.\n",
"\n",
"    Page content is the textual rule ``lhs ::= alt | alt``; an empty\n",
"    alternative renders as '<empty>'. Metadata records the source file,\n",
"    the non-terminal and the number of alternatives.\n",
"    \"\"\"\n",
"    documents: list[Document] = []\n",
"    for entry in rules:\n",
"        non_terminal = entry[\"lhs\"]\n",
"        alts = entry[\"alternatives\"]\n",
"        shown = [\" \".join(a) if a else \"<empty>\" for a in alts]\n",
"        rule_text = \"{} ::= {}\".format(non_terminal, \" | \".join(shown))\n",
"        doc = Document(\n",
"            id=str(uuid.uuid4()),\n",
"            page_content=rule_text,\n",
"            metadata={\n",
"                \"source\": \"BNF.txt\",\n",
"                \"type\": \"grammar_rule\",\n",
"                \"non_terminal\": non_terminal,\n",
"                \"alternatives\": len(alts),\n",
"            },\n",
"        )\n",
"        documents.append(doc)\n",
"    return documents"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7c67fa0b",
"metadata": {},
"outputs": [],
"source": [
"def extract_rag_chunks(source_code, parser):\n",
"    \"\"\"Split ``source_code`` into BNF-valid statement chunks.\n",
"\n",
"    Drops '###' comment lines and blank lines, removes an optional\n",
"    'label:' prefix from each remaining line, and keeps only the lines\n",
"    that parse against the grammar. Returns a list of chunk dicts with\n",
"    keys ``code``, ``type`` and ``length``.\n",
"    \"\"\"\n",
"    # Clean comment headers and blank lines so the parser does not fail\n",
"    lines = [l for l in source_code.split('\\n') if not l.startswith('###') and l.strip()]\n",
"\n",
"    chunks = []\n",
"    for line in lines:\n",
"        # Strip an optional leading label (e.g. \"Example 1:\")\n",
"        clean_line = line.split(':', 1)[-1] if ':' in line else line\n",
"        statement = clean_line.strip()\n",
"        try:\n",
"            parser.parse(statement)\n",
"        except Exception:\n",
"            continue  # line does not conform to the BNF, skip it\n",
"        chunks.append({\n",
"            \"code\": statement,\n",
"            # fix: length now measures the stored (stripped) code, not the\n",
"            # raw line with surrounding whitespace\n",
"            \"type\": \"statement\",\n",
"            \"length\": len(statement)\n",
"        })\n",
"    return chunks"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ab8c2b9b",
"metadata": {},
"outputs": [],
"source": [
"def extraer_codigo_puro(code, parser):\n",
"    \"\"\"Extract Markdown fenced code blocks and keep only BNF-valid lines.\n",
"\n",
"    Each ``` ... ``` block is split into lines; lines that fail to parse\n",
"    are reported and dropped. Returns one joined string per block that\n",
"    still contains at least one valid line.\n",
"    \"\"\"\n",
"    # Pull the contents out of every Markdown code fence ``` ... ```\n",
"    fenced_blocks = re.findall(r'```(.*?)```', code, re.DOTALL)\n",
"\n",
"    validated_chunks = []\n",
"    for block in fenced_blocks:\n",
"        kept_lines = []\n",
"        for raw in block.strip().split('\\n'):\n",
"            linea = raw.strip()\n",
"            if not linea:\n",
"                continue\n",
"            try:\n",
"                # Validate each line against the BNF grammar\n",
"                parser.parse(linea)\n",
"            except Exception as e:\n",
"                print(f\"⚠️ Línea saltada (no cumple BNF): {linea}\")\n",
"                continue\n",
"            kept_lines.append(linea)\n",
"\n",
"        if kept_lines:\n",
"            validated_chunks.append(\"\\n\".join(kept_lines))\n",
"\n",
"    return validated_chunks"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "8ed54f3f",
"metadata": {},
"outputs": [],
"source": [
"def procesar_snippet_con_metadata(code, parser):\n",
"    \"\"\"Turn numbered Markdown snippets into Elasticsearch-ready documents.\n",
"\n",
"    Splits ``code`` on numbered bold headings (e.g. ``1. **Name**``),\n",
"    extracts the fenced code of each section, keeps only BNF-valid lines\n",
"    and wraps the result in a document dict with title and metadata.\n",
"    \"\"\"\n",
"    # 1. Split on the numbered headings (e.g. \"1. **Name**\")\n",
"    patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n",
"    # split() alternates captured title / following body, hence step 2 below\n",
"    bloques = patron_bloque.split(code)[1:]\n",
"\n",
"    documentos_elastic = []\n",
"\n",
"    for i in range(0, len(bloques), 2):\n",
"        titulo = bloques[i].strip()\n",
"        contenido_bruto = bloques[i+1]\n",
"\n",
"        # 2. Extract the code inside the triple backticks ```\n",
"        codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n",
"        if codigo_match:\n",
"            codigo_bloque = codigo_match.group(1).strip()\n",
"\n",
"            # 3. Validate every line with the Lark parser\n",
"            lineas_validas = []\n",
"            for linea in codigo_bloque.split('\\n'):\n",
"                linea_clean = linea.strip()\n",
"                if linea_clean:\n",
"                    try:\n",
"                        parser.parse(linea_clean)\n",
"                        lineas_validas.append(linea_clean)\n",
"                    # fix: a bare `except:` here also swallowed\n",
"                    # KeyboardInterrupt/SystemExit\n",
"                    except Exception:\n",
"                        print(f\"⚠️ Error sintáctico (BNF) en: {linea_clean}\")\n",
"\n",
"            # 4. Build the DOCUMENT payload for Elasticsearch\n",
"            if lineas_validas:\n",
"                codigo_final = \"\\n\".join(lineas_validas)\n",
"\n",
"                doc = {\n",
"                    \"_id\": str(uuid.uuid4()),  # unique id to avoid collisions\n",
"                    \"title\": titulo,\n",
"                    \"content\": codigo_final,  # main field consumed by the RAG\n",
"                    \"metadata\": {\n",
"                        \"tipo_bloque\": \"ejemplo_fundamentos\",\n",
"                        \"lenguaje\": \"AVAP\",\n",
"                        \"line_count\": len(lineas_validas),\n",
"                        \"char_count\": len(codigo_final),\n",
"                        # \"ingested_at\": datetime.now().isoformat()  # enable to record ingestion time\n",
"                    }\n",
"                }\n",
"                documentos_elastic.append(doc)\n",
"\n",
"    return documentos_elastic"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "560f9f86",
"metadata": {},
"outputs": [],
"source": [
"# Normalise heterogeneous items into langchain `Document`s before indexing\n",
"def ensure_documents(items):\n",
"    \"\"\"Coerce each item (Document, dict, str, chunk object, ...) to a Document.\"\"\"\n",
"    normalised: list[Document] = []\n",
"    for item in items:\n",
"        if isinstance(item, Document):\n",
"            doc = item\n",
"        elif isinstance(item, dict):\n",
"            content = item.get(\"content\") or item.get(\"page_content\") or item.get(\"text\") or item.get(\"code\") or str(item)\n",
"            doc_id = item.get(\"_id\") or item.get(\"id\") or str(uuid.uuid4())\n",
"            meta = item.get(\"metadata\") or {k: v for k, v in item.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n",
"            doc = Document(id=doc_id, page_content=content, metadata=meta)\n",
"        elif isinstance(item, str):\n",
"            doc = Document(id=str(uuid.uuid4()), page_content=item, metadata={\"source\": \"chunked_code\"})\n",
"        elif hasattr(item, \"text\") or hasattr(item, \"token_count\"):\n",
"            # chunker objects expose `.text` (and sometimes `.token_count`)\n",
"            txt = getattr(item, \"text\", str(item))\n",
"            doc = Document(id=str(uuid.uuid4()), page_content=txt, metadata={\"source\": \"chunker\", \"token_count\": getattr(item, \"token_count\", None)})\n",
"        else:\n",
"            # fallback: stringify anything unrecognised\n",
"            doc = Document(id=str(uuid.uuid4()), page_content=str(item), metadata={\"source\": \"unknown\", \"orig_type\": type(item).__name__})\n",
"        normalised.append(doc)\n",
"    return normalised"
]
},
{
"cell_type": "markdown",
"id": "23a92e13",
"metadata": {},
"source": [
"## BNF"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6e777f53",
"metadata": {},
"outputs": [],
"source": [
"parser = Lark(grammar)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2981a944",
"metadata": {},
"outputs": [],
"source": [
"code_chunks = extract_rag_chunks(source_code=code, parser=parser)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "950b5789",
"metadata": {},
"outputs": [],
"source": [
"chunked_code = extraer_codigo_puro(code=code, parser=parser)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3be3c168",
"metadata": {},
"outputs": [],
"source": [
"code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)"
]
},
{
"cell_type": "markdown",
"id": "77f6c552",
"metadata": {},
"source": [
"## Elastic Search"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "09ce3e29",
"metadata": {},
"outputs": [],
"source": [
"es = Elasticsearch(\n",
" ELASTICSEARCH_URL,\n",
" request_timeout=120,\n",
" max_retries=5,\n",
" retry_on_timeout=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d575c386",
"metadata": {},
"outputs": [],
"source": [
"if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n",
" es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "40ea0af8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"avap-docs-test\n"
]
}
],
"source": [
"for index in es.indices.get(index=\"*\"):\n",
" print(index)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4e091b39",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ed4c817",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prepared docs: 50\n"
]
}
],
"source": [
"\n",
"\n",
"# convert current list to Documents\n",
"docs_to_index = ensure_documents(chunked_code)\n",
"print(\"Prepared docs:\", len(docs_to_index))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "5aff21c0",
"metadata": {},
"outputs": [],
"source": [
"# index into Elasticsearch\n",
"db = ElasticsearchStore.from_documents(\n",
" docs_to_index,\n",
" embeddings,\n",
" client=es,\n",
" index_name=ELASTICSEARCH_DOCS_INDEX,\n",
" distance_strategy=\"COSINE\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74c0a377",
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check the index contents. Note: the `body=` parameter is deprecated\n",
"# in elasticsearch-py 8.x -- pass `query`/`size` as top-level kwargs instead.\n",
"response = es.search(\n",
"    index=ELASTICSEARCH_DOCS_INDEX,\n",
"    query={\"match_all\": {}},\n",
"    size=10,\n",
")\n",
"\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
"    print(\"ID:\", hit[\"_id\"])\n",
"    print(\"Source:\", hit[\"_source\"])\n",
"    print(\"-\" * 40)"
]
},
{
"cell_type": "markdown",
"id": "d823650e",
"metadata": {},
"source": [
"# Retrieve"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5732a27d",
"metadata": {},
"outputs": [],
"source": [
"base_retriever = db.as_retriever(\n",
" search_type=\"similarity\",\n",
" search_kwargs={\"k\": 5}\n",
" ) \n",
"\n",
"docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8706506f",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"\n",
"vector_store = ElasticsearchStore(\n",
" client=es,\n",
" index_name=ELASTICSEARCH_DOCS_INDEX,\n",
" embedding=embeddings,\n",
" query_field=\"text\",\n",
" vector_query_field=\"vector\",\n",
")\n",
"\n",
"results = vector_store.similarity_search_with_score(\n",
" query=\"What data types does AVAP have?\",\n",
" k=50\n",
")\n",
"\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -22,9 +22,9 @@ if [ ! -f "$KUBECONFIG_PATH" ]; then
fi
# 1. AI Model Tunnel (Ollama)
# echo -e "${YELLOW}[1/3]${NC} Starting Ollama Light Service tunnel (localhost:11434)..."
# kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig "$KUBECONFIG_PATH" &
# OLLAMA_PID=$!
echo -e "${YELLOW}[1/3]${NC} Starting Ollama Light Service tunnel (localhost:11434)..."
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig "$KUBECONFIG_PATH" &
OLLAMA_PID=$!
# 2. Knowledge Base Tunnel (Elasticsearch)
echo -e "${YELLOW}[2/3]${NC} Starting Elasticsearch Vector DB tunnel (localhost:9200)..."