From 63c5fc976f4db1f5ab6aecf1a5b633f0b05ca705 Mon Sep 17 00:00:00 2001
From: pseco <pseco@mrhouston.net>
Date: Tue, 3 Mar 2026 15:07:53 +0100
Subject: [PATCH] Dual Index: build code-snippet Documents and index them into the code index

---
 .../pseco/agent/n00 LangGraph Agent v2.ipynb  |   3 +-
 .../pseco/ingestion/n00 Dual Index v3.ipynb   | 169 ++++++++++--------
 2 files changed, 101 insertions(+), 71 deletions(-)

diff --git a/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb b/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb
index d3840e0..bf6680f 100644
--- a/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb	
+++ b/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb	
@@ -39,13 +39,14 @@
         },
         {
             "cell_type": "code",
-            "execution_count": 146,
+            "execution_count": null,
             "id": "30edcecc",
             "metadata": {},
             "outputs": [],
             "source": [
                 "ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
                 "INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
+                "CODE_INDE\n",
                 "BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
                 "MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
                 "EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
diff --git a/scratches/pseco/ingestion/n00 Dual Index v3.ipynb b/scratches/pseco/ingestion/n00 Dual Index v3.ipynb
index 50fe4d2..6881e7c 100644
--- a/scratches/pseco/ingestion/n00 Dual Index v3.ipynb	
+++ b/scratches/pseco/ingestion/n00 Dual Index v3.ipynb	
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 39,
    "id": "0a8abbfa",
    "metadata": {},
    "outputs": [
@@ -12,7 +12,7 @@
        "True"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 40,
    "id": "5c9d292b",
    "metadata": {},
    "outputs": [],
@@ -58,7 +58,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 41,
    "id": "0e1cd9b9",
    "metadata": {},
    "outputs": [],
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 42,
    "id": "ca43bd67",
    "metadata": {},
    "outputs": [],
@@ -138,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
    "id": "7969500e",
    "metadata": {},
    "outputs": [],
@@ -167,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 44,
    "id": "7c67fa0b",
    "metadata": {},
    "outputs": [],
@@ -195,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 45,
    "id": "ab8c2b9b",
    "metadata": {},
    "outputs": [],
@@ -229,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 46,
    "id": "8ed54f3f",
    "metadata": {},
    "outputs": [],
@@ -270,7 +270,6 @@
     "                    \"title\": titulo,\n",
     "                    \"content\": codigo_final,   # El campo principal para el RAG\n",
     "                    \"metadata\": {\n",
-    "                        \"tipo_bloque\": \"ejemplo_fundamentos\",\n",
     "                        \"lenguaje\": \"AVAP\",\n",
     "                        \"line_count\": len(lineas_validas),\n",
     "                        \"char_count\": len(codigo_final),\n",
@@ -284,7 +283,61 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 47,
+   "id": "fb52d20b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def procesar_snippet_a_documentos(code, parser):\n",
+    "    \"\"\"Build one Document per numbered snippet section found in `code`.\n",
+    "\n",
+    "    For each '<n>. **Title**' section the first fenced block is taken, every\n",
+    "    non-empty line is checked with parser.parse, and the accepted lines are kept.\n",
+    "    Returns the list of Documents; sections with no accepted line are skipped.\"\"\"\n",
+    "    # 1. Split on numbered headings (e.g. 1. **Name**); [1:] drops the text before the first one\n",
+    "    patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n",
+    "    bloques = patron_bloque.split(code)[1:]\n",
+    "    documentos_finales = [] # Document objects are collected here\n",
+    "    \n",
+    "    for i in range(0, len(bloques), 2):\n",
+    "        titulo = bloques[i].strip()\n",
+    "        contenido_bruto = bloques[i+1]\n",
+    "        # 2. Extract the code inside ``` ... ```\n",
+    "        codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n",
+    "        if codigo_match:\n",
+    "            codigo_bloque = codigo_match.group(1).strip()\n",
+    "            # 3. Validate each line with the parser\n",
+    "            lineas_validas = []\n",
+    "            for linea in codigo_bloque.split('\\n'):\n",
+    "                linea_clean = linea.strip()\n",
+    "                if linea_clean:\n",
+    "                    try:\n",
+    "                        parser.parse(linea_clean)\n",
+    "                        lineas_validas.append(linea_clean)\n",
+    "                    except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit propagate\n",
+    "                        print(f\"⚠️ Error BNF: {linea_clean}\")\n",
+    "            # 4. Create the Document object directly\n",
+    "            if lineas_validas:\n",
+    "                codigo_final = \"\\n\".join(lineas_validas)\n",
+    "                # Build the Document with its ID and metadata included\n",
+    "                doc = Document(\n",
+    "                    id=str(uuid.uuid4()),\n",
+    "                    page_content=codigo_final,\n",
+    "                    metadata={\n",
+    "                        \"title\": titulo,\n",
+    "                        \"language\": \"AVAP\",\n",
+    "                        \"type\": \"code_snippet\",\n",
+    "                        \"line_count\": len(lineas_validas)\n",
+    "                    }\n",
+    "                )\n",
+    "                documentos_finales.append(doc)\n",
+    "                \n",
+    "    return documentos_finales"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
    "id": "560f9f86",
    "metadata": {},
    "outputs": [],
@@ -297,9 +350,26 @@
     "            out.append(x)\n",
     "            continue\n",
     "        if isinstance(x, dict):\n",
-    "            content = x.get(\"content\") or x.get(\"page_content\") or x.get(\"text\") or x.get(\"code\") or str(x)\n",
+    "            # 1. Extraemos el contenido buscando en todas las posibles claves de tu parser\n",
+    "            content = x.get(\"content\") or x.get(\"codigo\") or x.get(\"page_content\") or x.get(\"text\") or str(x)\n",
+    "            \n",
+    "            # 2. Buscamos el ID\n",
     "            id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n",
-    "            metadata = x.get(\"metadata\") or {k: v for k, v in x.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n",
+    "            \n",
+    "            # 3. LÓGICA DE METADATOS: Combinamos el objeto 'metadata' con campos sueltos como 'titulo'\n",
+    "            # Empezamos con lo que haya en la clave 'metadata'\n",
+    "            metadata = x.get(\"metadata\", {}).copy()\n",
+    "            \n",
+    "            # Añadimos campos útiles que pusiste en la raíz del dict (como el título)\n",
+    "            if \"titulo\" in x:\n",
+    "                metadata[\"title\"] = x[\"titulo\"]\n",
+    "            if \"tipo_bloque\" in x:\n",
+    "                metadata[\"tipo_bloque\"] = x[\"tipo_bloque\"]\n",
+    "                \n",
+    "            # Si no hay metadatos en absoluto, ponemos una fuente por defecto\n",
+    "            if not metadata:\n",
+    "                metadata = {\"source\": \"parsed_code\"}\n",
+    "\n",
     "            out.append(Document(id=id_, page_content=content, metadata=metadata))\n",
     "            continue\n",
     "        if isinstance(x, str):\n",
@@ -325,7 +395,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 49,
    "id": "6e777f53",
    "metadata": {},
    "outputs": [],
@@ -335,32 +405,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "2981a944",
+   "execution_count": 50,
+   "id": "b01c3fda",
    "metadata": {},
    "outputs": [],
    "source": [
-    "code_chunks = extract_rag_chunks(source_code=code, parser=parser)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "950b5789",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chunked_code = extraer_codigo_puro(code=code, parser=parser)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "3be3c168",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)"
+    "code_chunks = procesar_snippet_a_documentos(code=code, parser=parser)"
    ]
   },
   {
@@ -373,7 +423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 51,
    "id": "09ce3e29",
    "metadata": {},
    "outputs": [],
@@ -388,18 +438,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 52,
    "id": "d575c386",
    "metadata": {},
    "outputs": [],
    "source": [
-    "if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n",
-    "    es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)"
+    "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
+    "    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 56,
    "id": "40ea0af8",
    "metadata": {},
    "outputs": [
@@ -407,6 +457,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "avap-code\n",
       "avap-docs-test\n"
      ]
     }
@@ -418,7 +469,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 54,
    "id": "4e091b39",
    "metadata": {},
    "outputs": [
@@ -428,7 +479,7 @@
        "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 54,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -440,39 +491,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "1ed4c817",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Prepared docs: 50\n"
-     ]
-    }
-   ],
-   "source": [
-    "\n",
-    "\n",
-    "# convert current list to Documents\n",
-    "docs_to_index = ensure_documents(chunked_code)\n",
-    "print(\"Prepared docs:\", len(docs_to_index))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 55,
    "id": "5aff21c0",
    "metadata": {},
    "outputs": [],
    "source": [
     "# index into Elasticsearch\n",
     "db = ElasticsearchStore.from_documents(\n",
-    "    docs_to_index,\n",
+    "    code_chunks,\n",
     "    embeddings,\n",
     "    client=es,\n",
-    "    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
+    "    index_name=ELASTICSEARCH_CODE_INDEX,\n",
     "    distance_strategy=\"COSINE\",\n",
     ")"
    ]
@@ -485,7 +514,7 @@
    "outputs": [],
    "source": [
     "response = es.search(\n",
-    "    index=ELASTICSEARCH_DOCS_INDEX,\n",
+    "    index=ELASTICSEARCH_CODE_INDEX,\n",
     "    body={\n",
     "        \"query\": {\"match_all\": {}},\n",
     "        \"size\": 10 \n",