working on Dual Index

2026-03-03 15:07:53 +01:00 · 2026-03-03 15:07:53 +01:00 · 63c5fc976f
parent 9575af3ff0
commit 63c5fc976f
2 changed files with 101 additions and 71 deletions
--- a/scratches/pseco/agent/n00
+++ b/scratches/pseco/agent/n00
@ -39,13 +39,14 @@
        },
        {
            "cell_type": "code",
-            "execution_count": 146,
+            "execution_count": null,
            "id": "30edcecc",
            "metadata": {},
            "outputs": [],
            "source": [
                "ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
                "INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
                "CODE_INDE\n",
                "BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
                "MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
                "EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
--- a/scratches/pseco/ingestion/n00
+++ b/scratches/pseco/ingestion/n00
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 39,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [
@ -12,7 +12,7 @@
       "True"
      ]
     },
-     "execution_count": 28,
+     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -38,7 +38,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 40,
   "id": "5c9d292b",
   "metadata": {},
   "outputs": [],
@ -58,7 +58,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 41,
   "id": "0e1cd9b9",
   "metadata": {},
   "outputs": [],
@ -81,7 +81,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 42,
   "id": "ca43bd67",
   "metadata": {},
   "outputs": [],
@ -138,7 +138,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
   "id": "7969500e",
   "metadata": {},
   "outputs": [],
@ -167,7 +167,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 44,
   "id": "7c67fa0b",
   "metadata": {},
   "outputs": [],
@ -195,7 +195,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 45,
   "id": "ab8c2b9b",
   "metadata": {},
   "outputs": [],
@ -229,7 +229,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 46,
   "id": "8ed54f3f",
   "metadata": {},
   "outputs": [],
@ -270,7 +270,6 @@
    "                    \"title\": titulo,\n",
    "                    \"content\": codigo_final,   # El campo principal para el RAG\n",
    "                    \"metadata\": {\n",
    "                        \"tipo_bloque\": \"ejemplo_fundamentos\",\n",
    "                        \"lenguaje\": \"AVAP\",\n",
    "                        \"line_count\": len(lineas_validas),\n",
    "                        \"char_count\": len(codigo_final),\n",
@ -284,7 +283,61 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 47,
   "id": "fb52d20b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def procesar_snippet_a_documentos(code, parser):\n",
    "    # 1. Separar por bloques numerados (ej: 1. **Nombre**)\n",
    "    patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n",
    "    bloques = patron_bloque.split(code)[1:] \n",
    "    \n",
    "    documentos_finales = [] # Aquí guardaremos objetos Document\n",
    "    \n",
    "    for i in range(0, len(bloques), 2):\n",
    "        titulo = bloques[i].strip()\n",
    "        contenido_bruto = bloques[i+1]\n",
    "        \n",
    "        # 2. Extraer código dentro de ``` ... ```\n",
    "        codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n",
    "        if codigo_match:\n",
    "            codigo_bloque = codigo_match.group(1).strip()\n",
    "            \n",
    "            # 3. Validar con Lark cada línea\n",
    "            lineas_validas = []\n",
    "            for linea in codigo_bloque.split('\\n'):\n",
    "                linea_clean = linea.strip()\n",
    "                if linea_clean:\n",
    "                    try:\n",
    "                        parser.parse(linea_clean)\n",
    "                        lineas_validas.append(linea_clean)\n",
    "                    except:\n",
    "                        print(f\"⚠️ Error BNF: {linea_clean}\")\n",
    "            \n",
    "            # 4. CREACIÓN DIRECTA DEL OBJETO DOCUMENT\n",
    "            if lineas_validas:\n",
    "                codigo_final = \"\\n\".join(lineas_validas)\n",
    "                \n",
    "                # Construimos el Documento con su ID y Metadatos integrados\n",
    "                doc = Document(\n",
    "                    id=str(uuid.uuid4()),\n",
    "                    page_content=codigo_final,\n",
    "                    metadata={\n",
    "                        \"title\": titulo,\n",
    "                        \"language\": \"AVAP\",\n",
    "                        \"type\": \"code_snippet\",\n",
    "                        \"line_count\": len(lineas_validas)\n",
    "                    }\n",
    "                )\n",
    "                documentos_finales.append(doc)\n",
    "                \n",
    "    return documentos_finales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "560f9f86",
   "metadata": {},
   "outputs": [],
@ -297,9 +350,26 @@
    "            out.append(x)\n",
    "            continue\n",
    "        if isinstance(x, dict):\n",
-    "            content = x.get(\"content\") or x.get(\"page_content\") or x.get(\"text\") or x.get(\"code\") or str(x)\n",
+    "            # 1. Extraemos el contenido buscando en todas las posibles claves de tu parser\n",
    "            content = x.get(\"content\") or x.get(\"codigo\") or x.get(\"page_content\") or x.get(\"text\") or str(x)\n",
    "            \n",
    "            # 2. Buscamos el ID\n",
    "            id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n",
-    "            metadata = x.get(\"metadata\") or {k: v for k, v in x.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n",
+    "            \n",
    "            # 3. LÓGICA DE METADATOS: Combinamos el objeto 'metadata' con campos sueltos como 'titulo'\n",
    "            # Empezamos con lo que haya en la clave 'metadata'\n",
    "            metadata = x.get(\"metadata\", {}).copy()\n",
    "            \n",
    "            # Añadimos campos útiles que pusiste en la raíz del dict (como el título)\n",
    "            if \"titulo\" in x:\n",
    "                metadata[\"title\"] = x[\"titulo\"]\n",
    "            if \"tipo_bloque\" in x:\n",
    "                metadata[\"tipo_bloque\"] = x[\"tipo_bloque\"]\n",
    "                \n",
    "            # Si no hay metadatos en absoluto, ponemos una fuente por defecto\n",
    "            if not metadata:\n",
    "                metadata = {\"source\": \"parsed_code\"}\n",
    "\n",
    "            out.append(Document(id=id_, page_content=content, metadata=metadata))\n",
    "            continue\n",
    "        if isinstance(x, str):\n",
@ -325,7 +395,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 49,
   "id": "6e777f53",
   "metadata": {},
   "outputs": [],
@ -335,32 +405,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 50,
-   "id": "2981a944",
+   "id": "b01c3fda",
   "metadata": {},
   "outputs": [],
   "source": [
-    "code_chunks = extract_rag_chunks(source_code=code, parser=parser)"
+    "code_chunks = procesar_snippet_a_documentos(code=code, parser=parser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "950b5789",
   "metadata": {},
   "outputs": [],
   "source": [
    "chunked_code = extraer_codigo_puro(code=code, parser=parser)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "3be3c168",
   "metadata": {},
   "outputs": [],
   "source": [
    "code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)"
   ]
  },
  {
@ -373,7 +423,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 51,
   "id": "09ce3e29",
   "metadata": {},
   "outputs": [],
@ -388,18 +438,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 52,
   "id": "d575c386",
   "metadata": {},
   "outputs": [],
   "source": [
-    "if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n",
+    "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
-    "    es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)"
+    "    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 56,
   "id": "40ea0af8",
   "metadata": {},
   "outputs": [
@ -407,6 +457,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "avap-code\n",
      "avap-docs-test\n"
     ]
    }
@ -418,7 +469,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 54,
   "id": "4e091b39",
   "metadata": {},
   "outputs": [
@ -428,7 +479,7 @@
       "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
      ]
     },
-     "execution_count": 19,
+     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -440,39 +491,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 55,
   "id": "1ed4c817",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prepared docs: 50\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "# convert current list to Documents\n",
    "docs_to_index = ensure_documents(chunked_code)\n",
    "print(\"Prepared docs:\", len(docs_to_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "5aff21c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# index into Elasticsearch\n",
    "db = ElasticsearchStore.from_documents(\n",
-    "    docs_to_index,\n",
+    "    code_chunks,\n",
    "    embeddings,\n",
    "    client=es,\n",
-    "    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
+    "    index_name=ELASTICSEARCH_CODE_INDEX,\n",
    "    distance_strategy=\"COSINE\",\n",
    ")"
   ]
@ -485,7 +514,7 @@
   "outputs": [],
   "source": [
    "response = es.search(\n",
-    "    index=ELASTICSEARCH_DOCS_INDEX,\n",
+    "    index=ELASTICSEARCH_CODE_INDEX,\n",
    "    body={\n",
    "        \"query\": {\"match_all\": {}},\n",
    "        \"size\": 10 \n",