From 63c5fc976f4db1f5ab6aecf1a5b633f0b05ca705 Mon Sep 17 00:00:00 2001 From: pseco Date: Tue, 3 Mar 2026 15:07:53 +0100 Subject: [PATCH] working on Dual Index --- .../pseco/agent/n00 LangGraph Agent v2.ipynb | 3 +- .../pseco/ingestion/n00 Dual Index v3.ipynb | 169 ++++++++++-------- 2 files changed, 101 insertions(+), 71 deletions(-) diff --git a/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb b/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb index d3840e0..bf6680f 100644 --- a/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb +++ b/scratches/pseco/agent/n00 LangGraph Agent v2.ipynb @@ -39,13 +39,14 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": null, "id": "30edcecc", "metadata": {}, "outputs": [], "source": [ "ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n", "INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n", + "CODE_INDE\n", "BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n", "MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n", "EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n", diff --git a/scratches/pseco/ingestion/n00 Dual Index v3.ipynb b/scratches/pseco/ingestion/n00 Dual Index v3.ipynb index 50fe4d2..6881e7c 100644 --- a/scratches/pseco/ingestion/n00 Dual Index v3.ipynb +++ b/scratches/pseco/ingestion/n00 Dual Index v3.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 28, + "execution_count": 39, "id": "0a8abbfa", "metadata": {}, "outputs": [ @@ -12,7 +12,7 @@ "True" ] }, - "execution_count": 28, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 40, "id": "5c9d292b", "metadata": {}, "outputs": [], @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 41, "id": "0e1cd9b9", "metadata": {}, "outputs": [], @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 42, "id": "ca43bd67", "metadata": {}, "outputs": [], @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 43, "id": "7969500e", "metadata": {}, "outputs": [], @@ -167,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 44, "id": "7c67fa0b", "metadata": {}, "outputs": [], @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 45, "id": "ab8c2b9b", "metadata": {}, "outputs": [], @@ -229,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 46, "id": "8ed54f3f", "metadata": {}, "outputs": [], @@ -270,7 +270,6 @@ " \"title\": titulo,\n", " \"content\": codigo_final, # El campo principal para el RAG\n", " \"metadata\": {\n", - " \"tipo_bloque\": \"ejemplo_fundamentos\",\n", " \"lenguaje\": \"AVAP\",\n", " \"line_count\": len(lineas_validas),\n", " \"char_count\": len(codigo_final),\n", @@ -284,7 +283,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, + "id": "fb52d20b", + "metadata": {}, + "outputs": [], + "source": [ + "def procesar_snippet_a_documentos(code, parser):\n", + " # 1. Separar por bloques numerados (ej: 1. **Nombre**)\n", + " patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n", + " bloques = patron_bloque.split(code)[1:] \n", + " \n", + " documentos_finales = [] # Aquí guardaremos objetos Document\n", + " \n", + " for i in range(0, len(bloques), 2):\n", + " titulo = bloques[i].strip()\n", + " contenido_bruto = bloques[i+1]\n", + " \n", + " # 2. Extraer código dentro de ``` ... ```\n", + " codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n", + " if codigo_match:\n", + " codigo_bloque = codigo_match.group(1).strip()\n", + " \n", + " # 3. Validar con Lark cada línea\n", + " lineas_validas = []\n", + " for linea in codigo_bloque.split('\\n'):\n", + " linea_clean = linea.strip()\n", + " if linea_clean:\n", + " try:\n", + " parser.parse(linea_clean)\n", + " lineas_validas.append(linea_clean)\n", + " except:\n", + " print(f\"⚠️ Error BNF: {linea_clean}\")\n", + " \n", + " # 4. CREACIÓN DIRECTA DEL OBJETO DOCUMENT\n", + " if lineas_validas:\n", + " codigo_final = \"\\n\".join(lineas_validas)\n", + " \n", + " # Construimos el Documento con su ID y Metadatos integrados\n", + " doc = Document(\n", + " id=str(uuid.uuid4()),\n", + " page_content=codigo_final,\n", + " metadata={\n", + " \"title\": titulo,\n", + " \"language\": \"AVAP\",\n", + " \"type\": \"code_snippet\",\n", + " \"line_count\": len(lineas_validas)\n", + " }\n", + " )\n", + " documentos_finales.append(doc)\n", + " \n", + " return documentos_finales" + ] + }, + { + "cell_type": "code", + "execution_count": 48, "id": "560f9f86", "metadata": {}, "outputs": [], @@ -297,9 +350,26 @@ " out.append(x)\n", " continue\n", " if isinstance(x, dict):\n", - " content = x.get(\"content\") or x.get(\"page_content\") or x.get(\"text\") or x.get(\"code\") or str(x)\n", + " # 1. Extraemos el contenido buscando en todas las posibles claves de tu parser\n", + " content = x.get(\"content\") or x.get(\"codigo\") or x.get(\"page_content\") or x.get(\"text\") or str(x)\n", + " \n", + " # 2. Buscamos el ID\n", " id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n", - " metadata = x.get(\"metadata\") or {k: v for k, v in x.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n", + " \n", + " # 3. LÓGICA DE METADATOS: Combinamos el objeto 'metadata' con campos sueltos como 'titulo'\n", + " # Empezamos con lo que haya en la clave 'metadata'\n", + " metadata = x.get(\"metadata\", {}).copy()\n", + " \n", + " # Añadimos campos útiles que pusiste en la raíz del dict (como el título)\n", + " if \"titulo\" in x:\n", + " metadata[\"title\"] = x[\"titulo\"]\n", + " if \"tipo_bloque\" in x:\n", + " metadata[\"tipo_bloque\"] = x[\"tipo_bloque\"]\n", + " \n", + " # Si no hay metadatos en absoluto, ponemos una fuente por defecto\n", + " if not metadata:\n", + " metadata = {\"source\": \"parsed_code\"}\n", + "\n", " out.append(Document(id=id_, page_content=content, metadata=metadata))\n", " continue\n", " if isinstance(x, str):\n", @@ -325,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 49, "id": "6e777f53", "metadata": {}, "outputs": [], @@ -335,32 +405,12 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "2981a944", + "execution_count": 50, + "id": "b01c3fda", "metadata": {}, "outputs": [], "source": [ - "code_chunks = extract_rag_chunks(source_code=code, parser=parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "950b5789", - "metadata": {}, - "outputs": [], - "source": [ - "chunked_code = extraer_codigo_puro(code=code, parser=parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "3be3c168", - "metadata": {}, - "outputs": [], - "source": [ - "code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)" + "code_chunks = procesar_snippet_a_documentos(code=code, parser=parser)" ] }, { @@ -373,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 51, "id": "09ce3e29", "metadata": {}, "outputs": [], @@ -388,18 +438,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 52, "id": "d575c386", "metadata": {}, "outputs": [], "source": [ - "if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n", - " es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)" + "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n", + " es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 56, "id": "40ea0af8", "metadata": {}, "outputs": [ @@ -407,6 +457,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "avap-code\n", "avap-docs-test\n" ] } @@ -418,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 54, "id": "4e091b39", "metadata": {}, "outputs": [ @@ -428,7 +479,7 @@ "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)" ] }, - "execution_count": 19, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -440,39 +491,17 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "1ed4c817", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prepared docs: 50\n" - ] - } - ], - "source": [ - "\n", - "\n", - "# convert current list to Documents\n", - "docs_to_index = ensure_documents(chunked_code)\n", - "print(\"Prepared docs:\", len(docs_to_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, + "execution_count": 55, "id": "5aff21c0", "metadata": {}, "outputs": [], "source": [ "# index into Elasticsearch\n", "db = ElasticsearchStore.from_documents(\n", - " docs_to_index,\n", + " code_chunks,\n", " embeddings,\n", " client=es,\n", - " index_name=ELASTICSEARCH_DOCS_INDEX,\n", + " index_name=ELASTICSEARCH_CODE_INDEX,\n", " distance_strategy=\"COSINE\",\n", ")" ] @@ -485,7 +514,7 @@ "outputs": [], "source": [ "response = es.search(\n", - " index=ELASTICSEARCH_DOCS_INDEX,\n", + " index=ELASTICSEARCH_CODE_INDEX,\n", " body={\n", " \"query\": {\"match_all\": {}},\n", " \"size\": 10 \n",