working on Dual Index

This commit is contained in:
pseco 2026-03-03 15:07:53 +01:00
parent 9575af3ff0
commit 63c5fc976f
2 changed files with 101 additions and 71 deletions

View File

@ -39,13 +39,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 146, "execution_count": null,
"id": "30edcecc", "id": "30edcecc",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n", "ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n", "INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
"CODE_INDE\n",
"BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n", "BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
"MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n", "MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
"EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n", "EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 39,
"id": "0a8abbfa", "id": "0a8abbfa",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -12,7 +12,7 @@
"True" "True"
] ]
}, },
"execution_count": 28, "execution_count": 39,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -38,7 +38,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 40,
"id": "5c9d292b", "id": "5c9d292b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -58,7 +58,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 41,
"id": "0e1cd9b9", "id": "0e1cd9b9",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -81,7 +81,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 42,
"id": "ca43bd67", "id": "ca43bd67",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -138,7 +138,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 43,
"id": "7969500e", "id": "7969500e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -167,7 +167,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 44,
"id": "7c67fa0b", "id": "7c67fa0b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -195,7 +195,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 45,
"id": "ab8c2b9b", "id": "ab8c2b9b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -229,7 +229,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 46,
"id": "8ed54f3f", "id": "8ed54f3f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -270,7 +270,6 @@
" \"title\": titulo,\n", " \"title\": titulo,\n",
" \"content\": codigo_final, # El campo principal para el RAG\n", " \"content\": codigo_final, # El campo principal para el RAG\n",
" \"metadata\": {\n", " \"metadata\": {\n",
" \"tipo_bloque\": \"ejemplo_fundamentos\",\n",
" \"lenguaje\": \"AVAP\",\n", " \"lenguaje\": \"AVAP\",\n",
" \"line_count\": len(lineas_validas),\n", " \"line_count\": len(lineas_validas),\n",
" \"char_count\": len(codigo_final),\n", " \"char_count\": len(codigo_final),\n",
@ -284,7 +283,61 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 47,
"id": "fb52d20b",
"metadata": {},
"outputs": [],
"source": [
"def procesar_snippet_a_documentos(code, parser):\n",
"    \"\"\"Split numbered sections of ``code`` into validated ``Document`` objects.\n",
"\n",
"    A section starts at a heading of the form ``N. **Title**``. For each section\n",
"    the first triple-backtick fenced span is extracted, every non-blank line is\n",
"    stripped and passed to ``parser.parse``, and the lines that parse without\n",
"    raising are joined with newlines into a single ``Document``.\n",
"\n",
"    Args:\n",
"        code: Text containing the numbered sections.\n",
"        parser: Object exposing ``parse(str)`` that raises on invalid input\n",
"            (presumably a Lark parser -- confirm against the caller).\n",
"\n",
"    Returns:\n",
"        A list of ``Document``, one per section that has a fenced span with at\n",
"        least one valid line. Each has a fresh UUID4 string ``id``, the joined\n",
"        valid lines as ``page_content`` and ``title`` / ``language`` / ``type`` /\n",
"        ``line_count`` metadata.\n",
"    \"\"\"\n",
"    # 1. Split on numbered headings (e.g. 1. **Name**). Because the pattern has\n",
"    #    one capture group, split() alternates title, body, title, body, ...;\n",
"    #    [1:] drops whatever text precedes the first heading.\n",
"    patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n",
"    bloques = patron_bloque.split(code)[1:]\n",
"\n",
"    documentos_finales = []\n",
"\n",
"    # Walk the (title, body) pairs produced by the split.\n",
"    for titulo_bruto, contenido_bruto in zip(bloques[0::2], bloques[1::2]):\n",
"        titulo = titulo_bruto.strip()\n",
"\n",
"        # 2. Extract the first ``` ... ``` span of the section.\n",
"        # NOTE(review): a language tag right after the opening fence would be\n",
"        # captured as the first line and validated like code -- confirm the\n",
"        # snippets carry none.\n",
"        codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n",
"        if not codigo_match:\n",
"            continue\n",
"        codigo_bloque = codigo_match.group(1).strip()\n",
"\n",
"        # 3. Validate each non-blank line with the grammar parser.\n",
"        lineas_validas = []\n",
"        for linea in codigo_bloque.split('\\n'):\n",
"            linea_clean = linea.strip()\n",
"            if not linea_clean:\n",
"                continue\n",
"            try:\n",
"                parser.parse(linea_clean)\n",
"            except Exception:\n",
"                # Best effort: report and drop the invalid line. Catching\n",
"                # Exception (not a bare except) keeps Ctrl-C / SystemExit working.\n",
"                print(f\"⚠️ Error BNF: {linea_clean}\")\n",
"            else:\n",
"                lineas_validas.append(linea_clean)\n",
"\n",
"        # 4. Build the Document directly, with its id and metadata attached.\n",
"        if lineas_validas:\n",
"            codigo_final = \"\\n\".join(lineas_validas)\n",
"            doc = Document(\n",
"                id=str(uuid.uuid4()),\n",
"                page_content=codigo_final,\n",
"                metadata={\n",
"                    \"title\": titulo,\n",
"                    \"language\": \"AVAP\",\n",
"                    \"type\": \"code_snippet\",\n",
"                    \"line_count\": len(lineas_validas)\n",
"                }\n",
"            )\n",
"            documentos_finales.append(doc)\n",
"\n",
"    return documentos_finales"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "560f9f86", "id": "560f9f86",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -297,9 +350,26 @@
" out.append(x)\n", " out.append(x)\n",
" continue\n", " continue\n",
" if isinstance(x, dict):\n", " if isinstance(x, dict):\n",
" content = x.get(\"content\") or x.get(\"page_content\") or x.get(\"text\") or x.get(\"code\") or str(x)\n", " # 1. Extraemos el contenido buscando en todas las posibles claves de tu parser\n",
" content = x.get(\"content\") or x.get(\"codigo\") or x.get(\"page_content\") or x.get(\"text\") or str(x)\n",
" \n",
" # 2. Buscamos el ID\n",
" id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n", " id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n",
" metadata = x.get(\"metadata\") or {k: v for k, v in x.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n", " \n",
" # 3. LÓGICA DE METADATOS: Combinamos el objeto 'metadata' con campos sueltos como 'titulo'\n",
" # Empezamos con lo que haya en la clave 'metadata'\n",
" metadata = x.get(\"metadata\", {}).copy()\n",
" \n",
" # Añadimos campos útiles que pusiste en la raíz del dict (como el título)\n",
" if \"titulo\" in x:\n",
" metadata[\"title\"] = x[\"titulo\"]\n",
" if \"tipo_bloque\" in x:\n",
" metadata[\"tipo_bloque\"] = x[\"tipo_bloque\"]\n",
" \n",
" # Si no hay metadatos en absoluto, ponemos una fuente por defecto\n",
" if not metadata:\n",
" metadata = {\"source\": \"parsed_code\"}\n",
"\n",
" out.append(Document(id=id_, page_content=content, metadata=metadata))\n", " out.append(Document(id=id_, page_content=content, metadata=metadata))\n",
" continue\n", " continue\n",
" if isinstance(x, str):\n", " if isinstance(x, str):\n",
@ -325,7 +395,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 49,
"id": "6e777f53", "id": "6e777f53",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -335,32 +405,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 50,
"id": "2981a944", "id": "b01c3fda",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"code_chunks = extract_rag_chunks(source_code=code, parser=parser)" "code_chunks = procesar_snippet_a_documentos(code=code, parser=parser)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "950b5789",
"metadata": {},
"outputs": [],
"source": [
"chunked_code = extraer_codigo_puro(code=code, parser=parser)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3be3c168",
"metadata": {},
"outputs": [],
"source": [
"code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)"
] ]
}, },
{ {
@ -373,7 +423,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 51,
"id": "09ce3e29", "id": "09ce3e29",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -388,18 +438,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 52,
"id": "d575c386", "id": "d575c386",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n", "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
" es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)" " es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 56,
"id": "40ea0af8", "id": "40ea0af8",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -407,6 +457,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"avap-code\n",
"avap-docs-test\n" "avap-docs-test\n"
] ]
} }
@ -418,7 +469,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 54,
"id": "4e091b39", "id": "4e091b39",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -428,7 +479,7 @@
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)" "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
] ]
}, },
"execution_count": 19, "execution_count": 54,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -440,39 +491,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 55,
"id": "1ed4c817",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prepared docs: 50\n"
]
}
],
"source": [
"\n",
"\n",
"# convert current list to Documents\n",
"docs_to_index = ensure_documents(chunked_code)\n",
"print(\"Prepared docs:\", len(docs_to_index))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "5aff21c0", "id": "5aff21c0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# index into Elasticsearch\n", "# index into Elasticsearch\n",
"db = ElasticsearchStore.from_documents(\n", "db = ElasticsearchStore.from_documents(\n",
" docs_to_index,\n", " code_chunks,\n",
" embeddings,\n", " embeddings,\n",
" client=es,\n", " client=es,\n",
" index_name=ELASTICSEARCH_DOCS_INDEX,\n", " index_name=ELASTICSEARCH_CODE_INDEX,\n",
" distance_strategy=\"COSINE\",\n", " distance_strategy=\"COSINE\",\n",
")" ")"
] ]
@ -485,7 +514,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"response = es.search(\n", "response = es.search(\n",
" index=ELASTICSEARCH_DOCS_INDEX,\n", " index=ELASTICSEARCH_CODE_INDEX,\n",
" body={\n", " body={\n",
" \"query\": {\"match_all\": {}},\n", " \"query\": {\"match_all\": {}},\n",
" \"size\": 10 \n", " \"size\": 10 \n",