Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev
This commit is contained in:
commit
dcc07495e5
|
|
@ -39,13 +39,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 146,
|
||||
"execution_count": null,
|
||||
"id": "30edcecc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
|
||||
"INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
|
||||
"CODE_INDE\n",
|
||||
"BASE_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
|
||||
"MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
|
||||
"EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 39,
|
||||
"id": "0a8abbfa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
|
@ -12,7 +12,7 @@
|
|||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -38,7 +38,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 40,
|
||||
"id": "5c9d292b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -58,7 +58,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 41,
|
||||
"id": "0e1cd9b9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -81,7 +81,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 42,
|
||||
"id": "ca43bd67",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -138,7 +138,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 43,
|
||||
"id": "7969500e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -167,7 +167,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 44,
|
||||
"id": "7c67fa0b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -195,7 +195,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 45,
|
||||
"id": "ab8c2b9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -229,7 +229,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 46,
|
||||
"id": "8ed54f3f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -270,7 +270,6 @@
|
|||
" \"title\": titulo,\n",
|
||||
" \"content\": codigo_final, # El campo principal para el RAG\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"tipo_bloque\": \"ejemplo_fundamentos\",\n",
|
||||
" \"lenguaje\": \"AVAP\",\n",
|
||||
" \"line_count\": len(lineas_validas),\n",
|
||||
" \"char_count\": len(codigo_final),\n",
|
||||
|
|
@ -284,7 +283,61 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 47,
|
||||
"id": "fb52d20b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def procesar_snippet_a_documentos(code, parser):\n",
|
||||
" # 1. Separar por bloques numerados (ej: 1. **Nombre**)\n",
|
||||
" patron_bloque = re.compile(r'\\d+\\.\\s+\\*\\*(.*?)\\*\\*')\n",
|
||||
" bloques = patron_bloque.split(code)[1:] \n",
|
||||
" \n",
|
||||
" documentos_finales = [] # Aquí guardaremos objetos Document\n",
|
||||
" \n",
|
||||
" for i in range(0, len(bloques), 2):\n",
|
||||
" titulo = bloques[i].strip()\n",
|
||||
" contenido_bruto = bloques[i+1]\n",
|
||||
" \n",
|
||||
" # 2. Extraer código dentro de ``` ... ```\n",
|
||||
" codigo_match = re.search(r'```(.*?)```', contenido_bruto, re.DOTALL)\n",
|
||||
" if codigo_match:\n",
|
||||
" codigo_bloque = codigo_match.group(1).strip()\n",
|
||||
" \n",
|
||||
" # 3. Validar con Lark cada línea\n",
|
||||
" lineas_validas = []\n",
|
||||
" for linea in codigo_bloque.split('\\n'):\n",
|
||||
" linea_clean = linea.strip()\n",
|
||||
" if linea_clean:\n",
|
||||
" try:\n",
|
||||
" parser.parse(linea_clean)\n",
|
||||
" lineas_validas.append(linea_clean)\n",
|
||||
" except:\n",
|
||||
" print(f\"⚠️ Error BNF: {linea_clean}\")\n",
|
||||
" \n",
|
||||
" # 4. CREACIÓN DIRECTA DEL OBJETO DOCUMENT\n",
|
||||
" if lineas_validas:\n",
|
||||
" codigo_final = \"\\n\".join(lineas_validas)\n",
|
||||
" \n",
|
||||
" # Construimos el Documento con su ID y Metadatos integrados\n",
|
||||
" doc = Document(\n",
|
||||
" id=str(uuid.uuid4()),\n",
|
||||
" page_content=codigo_final,\n",
|
||||
" metadata={\n",
|
||||
" \"title\": titulo,\n",
|
||||
" \"language\": \"AVAP\",\n",
|
||||
" \"type\": \"code_snippet\",\n",
|
||||
" \"line_count\": len(lineas_validas)\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" documentos_finales.append(doc)\n",
|
||||
" \n",
|
||||
" return documentos_finales"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "560f9f86",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -297,9 +350,26 @@
|
|||
" out.append(x)\n",
|
||||
" continue\n",
|
||||
" if isinstance(x, dict):\n",
|
||||
" content = x.get(\"content\") or x.get(\"page_content\") or x.get(\"text\") or x.get(\"code\") or str(x)\n",
|
||||
" # 1. Extraemos el contenido buscando en todas las posibles claves de tu parser\n",
|
||||
" content = x.get(\"content\") or x.get(\"codigo\") or x.get(\"page_content\") or x.get(\"text\") or str(x)\n",
|
||||
" \n",
|
||||
" # 2. Buscamos el ID\n",
|
||||
" id_ = x.get(\"_id\") or x.get(\"id\") or str(uuid.uuid4())\n",
|
||||
" metadata = x.get(\"metadata\") or {k: v for k, v in x.items() if k not in (\"_id\", \"id\", \"content\", \"page_content\", \"text\", \"code\")}\n",
|
||||
" \n",
|
||||
" # 3. LÓGICA DE METADATOS: Combinamos el objeto 'metadata' con campos sueltos como 'titulo'\n",
|
||||
" # Empezamos con lo que haya en la clave 'metadata'\n",
|
||||
" metadata = x.get(\"metadata\", {}).copy()\n",
|
||||
" \n",
|
||||
" # Añadimos campos útiles que pusiste en la raíz del dict (como el título)\n",
|
||||
" if \"titulo\" in x:\n",
|
||||
" metadata[\"title\"] = x[\"titulo\"]\n",
|
||||
" if \"tipo_bloque\" in x:\n",
|
||||
" metadata[\"tipo_bloque\"] = x[\"tipo_bloque\"]\n",
|
||||
" \n",
|
||||
" # Si no hay metadatos en absoluto, ponemos una fuente por defecto\n",
|
||||
" if not metadata:\n",
|
||||
" metadata = {\"source\": \"parsed_code\"}\n",
|
||||
"\n",
|
||||
" out.append(Document(id=id_, page_content=content, metadata=metadata))\n",
|
||||
" continue\n",
|
||||
" if isinstance(x, str):\n",
|
||||
|
|
@ -325,7 +395,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 49,
|
||||
"id": "6e777f53",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -335,32 +405,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "2981a944",
|
||||
"execution_count": 50,
|
||||
"id": "b01c3fda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"code_chunks = extract_rag_chunks(source_code=code, parser=parser)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "950b5789",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chunked_code = extraer_codigo_puro(code=code, parser=parser)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "3be3c168",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"code_snippets=procesar_snippet_con_metadata(code=code, parser=parser)"
|
||||
"code_chunks = procesar_snippet_a_documentos(code=code, parser=parser)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -373,7 +423,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 51,
|
||||
"id": "09ce3e29",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
|
@ -388,18 +438,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 52,
|
||||
"id": "d575c386",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if es.indices.exists(index=ELASTICSEARCH_DOCS_INDEX):\n",
|
||||
" es.indices.delete(index=ELASTICSEARCH_DOCS_INDEX)"
|
||||
"if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
|
||||
" es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 56,
|
||||
"id": "40ea0af8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
|
@ -407,6 +457,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"avap-code\n",
|
||||
"avap-docs-test\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -418,7 +469,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 54,
|
||||
"id": "4e091b39",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
|
@ -428,7 +479,7 @@
|
|||
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -440,39 +491,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1ed4c817",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prepared docs: 50\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"# convert current list to Documents\n",
|
||||
"docs_to_index = ensure_documents(chunked_code)\n",
|
||||
"print(\"Prepared docs:\", len(docs_to_index))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 55,
|
||||
"id": "5aff21c0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# index into Elasticsearch\n",
|
||||
"db = ElasticsearchStore.from_documents(\n",
|
||||
" docs_to_index,\n",
|
||||
" code_chunks,\n",
|
||||
" embeddings,\n",
|
||||
" client=es,\n",
|
||||
" index_name=ELASTICSEARCH_DOCS_INDEX,\n",
|
||||
" index_name=ELASTICSEARCH_CODE_INDEX,\n",
|
||||
" distance_strategy=\"COSINE\",\n",
|
||||
")"
|
||||
]
|
||||
|
|
@ -485,7 +514,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"response = es.search(\n",
|
||||
" index=ELASTICSEARCH_DOCS_INDEX,\n",
|
||||
" index=ELASTICSEARCH_CODE_INDEX,\n",
|
||||
" body={\n",
|
||||
" \"query\": {\"match_all\": {}},\n",
|
||||
" \"size\": 10 \n",
|
||||
|
|
|
|||
Loading…
Reference in New Issue