assistance-engine/scratches/acano/test_chunker.ipynb

313 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f8ea7a75",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import hashlib\n",
"from typing import Any\n",
"from enum import Enum\n",
"import typer\n",
"import logging\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"from loguru import logger\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from chonkie import SemanticChunker\n",
"\n",
"from src.utils.emb_factory import create_embedding_model\n",
"from scripts.pipelines.tasks.chunk import read_files, get_chunk_docs, convert_chunks_to_document\n",
"from src.config import PROJ_ROOT\n",
"\n",
"ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
"ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
"OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n",
"OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
"AVAP_WEB_DOCS_URL = os.getenv(\"AVAP_WEB_DOCS_URL\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a8b8de3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 15:15:53.994\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n",
"\u001b[32m2026-03-10 15:15:53.996\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"1\n",
"0\n",
"24\n"
]
}
],
"source": [
"docs_folder_path = \"docs\"\n",
"\n",
"logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
"logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
"avap_github_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
"avap_web_docs_intro = read_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"intro\", concatenate=True)\n",
"\n",
"# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter\n",
"chapters = sorted({\n",
" p.name.split(\"_\")[0]\n",
" for p in Path(f\"{docs_folder_path}/developer.avapframework.com\").glob(\"chapter*.md\")\n",
"})\n",
"\n",
"avap_web_docs_chapters = [\n",
" item\n",
" for chapter in chapters\n",
" for item in read_files(\n",
" f\"{docs_folder_path}/developer.avapframework.com\",\n",
" f\"{chapter}_\",\n",
" concatenate=True\n",
" )\n",
"]\n",
"avap_web_docs_appendices = read_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"appendices_\", concatenate=False)\n",
"avap_examples_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
"\n",
"print(len(avap_github_docs))\n",
"print(len(avap_web_docs_intro))\n",
"print(len(avap_web_docs_chapters))\n",
"print(len(avap_web_docs_appendices))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "36abc025",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'content': 'nivel = 5\\nes_admin = nivel >= 10\\naddResult(es_admin)',\n",
" 'title': 'asignacion_booleana.avap'},\n",
" {'content': 'subtotal = 150.50\\niva = subtotal * 0.21\\ntotal = subtotal + iva\\naddResult(total)',\n",
" 'title': 'asignacion_matematica.avap'},\n",
" {'content': 'startLoop(i,1,10)\\n item = \"item_%s\" % i\\n AddvariableToJSON(item,\\'valor_generado\\',mi_json)\\nendLoop()\\naddResult(mi_json)',\n",
" 'title': 'bucle_1_10.avap'},\n",
" {'content': \"registros = ['1','2','3']\\ngetListLen(registros, total)\\ncontador = 0\\nstartLoop(idx, 0, 2)\\n actual = registros[int(idx)]\\nendLoop()\\naddResult(actual)\",\n",
" 'title': 'bucle_longitud_de_datos.avap'},\n",
" {'content': 'getDateTime(\"\", 86400, \"UTC\", expira)\\naddResult(expira)',\n",
" 'title': 'calculo_de_expiracion.avap'},\n",
" {'content': 'addParam(\"client_id\", id_interno)\\naddResult(id_interno)',\n",
" 'title': 'captura_de_id.avap'},\n",
" {'content': 'addParam(emails,emails)\\ngetQueryParamList(lista_correos)\\naddResult(lista_correos)',\n",
" 'title': 'captura_de_listas_multiples.avap'},\n",
" {'content': 'addParam(\"lang\", l)\\nif(l, \"es\", \"=\")\\n addVar(msg, \"Hola\")\\nend()\\naddResult(msg)',\n",
" 'title': 'comparacion_simple.avap'},\n",
" {'content': 'nombre = \"Sistema\"\\nlog = \"Evento registrado por: %s\" % nombre\\naddResult(log)',\n",
" 'title': 'concatenacion_dinamica.avap'},\n",
" {'content': 'datos_cliente = \"datos\"\\naddVar(clave, \"cliente_vip\")\\nAddvariableToJSON(clave, datos_cliente, mi_json_final)\\naddResult(mi_json_final)',\n",
" 'title': 'construccion_dinamica_de_objeto.avap'},\n",
" {'content': 'addParam(\"data_list\", mi_lista)\\ngetListLen(mi_lista, cantidad)\\naddResult(cantidad)',\n",
" 'title': 'contador_de_parametros.avap'},\n",
" {'content': 'stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\\naddResult(fecha_human)',\n",
" 'title': 'conversion_timestamp_legible.avap'},\n",
" {'content': 'addParam(sal_par,saldo)\\nif(saldo, 0, \">\")\\n permitir = True\\nelse()\\n permitir = False\\nend()\\naddResult(permitir)',\n",
" 'title': 'else_estandar.avap'},\n",
" {'content': 'addParam(userrype, user_type)\\naddParam(sells, compras)\\nif(None, None, \" user_type == \\'VIP\\' or compras > 100\")\\n addVar(descuento, 0.20)\\nend()\\naddResult(descuento)',\n",
" 'title': 'expresion_compleja.avap'},\n",
" {'content': 'getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\\naddResult(sql_date)',\n",
" 'title': 'fecha_para_base_de_datos.avap'},\n",
" {'content': 'function suma(a, b){\\n total = a + b\\n return(total)\\n }\\nresultado = suma(10, 20)\\naddResult(resultado)',\n",
" 'title': 'funcion_de_suma.avap'},\n",
" {'content': 'function es_valido(token){\\n response = False\\n if(token, \"SECRET\", \"=\")\\n response = True\\n end()\\n return(response)\\n }\\nautorizado = es_valido(\"SECRET\")\\naddResult(autorizado)',\n",
" 'title': 'funcion_validacion_acceso.avap'},\n",
" {'content': 'randomString(\"[A-Z]\\\\d\", 32, token_seguridad)\\naddResult(token_seguridad)',\n",
" 'title': 'generador_de_tokens_aleatorios.avap'},\n",
" {'content': 'encodeSHA256(\"payload_data\", checksum)\\naddResult(checksum)',\n",
" 'title': 'hash_SHA256_para_integridad.avap'},\n",
" {'content': 'addVar(mensaje, \"Hola mundo desde AVAP\")\\naddResult(mensaje)',\n",
" 'title': 'hola_mundo.avap'},\n",
" {'content': 'addParam(password,pass_nueva)\\npass_antigua = \"password\"\\nif(pass_nueva, pass_antigua, \"!=\")\\n addVar(cambio, \"Contraseña actualizada\")\\nend()\\naddResult(cambio)',\n",
" 'title': 'if_desigualdad.avap'},\n",
" {'content': 'replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\\naddResult(ref_actualizada)',\n",
" 'title': 'limpieza_de_strings.avap'},\n",
" {'content': 'try()\\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\\nexception(e)\\n addVar(_status,500)\\n addResult(\"Error de base de datos\")',\n",
" 'title': 'manejo_error_sql_critico.avap'},\n",
" {'content': 'getDateTime(\"\", 0, \"UTC\", ahora)\\naddResult(ahora)',\n",
" 'title': 'obtencion_timestamp.avap'},\n",
" {'content': 'ormCheckTable(tabla_pruebas,resultado_comprobacion)\\nif(resultado_comprobacion,False,\\'==\\')\\n ormCreateTable(\"username,age\",\\'VARCHAR,INTEGER\\',tabla_pruebas,resultado_creacion)\\nend()\\naddResult(resultado_comprobacion)\\naddResult(resultado_creacion)',\n",
" 'title': 'ormAccessCreate.avap'},\n",
" {'content': 'addParam(\"page\", p)\\naddParam(\"size\", s)\\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\\noffset = int(p) * int(s)\\nlimite = offset + int(s)\\ncontador = 0\\naddResult(offset)\\naddResult(limite)\\nstartLoop(i, 2, limite)\\n actual = registros[int(i)]\\n titulo = \"reg_%s\" % i\\n AddvariableToJSON(titulo, actual, pagina_json)\\nendLoop()\\naddResult(pagina_json)',\n",
" 'title': 'paginacion_dinamica_recursos.avap'},\n",
" {'content': 'addVar(base, 1000)\\naddVar(copia, $base)\\naddResult(copia)',\n",
" 'title': 'referencia_por_valor.avap'},\n",
" {'content': 'addVar(code, 200)\\naddVar(status, \"Success\")\\naddResult(code)\\naddResult(status)',\n",
" 'title': 'respuesta_multiple.avap'},\n",
" {'content': 'encontrado = False\\nstartLoop(i, 1, 10)\\n if(i, 5, \"==\")\\n encontrado = True\\n i = 11 \\n end()\\nendLoop()\\naddResult(encontrado)',\n",
" 'title': 'salida_bucle_correcta.avap'},\n",
" {'content': 'try()\\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta)\\nexception(e)\\n addVar(error_trace, \"Fallo de conexión: %s\" % e)\\n addResult(error_trace)',\n",
" 'title': 'try_catch_request.avap'},\n",
" {'content': 'addParam(\"api_key\", key)\\nif(key, None, \"==\")\\n addVar(_status, 403)\\n addVar(error, \"Acceso denegado: falta API KEY\")\\n addResult(error)\\nend()',\n",
" 'title': 'validacion_de_nulo.avap'},\n",
" {'content': 'addParam(\"rol\", r)\\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\\n acceso = True\\nend()\\naddResult(acceso)',\n",
" 'title': 'validacion_in_pertenece_a_lista.avap'}]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avap_examples_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16a9e8ce",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "679e5f8c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "27e5774d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 15:15:54.053\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n"
]
}
],
"source": [
"logger.info(\"Chunking documents...\")\n",
"chunker = SemanticChunker(\n",
" embedding_model=os.getenv(\"HF_EMB_MODEL_NAME\"),\n",
" chunk_size=2048,\n",
" threshold=0.5,\n",
" skip_window=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a5ce984e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 15:16:04.305\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n",
"\u001b[32m2026-03-10 15:20:42.896\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mget_chunk_docs\u001b[0m:\u001b[36m102\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n",
"\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n",
"\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n"
]
}
],
"source": [
"logger.info(\"Chunking AVAP GitHub docs...\")\n",
"avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)\n",
"\n",
"logger.info(\"Chunking AVAP web docs chapters...\")\n",
"# avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)\n",
"\n",
"logger.info(\"Creating Langchain Document to index...\")\n",
"avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)\n",
"# avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)\n",
"avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)\n",
"avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd1f4d79",
"metadata": {},
"outputs": [],
"source": [
"avap_github_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c24e8a8f",
"metadata": {},
"outputs": [],
"source": [
"avap_web_chapters_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6782a34",
"metadata": {},
"outputs": [],
"source": [
"avap_web_intro_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78c1190e",
"metadata": {},
"outputs": [],
"source": [
"avap_web_appendices_langchain_docs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}