{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "f8ea7a75", "metadata": {}, "outputs": [], "source": [ "import re\n", "import hashlib\n", "from typing import Any\n", "from enum import Enum\n", "import typer\n", "import logging\n", "import os\n", "from pathlib import Path\n", "\n", "from loguru import logger\n", "from elasticsearch import Elasticsearch\n", "from langchain_core.documents import Document\n", "from langchain_elasticsearch import ElasticsearchStore\n", "from langchain_community.embeddings import HuggingFaceEmbeddings\n", "from chonkie import SemanticChunker\n", "\n", "from src.utils.emb_factory import create_embedding_model\n", "from scripts.pipelines.tasks.chunk import read_concat_files, get_chunk_docs, chunks_to_document\n", "from src.config import PROJ_ROOT\n", "\n", "ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n", "OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n", "ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\")\n", "OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n", "OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n", "AVAP_WEB_DOCS_URL = os.getenv(\"AVAP_WEB_DOCS_URL\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "a8b8de3f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 13:58:32.657\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n", "\u001b[32m2026-03-10 13:58:32.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1\n", "1\n", "14\n", "24\n" ] } ], "source": [
"docs_folder_path = \"docs\"\n",
"\n",
"logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
"logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
"\n",
"# Resolve every input folder against PROJ_ROOT so the cell does not depend on the kernel's working directory.\n",
"avap_web_docs_dir = PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\"\n",
"\n",
"avap_github_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
"avap_web_docs_intro = read_concat_files(avap_web_docs_dir, \"intro\", concatenate=True)\n",
"\n",
"# Discover the chapter prefixes (e.g. \"chapter1\") from the chapter*.md files in the web docs folder.\n",
"# Sort by chapter number: a plain string sort would put \"chapter10\" before \"chapter2\".\n",
"chapters = sorted(\n",
"    {p.name.split(\"_\")[0] for p in avap_web_docs_dir.glob(\"chapter*.md\")},\n",
"    key=lambda name: (int(\"\".join(filter(str.isdigit, name)) or 0), name),\n",
")\n",
"if not chapters:\n",
"    logger.warning(f\"No chapter*.md files found in {avap_web_docs_dir}\")\n",
"\n",
"# Read and concatenate the files of each chapter.\n",
"avap_web_docs_chapters = [\n",
"    item\n",
"    for chapter in chapters\n",
"    for item in read_concat_files(avap_web_docs_dir, f\"{chapter}_\", concatenate=True)\n",
"]\n",
"avap_web_docs_appendices = read_concat_files(avap_web_docs_dir, \"appendices_\", concatenate=False)\n",
"avap_examples_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
"\n",
"print(len(avap_github_docs))\n",
"print(len(avap_web_docs_intro))\n",
"print(len(avap_web_docs_chapters))\n",
"print(len(avap_web_docs_appendices))"
] }, { "cell_type": "code", "execution_count": 12, "id": "36abc025", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'content': 'nivel = 5\\nes_admin = nivel >= 10\\naddResult(es_admin)',\n", " 'title': 'asignacion_booleana.avap'},\n", " {'content': 'subtotal = 150.50\\niva = subtotal * 0.21\\ntotal = subtotal + iva\\naddResult(total)',\n", " 'title': 'asignacion_matematica.avap'},\n", " {'content': 'startLoop(i,1,10)\\n item = \"item_%s\" % i\\n AddvariableToJSON(item,\\'valor_generado\\',mi_json)\\nendLoop()\\naddResult(mi_json)',\n", " 'title': 'bucle_1_10.avap'},\n", " {'content': \"registros = ['1','2','3']\\ngetListLen(registros, total)\\ncontador = 
0\\nstartLoop(idx, 0, 2)\\n actual = registros[int(idx)]\\nendLoop()\\naddResult(actual)\",\n", " 'title': 'bucle_longitud_de_datos.avap'},\n", " {'content': 'getDateTime(\"\", 86400, \"UTC\", expira)\\naddResult(expira)',\n", " 'title': 'calculo_de_expiracion.avap'},\n", " {'content': 'addParam(\"client_id\", id_interno)\\naddResult(id_interno)',\n", " 'title': 'captura_de_id.avap'},\n", " {'content': 'addParam(emails,emails)\\ngetQueryParamList(lista_correos)\\naddResult(lista_correos)',\n", " 'title': 'captura_de_listas_multiples.avap'},\n", " {'content': 'addParam(\"lang\", l)\\nif(l, \"es\", \"=\")\\n addVar(msg, \"Hola\")\\nend()\\naddResult(msg)',\n", " 'title': 'comparacion_simple.avap'},\n", " {'content': 'nombre = \"Sistema\"\\nlog = \"Evento registrado por: %s\" % nombre\\naddResult(log)',\n", " 'title': 'concatenacion_dinamica.avap'},\n", " {'content': 'datos_cliente = \"datos\"\\naddVar(clave, \"cliente_vip\")\\nAddvariableToJSON(clave, datos_cliente, mi_json_final)\\naddResult(mi_json_final)',\n", " 'title': 'construccion_dinamica_de_objeto.avap'},\n", " {'content': 'addParam(\"data_list\", mi_lista)\\ngetListLen(mi_lista, cantidad)\\naddResult(cantidad)',\n", " 'title': 'contador_de_parametros.avap'},\n", " {'content': 'stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\\naddResult(fecha_human)',\n", " 'title': 'conversion_timestamp_legible.avap'},\n", " {'content': 'addParam(sal_par,saldo)\\nif(saldo, 0, \">\")\\n permitir = True\\nelse()\\n permitir = False\\nend()\\naddResult(permitir)',\n", " 'title': 'else_estandar.avap'},\n", " {'content': 'addParam(userrype, user_type)\\naddParam(sells, compras)\\nif(None, None, \" user_type == \\'VIP\\' or compras > 100\")\\n addVar(descuento, 0.20)\\nend()\\naddResult(descuento)',\n", " 'title': 'expresion_compleja.avap'},\n", " {'content': 'getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\\naddResult(sql_date)',\n", " 'title': 'fecha_para_base_de_datos.avap'},\n", " {'content': 
'function suma(a, b){\\n total = a + b\\n return(total)\\n }\\nresultado = suma(10, 20)\\naddResult(resultado)',\n", " 'title': 'funcion_de_suma.avap'},\n", " {'content': 'function es_valido(token){\\n response = False\\n if(token, \"SECRET\", \"=\")\\n response = True\\n end()\\n return(response)\\n }\\nautorizado = es_valido(\"SECRET\")\\naddResult(autorizado)',\n", " 'title': 'funcion_validacion_acceso.avap'},\n", " {'content': 'randomString(\"[A-Z]\\\\d\", 32, token_seguridad)\\naddResult(token_seguridad)',\n", " 'title': 'generador_de_tokens_aleatorios.avap'},\n", " {'content': 'encodeSHA256(\"payload_data\", checksum)\\naddResult(checksum)',\n", " 'title': 'hash_SHA256_para_integridad.avap'},\n", " {'content': 'addVar(mensaje, \"Hola mundo desde AVAP\")\\naddResult(mensaje)',\n", " 'title': 'hola_mundo.avap'},\n", " {'content': 'addParam(password,pass_nueva)\\npass_antigua = \"password\"\\nif(pass_nueva, pass_antigua, \"!=\")\\n addVar(cambio, \"Contraseña actualizada\")\\nend()\\naddResult(cambio)',\n", " 'title': 'if_desigualdad.avap'},\n", " {'content': 'replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\\naddResult(ref_actualizada)',\n", " 'title': 'limpieza_de_strings.avap'},\n", " {'content': 'try()\\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\\nexception(e)\\n addVar(_status,500)\\n addResult(\"Error de base de datos\")',\n", " 'title': 'manejo_error_sql_critico.avap'},\n", " {'content': 'getDateTime(\"\", 0, \"UTC\", ahora)\\naddResult(ahora)',\n", " 'title': 'obtencion_timestamp.avap'},\n", " {'content': 'ormCheckTable(tabla_pruebas,resultado_comprobacion)\\nif(resultado_comprobacion,False,\\'==\\')\\n ormCreateTable(\"username,age\",\\'VARCHAR,INTEGER\\',tabla_pruebas,resultado_creacion)\\nend()\\naddResult(resultado_comprobacion)\\naddResult(resultado_creacion)',\n", " 'title': 'ormAccessCreate.avap'},\n", " {'content': 'addParam(\"page\", p)\\naddParam(\"size\", s)\\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", 
\"u6\"]\\noffset = int(p) * int(s)\\nlimite = offset + int(s)\\ncontador = 0\\naddResult(offset)\\naddResult(limite)\\nstartLoop(i, 2, limite)\\n actual = registros[int(i)]\\n titulo = \"reg_%s\" % i\\n AddvariableToJSON(titulo, actual, pagina_json)\\nendLoop()\\naddResult(pagina_json)',\n", " 'title': 'paginacion_dinamica_recursos.avap'},\n", " {'content': 'addVar(base, 1000)\\naddVar(copia, $base)\\naddResult(copia)',\n", " 'title': 'referencia_por_valor.avap'},\n", " {'content': 'addVar(code, 200)\\naddVar(status, \"Success\")\\naddResult(code)\\naddResult(status)',\n", " 'title': 'respuesta_multiple.avap'},\n", " {'content': 'encontrado = False\\nstartLoop(i, 1, 10)\\n if(i, 5, \"==\")\\n encontrado = True\\n i = 11 \\n end()\\nendLoop()\\naddResult(encontrado)',\n", " 'title': 'salida_bucle_correcta.avap'},\n", " {'content': 'try()\\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta)\\nexception(e)\\n addVar(error_trace, \"Fallo de conexión: %s\" % e)\\n addResult(error_trace)',\n", " 'title': 'try_catch_request.avap'},\n", " {'content': 'addParam(\"api_key\", key)\\nif(key, None, \"==\")\\n addVar(_status, 403)\\n addVar(error, \"Acceso denegado: falta API KEY\")\\n addResult(error)\\nend()',\n", " 'title': 'validacion_de_nulo.avap'},\n", " {'content': 'addParam(\"rol\", r)\\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\\n acceso = True\\nend()\\naddResult(acceso)',\n", " 'title': 'validacion_in_pertenece_a_lista.avap'}]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avap_examples_docs" ] }, { "cell_type": "code", "execution_count": null, "id": "16a9e8ce", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "679e5f8c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "id": "27e5774d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 
13:58:34.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n" ] } ], "source": [
"logger.info(\"Chunking documents...\")\n",
"\n",
"# Build the semantic chunker; the embedding model name is read from the HF_EMB_MODEL_NAME environment variable.\n",
"hf_emb_model_name = os.getenv(\"HF_EMB_MODEL_NAME\")\n",
"chunker_options = {\"chunk_size\": 2048, \"threshold\": 0.5, \"skip_window\": 1}\n",
"chunker = SemanticChunker(embedding_model=hf_emb_model_name, **chunker_options)"
] }, { "cell_type": "code", "execution_count": 7, "id": "a5ce984e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 13:58:51.740\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n", "\u001b[32m2026-03-10 14:01:00.535\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n", "\u001b[32m2026-03-10 14:01:00.536\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n", "\u001b[32m2026-03-10 14:01:09.128\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter1_\u001b[0m\n", "\u001b[32m2026-03-10 14:01:12.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter2_\u001b[0m\n", "\u001b[32m2026-03-10 14:01:42.995\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter3_\u001b[0m\n", "\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter4_\u001b[0m\n", "\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter5_\u001b[0m\n", "\u001b[32m2026-03-10 14:01:48.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter6_\u001b[0m\n", "\u001b[32m2026-03-10 14:02:06.408\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter7_\u001b[0m\n", "\u001b[32m2026-03-10 14:02:21.501\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter8_\u001b[0m\n", "\u001b[32m2026-03-10 14:07:27.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter9_\u001b[0m\n", "\u001b[32m2026-03-10 14:07:48.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter10_\u001b[0m\n", "\u001b[32m2026-03-10 14:08:10.823\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter11_\u001b[0m\n", "\u001b[32m2026-03-10 14:08:27.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter12_\u001b[0m\n", "\u001b[32m2026-03-10 
14:08:55.010\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter13_\u001b[0m\n", "\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter14_\u001b[0m\n", "\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n" ] } ], "source": [
"logger.info(\"Chunking AVAP GitHub docs...\")\n",
"avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)  # get_chunk_docs: chunking helper imported in the first cell (NOTE(review): confirm it accepts (docs, chunker))\n",
"\n",
"logger.info(\"Chunking AVAP web docs chapters...\")\n",
"avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)\n",
"\n",
"logger.info(\"Creating Langchain Document to index...\")\n",
"avap_github_langchain_docs = chunks_to_document(avap_github_docs_chunks)\n",
"avap_web_chapters_langchain_docs = chunks_to_document(avap_web_docs_chapters_chunks)\n",
"# The intro and appendices are passed to chunks_to_document as read, without going through the chunker.\n",
"avap_web_intro_langchain_docs = chunks_to_document(avap_web_docs_intro)\n",
"avap_web_appendices_langchain_docs = chunks_to_document(avap_web_docs_appendices)"
] }, { "cell_type": "code", "execution_count": null, "id": "dd1f4d79", "metadata": {}, "outputs": [], "source": [ "avap_github_langchain_docs" ] }, { "cell_type": "code", "execution_count": null, "id": "c24e8a8f", "metadata": {}, "outputs": [], "source": [ "avap_web_chapters_langchain_docs" ] }, { "cell_type": "code", "execution_count": null, "id": "f6782a34", "metadata": {}, "outputs": [], "source": [ "avap_web_intro_langchain_docs" ] }, { "cell_type": "code", "execution_count": null, "id": "78c1190e", "metadata": {}, "outputs": [], "source": [ "avap_web_appendices_langchain_docs" ] } ], "metadata": { "kernelspec": { "display_name": "assistance-engine", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }