{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "f8ea7a75", "metadata": {}, "outputs": [], "source": [
"import re\n",
"import hashlib\n",
"from typing import Any\n",
"from enum import Enum\n",
"import typer\n",
"import logging\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"from loguru import logger\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from chonkie import SemanticChunker\n",
"\n",
"from src.utils.emb_factory import create_embedding_model\n",
"from scripts.pipelines.tasks.chunk import read_files, get_chunk_docs, convert_chunks_to_document\n",
"from src.config import PROJ_ROOT\n",
"\n",
"ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
"ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
"OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n",
"OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
"AVAP_WEB_DOCS_URL = os.getenv(\"AVAP_WEB_DOCS_URL\")" ] },
{ "cell_type": "code", "execution_count": 2, "id": "a8b8de3f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 15:15:53.994\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n", "\u001b[32m2026-03-10 15:15:53.996\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1\n", "1\n", "0\n", "24\n" ] } ], "source": [
"docs_folder_path = \"docs\"\n",
"\n",
"logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
"logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
"\n",
"# Anchor every path to PROJ_ROOT so the cell does not depend on the kernel's working directory.\n",
"web_docs_dir = Path(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\")\n",
"\n",
"avap_github_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
"avap_web_docs_intro = read_files(web_docs_dir, \"intro\", concatenate=True)\n",
"\n",
"# Chapter prefixes are the part of each chapter*.md filename before the first underscore;\n",
"# the files of each chapter are then read with concatenate=True.\n",
"chapters = sorted({p.name.split(\"_\")[0] for p in web_docs_dir.glob(\"chapter*.md\")})\n",
"if not chapters:\n",
"    # Surface an empty match instead of silently producing an empty chapter list.\n",
"    logger.warning(f\"No chapter*.md files found in {web_docs_dir}\")\n",
"\n",
"avap_web_docs_chapters = [\n",
"    item\n",
"    for chapter in chapters\n",
"    for item in read_files(web_docs_dir, f\"{chapter}_\", concatenate=True)\n",
"]\n",
"avap_web_docs_appendices = read_files(web_docs_dir, \"appendices_\", concatenate=False)\n",
"avap_examples_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
"\n",
"print(len(avap_github_docs))\n",
"print(len(avap_web_docs_intro))\n",
"print(len(avap_web_docs_chapters))\n",
"print(len(avap_web_docs_appendices))" ] }, { "cell_type": "code", "execution_count": 3, "id": "36abc025", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'content': 'nivel = 5\\nes_admin = nivel >= 10\\naddResult(es_admin)',\n", " 'title': 'asignacion_booleana.avap'},\n", " {'content': 'subtotal = 150.50\\niva = subtotal * 0.21\\ntotal = subtotal + iva\\naddResult(total)',\n", " 'title': 'asignacion_matematica.avap'},\n", " {'content': 'startLoop(i,1,10)\\n item = \"item_%s\" % i\\n AddvariableToJSON(item,\\'valor_generado\\',mi_json)\\nendLoop()\\naddResult(mi_json)',\n", " 'title': 'bucle_1_10.avap'},\n", " {'content': \"registros = ['1','2','3']\\ngetListLen(registros, total)\\ncontador = 0\\nstartLoop(idx, 0, 2)\\n actual = 
registros[int(idx)]\\nendLoop()\\naddResult(actual)\",\n", " 'title': 'bucle_longitud_de_datos.avap'},\n", " {'content': 'getDateTime(\"\", 86400, \"UTC\", expira)\\naddResult(expira)',\n", " 'title': 'calculo_de_expiracion.avap'},\n", " {'content': 'addParam(\"client_id\", id_interno)\\naddResult(id_interno)',\n", " 'title': 'captura_de_id.avap'},\n", " {'content': 'addParam(emails,emails)\\ngetQueryParamList(lista_correos)\\naddResult(lista_correos)',\n", " 'title': 'captura_de_listas_multiples.avap'},\n", " {'content': 'addParam(\"lang\", l)\\nif(l, \"es\", \"=\")\\n addVar(msg, \"Hola\")\\nend()\\naddResult(msg)',\n", " 'title': 'comparacion_simple.avap'},\n", " {'content': 'nombre = \"Sistema\"\\nlog = \"Evento registrado por: %s\" % nombre\\naddResult(log)',\n", " 'title': 'concatenacion_dinamica.avap'},\n", " {'content': 'datos_cliente = \"datos\"\\naddVar(clave, \"cliente_vip\")\\nAddvariableToJSON(clave, datos_cliente, mi_json_final)\\naddResult(mi_json_final)',\n", " 'title': 'construccion_dinamica_de_objeto.avap'},\n", " {'content': 'addParam(\"data_list\", mi_lista)\\ngetListLen(mi_lista, cantidad)\\naddResult(cantidad)',\n", " 'title': 'contador_de_parametros.avap'},\n", " {'content': 'stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\\naddResult(fecha_human)',\n", " 'title': 'conversion_timestamp_legible.avap'},\n", " {'content': 'addParam(sal_par,saldo)\\nif(saldo, 0, \">\")\\n permitir = True\\nelse()\\n permitir = False\\nend()\\naddResult(permitir)',\n", " 'title': 'else_estandar.avap'},\n", " {'content': 'addParam(userrype, user_type)\\naddParam(sells, compras)\\nif(None, None, \" user_type == \\'VIP\\' or compras > 100\")\\n addVar(descuento, 0.20)\\nend()\\naddResult(descuento)',\n", " 'title': 'expresion_compleja.avap'},\n", " {'content': 'getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\\naddResult(sql_date)',\n", " 'title': 'fecha_para_base_de_datos.avap'},\n", " {'content': 'function suma(a, b){\\n total = a + b\\n 
return(total)\\n }\\nresultado = suma(10, 20)\\naddResult(resultado)',\n", " 'title': 'funcion_de_suma.avap'},\n", " {'content': 'function es_valido(token){\\n response = False\\n if(token, \"SECRET\", \"=\")\\n response = True\\n end()\\n return(response)\\n }\\nautorizado = es_valido(\"SECRET\")\\naddResult(autorizado)',\n", " 'title': 'funcion_validacion_acceso.avap'},\n", " {'content': 'randomString(\"[A-Z]\\\\d\", 32, token_seguridad)\\naddResult(token_seguridad)',\n", " 'title': 'generador_de_tokens_aleatorios.avap'},\n", " {'content': 'encodeSHA256(\"payload_data\", checksum)\\naddResult(checksum)',\n", " 'title': 'hash_SHA256_para_integridad.avap'},\n", " {'content': 'addVar(mensaje, \"Hola mundo desde AVAP\")\\naddResult(mensaje)',\n", " 'title': 'hola_mundo.avap'},\n", " {'content': 'addParam(password,pass_nueva)\\npass_antigua = \"password\"\\nif(pass_nueva, pass_antigua, \"!=\")\\n addVar(cambio, \"Contraseña actualizada\")\\nend()\\naddResult(cambio)',\n", " 'title': 'if_desigualdad.avap'},\n", " {'content': 'replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\\naddResult(ref_actualizada)',\n", " 'title': 'limpieza_de_strings.avap'},\n", " {'content': 'try()\\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\\nexception(e)\\n addVar(_status,500)\\n addResult(\"Error de base de datos\")',\n", " 'title': 'manejo_error_sql_critico.avap'},\n", " {'content': 'getDateTime(\"\", 0, \"UTC\", ahora)\\naddResult(ahora)',\n", " 'title': 'obtencion_timestamp.avap'},\n", " {'content': 'ormCheckTable(tabla_pruebas,resultado_comprobacion)\\nif(resultado_comprobacion,False,\\'==\\')\\n ormCreateTable(\"username,age\",\\'VARCHAR,INTEGER\\',tabla_pruebas,resultado_creacion)\\nend()\\naddResult(resultado_comprobacion)\\naddResult(resultado_creacion)',\n", " 'title': 'ormAccessCreate.avap'},\n", " {'content': 'addParam(\"page\", p)\\naddParam(\"size\", s)\\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\\noffset = int(p) * int(s)\\nlimite = 
offset + int(s)\\ncontador = 0\\naddResult(offset)\\naddResult(limite)\\nstartLoop(i, 2, limite)\\n actual = registros[int(i)]\\n titulo = \"reg_%s\" % i\\n AddvariableToJSON(titulo, actual, pagina_json)\\nendLoop()\\naddResult(pagina_json)',\n", " 'title': 'paginacion_dinamica_recursos.avap'},\n", " {'content': 'addVar(base, 1000)\\naddVar(copia, $base)\\naddResult(copia)',\n", " 'title': 'referencia_por_valor.avap'},\n", " {'content': 'addVar(code, 200)\\naddVar(status, \"Success\")\\naddResult(code)\\naddResult(status)',\n", " 'title': 'respuesta_multiple.avap'},\n", " {'content': 'encontrado = False\\nstartLoop(i, 1, 10)\\n if(i, 5, \"==\")\\n encontrado = True\\n i = 11 \\n end()\\nendLoop()\\naddResult(encontrado)',\n", " 'title': 'salida_bucle_correcta.avap'},\n", " {'content': 'try()\\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta)\\nexception(e)\\n addVar(error_trace, \"Fallo de conexión: %s\" % e)\\n addResult(error_trace)',\n", " 'title': 'try_catch_request.avap'},\n", " {'content': 'addParam(\"api_key\", key)\\nif(key, None, \"==\")\\n addVar(_status, 403)\\n addVar(error, \"Acceso denegado: falta API KEY\")\\n addResult(error)\\nend()',\n", " 'title': 'validacion_de_nulo.avap'},\n", " {'content': 'addParam(\"rol\", r)\\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\\n acceso = True\\nend()\\naddResult(acceso)',\n", " 'title': 'validacion_in_pertenece_a_lista.avap'}]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avap_examples_docs" ] }, { "cell_type": "code", "execution_count": null, "id": "16a9e8ce", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "679e5f8c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "id": "27e5774d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 15:15:54.053\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n" ] } ], "source": [
"logger.info(\"Chunking documents...\")\n",
"chunker = SemanticChunker(\n",
"    embedding_model=os.getenv(\"HF_EMB_MODEL_NAME\"),\n",
"    chunk_size=2048,\n",
"    threshold=0.5,\n",
"    skip_window=1\n",
")" ] },
{ "cell_type": "code", "execution_count": 5, "id": "a5ce984e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2026-03-10 15:16:04.305\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n", "\u001b[32m2026-03-10 15:20:42.896\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mget_chunk_docs\u001b[0m:\u001b[36m102\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n", "\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n", "\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n" ] } ], "source": [
"logger.info(\"Chunking AVAP GitHub docs...\")\n",
"avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)\n",
"\n",
"logger.info(\"Chunking AVAP web docs chapters...\")\n",
"avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)\n",
"\n",
"logger.info(\"Creating Langchain Document to index...\")\n",
"avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)\n",
"avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)  # displayed by a later cell\n",
"avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)\n",
"avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)" ] },
{ "cell_type": "code", "execution_count": null, "id": "dd1f4d79", "metadata": {}, "outputs": [], "source": [ "avap_github_langchain_docs" ] },
{ "cell_type": "code", "execution_count": null, "id": "c24e8a8f", "metadata": {}, "outputs": [], "source": [ "avap_web_chapters_langchain_docs" ] },
{ "cell_type": "code", "execution_count": null, "id": "f6782a34", "metadata": {}, "outputs": [], "source": [ "avap_web_intro_langchain_docs" ] },
{ "cell_type": "code", "execution_count": null, "id": "78c1190e", "metadata": {}, "outputs": [], "source": [ "avap_web_appendices_langchain_docs" ] } ],
"metadata": { "kernelspec": { "display_name": "assistance-engine", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }