{ "cells": [ { "cell_type": "markdown", "id": "ed60d28c", "metadata": {}, "source": [ "# Libreries" ] }, { "cell_type": "code", "execution_count": 1, "id": "95cf533e", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "from Docker.config import settings\n", "from pathlib import Path\n", "from transformers import AutoConfig\n", "import os" ] }, { "cell_type": "markdown", "id": "c9b7265a", "metadata": {}, "source": [ "# Functions" ] }, { "cell_type": "code", "execution_count": 2, "id": "6fd7de78", "metadata": {}, "outputs": [], "source": [ "def load_text_from_file(file_path: str) -> str:\n", " \"\"\"\n", " Load text content from a specified file.\n", "\n", " Args:\n", " file_path: Path to the .txt file to load.\n", "\n", " Returns:\n", " The text content of the file.\n", "\n", " Raises:\n", " FileNotFoundError: If the file does not exist.\n", " IOError: If the file cannot be read.\n", " \"\"\"\n", " try:\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " return file.read()\n", " except FileNotFoundError:\n", " raise FileNotFoundError(f\"El archivo '{file_path}' no existe.\")\n", " except IOError as error:\n", " raise IOError(f\"Error al leer '{file_path}': {error}\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "6389092e", "metadata": {}, "outputs": [], "source": [ "def infer_context_window(model_id: str, tokenizer_obj) -> int:\n", " \"\"\"Infer context window from tokenizer/model config.\"\"\"\n", " large_sentinel = int(1e9)\n", "\n", " tokenizer_limit = getattr(tokenizer_obj, \"model_max_length\", None)\n", " if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < large_sentinel:\n", " return tokenizer_limit\n", "\n", " config = AutoConfig.from_pretrained(model_id)\n", "\n", " for field_name in (\n", " \"max_position_embeddings\",\n", " \"n_positions\",\n", " \"seq_length\",\n", " \"model_max_length\",\n", " ):\n", " value = getattr(config, field_name, None)\n", " if isinstance(value, int) and value > 0:\n", " return value\n", "\n", " raise ValueError(\n", " \"No se pudo inferir la ventana de contexto del modelo. \"\n", " \"Define context_window manualmente.\"\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "id": "7190080b", "metadata": {}, "outputs": [], "source": [ "def run_token_count_test() -> dict[str, int | bool]:\n", " \"\"\"Run token count test across all raw .txt files.\"\"\"\n", " raw_dir = Path(settings.raw_path)\n", " txt_files = sorted(raw_dir.glob(\"*.txt\"))\n", "\n", " if not txt_files:\n", " print(f\"No se encontraron .txt en: {raw_dir}\")\n", " return {\n", " \"total_tokens_without_special\": 0,\n", " \"total_tokens_with_special\": 0,\n", " \"fits_without\": True,\n", " \"fits_with\": True,\n", " }\n", "\n", " total_tokens_without_special = 0\n", " total_tokens_with_special = 0\n", "\n", " print(f\"Modelo: {model_name}\")\n", " print(f\"Ventana de contexto detectada: {context_window}\")\n", " print(f\"Archivos analizados: {len(txt_files)}\")\n", " print(\"-\" * 80)\n", "\n", " for file_path in txt_files:\n", " content = load_text_from_file(str(file_path))\n", " token_ids_without_special = tokenizer.encode(\n", " content, add_special_tokens=False\n", " )\n", " token_ids_with_special = tokenizer.encode(content)\n", "\n", " count_without_special = len(token_ids_without_special)\n", " count_with_special = len(token_ids_with_special)\n", "\n", " total_tokens_without_special += count_without_special\n", " total_tokens_with_special += count_with_special\n", "\n", " print(\n", " f\"{file_path.name:<35} \"\n", " f\"sin especiales: {count_without_special:>8} | \"\n", " f\"con especiales: {count_with_special:>8}\"\n", " )\n", "\n", " print(\"-\" * 80)\n", " print(\n", " f\"TOTAL sin especiales: {total_tokens_without_special} tokens\"\n", " )\n", " print(\n", " f\"TOTAL con especiales: {total_tokens_with_special} tokens\"\n", " )\n", "\n", " fits_without = total_tokens_without_special <= context_window\n", " fits_with = total_tokens_with_special <= context_window\n", "\n", " print(\n", " f\"¿Cabe en ventana ({context_window}) sin especiales? \"\n", " f\"{'Sí' if fits_without else 'No'}\"\n", " )\n", " print(\n", " f\"¿Cabe en ventana ({context_window}) con especiales? \"\n", " f\"{'Sí' if fits_with else 'No'}\"\n", " )\n", "\n", " if not fits_with:\n", " overflow = total_tokens_with_special - context_window\n", " print(\n", " f\"Exceso aproximado: {overflow} tokens\"\n", " )\n", "\n", " return {\n", " \"total_tokens_without_special\": total_tokens_without_special,\n", " \"total_tokens_with_special\": total_tokens_with_special,\n", " \"fits_without\": fits_without,\n", " \"fits_with\": fits_with,\n", " }" ] }, { "cell_type": "markdown", "id": "04e0f72f", "metadata": {}, "source": [ "# Model Data" ] }, { "cell_type": "code", "execution_count": 5, "id": "19c815e4", "metadata": {}, "outputs": [], "source": [ "model_name = os.getenv(\"HF_EMB_MODEL_NAME\")\n", "if not model_name:\n", " raise ValueError(\n", " \"No se encontró HF_EMB_MODEL_NAME en variables de entorno.\"\n", " )\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "context_window = infer_context_window(model_name, tokenizer)" ] }, { "cell_type": "markdown", "id": "22bcc0fe", "metadata": {}, "source": [ "# Test" ] }, { "cell_type": "code", "execution_count": 6, "id": "f6517705", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Modelo: Qwen/Qwen3-Embedding-0.6B\n", "Ventana de contexto detectada: 131072\n", "Archivos analizados: 24\n", "--------------------------------------------------------------------------------\n", "10_Execution_model_in_avap.txt sin especiales: 10349 | con especiales: 10350\n", "11_Conditional_statements.txt sin especiales: 524 | con especiales: 525\n", "12_Loop_statement.txt sin especiales: 594 | con especiales: 595\n", "13_Api_inbound_interface.txt sin especiales: 415 | con especiales: 416\n", "14_Working_with_libraries.txt sin especiales: 873 | con especiales: 874\n", "15_Function_declaration.txt sin especiales: 394 | con especiales: 395\n", "16_Appendix.txt sin especiales: 9209 | con especiales: 9210\n", "17_Architecture_memory_foundations.txt sin especiales: 1086 | con especiales: 1087\n", "18_Input_output_management.txt sin especiales: 1104 | con especiales: 1105\n", "19_Control_logic_decision_structures.txt sin especiales: 1166 | con especiales: 1167\n", "1_Introduction.txt sin especiales: 1072 | con especiales: 1073\n", "20_Concurrency_asynchrony.txt sin especiales: 1049 | con especiales: 1050\n", "21_Persistance_connectors_orm.txt sin especiales: 1135 | con especiales: 1136\n", "22_System_utilities_transformation.txt sin especiales: 882 | con especiales: 883\n", "23_Function_architecture_scopes.txt sin especiales: 604 | con especiales: 605\n", "24_Master_example.txt sin especiales: 241 | con especiales: 242\n", "2_Dynamic_Programming_Language.txt sin especiales: 707 | con especiales: 708\n", "3_Notation.txt sin especiales: 1368 | con especiales: 1369\n", "4_Lexics.txt sin especiales: 750 | con especiales: 751\n", "5_Data_Model.txt sin especiales: 605 | con especiales: 606\n", "6_Data_Types.txt sin especiales: 611 | con especiales: 612\n", "7_Working_With_Variables.txt sin especiales: 601 | con especiales: 602\n", "8_How_to_work_with_comments.txt sin especiales: 726 | con especiales: 727\n", "9_Expressions_in_avap.txt sin especiales: 646 | con especiales: 647\n", "--------------------------------------------------------------------------------\n", "TOTAL sin especiales: 36711 tokens\n", "TOTAL con especiales: 36735 tokens\n", "¿Cabe en ventana (131072) sin especiales? Sí\n", "¿Cabe en ventana (131072) con especiales? Sí\n" ] }, { "data": { "text/plain": [ "{'total_tokens_without_special': 36711,\n", " 'total_tokens_with_special': 36735,\n", " 'fits_without': True,\n", " 'fits_with': True}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_result = run_token_count_test()\n", "test_result" ] } ], "metadata": { "kernelspec": { "display_name": "assistance-engine", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }