From 12eef38f3376d5c003b0711decf562e6c13f2b31 Mon Sep 17 00:00:00 2001 From: pseco Date: Wed, 25 Feb 2026 17:17:20 +0100 Subject: [PATCH] count tokens files --- Makefile | 6 +- .../pseco/ingestion/n00 Count tokens.ipynb | 216 ++++++++++++++++-- 2 files changed, 199 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 6303f8a..26c6c5b 100644 --- a/Makefile +++ b/Makefile @@ -40,4 +40,8 @@ sync_data_down: .PHONY: sync_data_up sync_data_up: aws s3 sync --exclude "*.gitkeep" data/ \ - s3://mrh-avap/data \ No newline at end of file + s3://mrh-avap/data + +.PHONY: ollama_local +ollama_local: + ssh -i ~/.ssh/mrh-transformers.pem -L 11434:localhost:11434 ubuntu@172.18.14.34 diff --git a/scratches/pseco/ingestion/n00 Count tokens.ipynb b/scratches/pseco/ingestion/n00 Count tokens.ipynb index e97f077..d5aa12f 100644 --- a/scratches/pseco/ingestion/n00 Count tokens.ipynb +++ b/scratches/pseco/ingestion/n00 Count tokens.ipynb @@ -10,13 +10,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "95cf533e", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "from Docker.config import settings\n", + "from pathlib import Path\n", + "from transformers import AutoConfig\n", "import os" ] }, @@ -30,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "6fd7de78", "metadata": {}, "outputs": [], @@ -58,6 +60,146 @@ " raise IOError(f\"Error al leer '{file_path}': {error}\")" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6389092e", + "metadata": {}, + "outputs": [], + "source": [ + "def infer_context_window(model_id: str, tokenizer_obj) -> int:\n", + " \"\"\"Infer context window from tokenizer/model config.\"\"\"\n", + " large_sentinel = int(1e9)\n", + "\n", + " tokenizer_limit = getattr(tokenizer_obj, \"model_max_length\", None)\n", + " if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < large_sentinel:\n", + " return tokenizer_limit\n", + "\n", + " config = AutoConfig.from_pretrained(model_id)\n", + "\n", + " for field_name in (\n", + " \"max_position_embeddings\",\n", + " \"n_positions\",\n", + " \"seq_length\",\n", + " \"model_max_length\",\n", + " ):\n", + " value = getattr(config, field_name, None)\n", + " if isinstance(value, int) and value > 0:\n", + " return value\n", + "\n", + " raise ValueError(\n", + " \"No se pudo inferir la ventana de contexto del modelo. \"\n", + " \"Define context_window manualmente.\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7190080b", + "metadata": {}, + "outputs": [], + "source": [ + "def run_token_count_test() -> dict[str, int | bool]:\n", + " \"\"\"Run token count test across all raw .txt files.\"\"\"\n", + " raw_dir = Path(settings.raw_path)\n", + " txt_files = sorted(raw_dir.glob(\"*.txt\"))\n", + "\n", + " if not txt_files:\n", + " print(f\"No se encontraron .txt en: {raw_dir}\")\n", + " return {\n", + " \"total_tokens_without_special\": 0,\n", + " \"total_tokens_with_special\": 0,\n", + " \"fits_without\": True,\n", + " \"fits_with\": True,\n", + " }\n", + "\n", + " total_tokens_without_special = 0\n", + " total_tokens_with_special = 0\n", + "\n", + " print(f\"Modelo: {model_name}\")\n", + " print(f\"Ventana de contexto detectada: {context_window}\")\n", + " print(f\"Archivos analizados: {len(txt_files)}\")\n", + " print(\"-\" * 80)\n", + "\n", + " for file_path in txt_files:\n", + " content = load_text_from_file(str(file_path))\n", + " token_ids_without_special = tokenizer.encode(\n", + " content, add_special_tokens=False\n", + " )\n", + " token_ids_with_special = tokenizer.encode(content)\n", + "\n", + " count_without_special = len(token_ids_without_special)\n", + " count_with_special = len(token_ids_with_special)\n", + "\n", + " total_tokens_without_special += count_without_special\n", + " total_tokens_with_special += count_with_special\n", + "\n", + " print(\n", + " f\"{file_path.name:<35} \"\n", + " f\"sin especiales: {count_without_special:>8} | \"\n", + " f\"con especiales: {count_with_special:>8}\"\n", + " )\n", + "\n", + " print(\"-\" * 80)\n", + " print(\n", + " f\"TOTAL sin especiales: {total_tokens_without_special} tokens\"\n", + " )\n", + " print(\n", + " f\"TOTAL con especiales: {total_tokens_with_special} tokens\"\n", + " )\n", + "\n", + " fits_without = total_tokens_without_special <= context_window\n", + " fits_with = total_tokens_with_special <= context_window\n", + "\n", + " print(\n", + " f\"¿Cabe en ventana ({context_window}) sin especiales? \"\n", + " f\"{'Sí' if fits_without else 'No'}\"\n", + " )\n", + " print(\n", + " f\"¿Cabe en ventana ({context_window}) con especiales? \"\n", + " f\"{'Sí' if fits_with else 'No'}\"\n", + " )\n", + "\n", + " if not fits_with:\n", + " overflow = total_tokens_with_special - context_window\n", + " print(\n", + " f\"Exceso aproximado: {overflow} tokens\"\n", + " )\n", + "\n", + " return {\n", + " \"total_tokens_without_special\": total_tokens_without_special,\n", + " \"total_tokens_with_special\": total_tokens_with_special,\n", + " \"fits_without\": fits_without,\n", + " \"fits_with\": fits_with,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "04e0f72f", + "metadata": {}, + "source": [ + "# Model Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "19c815e4", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = os.getenv(\"HF_EMB_MODEL_NAME\")\n", + "if not model_name:\n", + " raise ValueError(\n", + " \"No se encontró HF_EMB_MODEL_NAME en variables de entorno.\"\n", + " )\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "context_window = infer_context_window(model_name, tokenizer)" + ] + }, { "cell_type": "markdown", "id": "22bcc0fe", @@ -68,36 +210,66 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "19c815e4", - "metadata": {}, - "outputs": [], - "source": [ - "model_name = os.getenv(\"HF_EMB_MODEL_NAME\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4ff2484d", + "execution_count": 6, + "id": "f6517705", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Número de tokens: 1073\n" + "Modelo: Qwen/Qwen3-Embedding-0.6B\n", + "Ventana de contexto detectada: 131072\n", + "Archivos analizados: 24\n", + "--------------------------------------------------------------------------------\n", + "10_Execution_model_in_avap.txt sin especiales: 10349 | con especiales: 10350\n", + "11_Conditional_statements.txt sin especiales: 524 | con especiales: 525\n", + "12_Loop_statement.txt sin especiales: 594 | con especiales: 595\n", + "13_Api_inbound_interface.txt sin especiales: 415 | con especiales: 416\n", + "14_Working_with_libraries.txt sin especiales: 873 | con especiales: 874\n", + "15_Function_declaration.txt sin especiales: 394 | con especiales: 395\n", + "16_Appendix.txt sin especiales: 9209 | con especiales: 9210\n", + "17_Architecture_memory_foundations.txt sin especiales: 1086 | con especiales: 1087\n", + "18_Input_output_management.txt sin especiales: 1104 | con especiales: 1105\n", + "19_Control_logic_decision_structures.txt sin especiales: 1166 | con especiales: 1167\n", + "1_Introduction.txt sin especiales: 1072 | con especiales: 1073\n", + "20_Concurrency_asynchrony.txt sin especiales: 1049 | con especiales: 1050\n", + "21_Persistance_connectors_orm.txt sin especiales: 1135 | con especiales: 1136\n", + "22_System_utilities_transformation.txt sin especiales: 882 | con especiales: 883\n", + "23_Function_architecture_scopes.txt sin especiales: 604 | con especiales: 605\n", + "24_Master_example.txt sin especiales: 241 | con especiales: 242\n", + "2_Dynamic_Programming_Language.txt sin especiales: 707 | con especiales: 708\n", + "3_Notation.txt sin especiales: 1368 | con especiales: 1369\n", + "4_Lexics.txt sin especiales: 750 | con especiales: 751\n", + "5_Data_Model.txt sin especiales: 605 | con especiales: 606\n", + "6_Data_Types.txt sin especiales: 611 | con especiales: 612\n", + "7_Working_With_Variables.txt sin especiales: 601 | con especiales: 602\n", + "8_How_to_work_with_comments.txt sin especiales: 726 | con especiales: 727\n", + "9_Expressions_in_avap.txt sin especiales: 646 | con especiales: 647\n", + "--------------------------------------------------------------------------------\n", + "TOTAL sin especiales: 36711 tokens\n", + "TOTAL con especiales: 36735 tokens\n", + "¿Cabe en ventana (131072) sin especiales? Sí\n", + "¿Cabe en ventana (131072) con especiales? Sí\n" ] + }, + { + "data": { + "text/plain": [ + "{'total_tokens_without_special': 36711,\n", + " 'total_tokens_with_special': 36735,\n", + " 'fits_without': True,\n", + " 'fits_with': True}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "\n", - "text = load_text_from_file(settings.raw_path / '1_Introduction.txt')\n", - "\n", - "tokens = tokenizer.encode(text)\n", - "\n", - "print(\"Número de tokens:\", len(tokens))" + "test_result = run_token_count_test()\n", + "test_result" ] } ],