{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed60d28c",
   "metadata": {},
   "source": [
    "# Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "95cf533e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "from Docker.config import settings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9b7265a",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6fd7de78",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_text_from_file(file_path: str | os.PathLike[str]) -> str:\n",
    "    \"\"\"\n",
    "    Load text content from a specified file.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path to the .txt file to load; a string or any\n",
    "            path-like object such as ``pathlib.Path``.\n",
    "\n",
    "    Returns:\n",
    "        The text content of the file, decoded as UTF-8.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: If the file does not exist.\n",
    "        IOError: If the file cannot be read.\n",
    "        UnicodeDecodeError: If the file content is not valid UTF-8.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
    "            return file.read()\n",
    "    except FileNotFoundError as error:\n",
    "        # Chain explicitly so the underlying OS error stays attached as __cause__.\n",
    "        raise FileNotFoundError(f\"El archivo '{file_path}' no existe.\") from error\n",
    "    except IOError as error:\n",
    "        # IOError is an alias of OSError; FileNotFoundError is handled above.\n",
    "        raise IOError(f\"Error al leer '{file_path}': {error}\") from error"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22bcc0fe",
   "metadata": {},
   "source": [
    "# Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "19c815e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail fast with a KeyError naming the variable instead of handing None to the tokenizer loader.\n",
    "model_name = os.environ[\"HF_EMB_MODEL_NAME\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4ff2484d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Número de tokens: 1073\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "text = load_text_from_file(settings.raw_path / '1_Introduction.txt')\n",
    "\n",
    "tokens = tokenizer.encode(text)\n",
    "\n",
    "print(\"Número de tokens:\", len(tokens))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}