import warnings
from datetime import timedelta
from pathlib import Path

from dotenv import load_dotenv
from pydantic_settings import BaseSettings, SettingsConfigDict

# Read a local .env file (if any) before the settings object is built below.
load_dotenv()


class Settings(BaseSettings):
    """Locations of the project's data directories, read from the environment.

    Every ``*_path_`` field is a required string (no default is declared).
    Values are looked up using the ``mrh_avap_`` prefix, case-insensitively,
    from the process environment or from a UTF-8 ``.env`` file; unknown
    entries are ignored.  The matching property without the trailing
    underscore exposes each value as a ``pathlib.Path``.
    """

    model_config = SettingsConfigDict(
        env_prefix="mrh_avap_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # Raw string values as supplied by the environment.
    raw_path_: str
    processed_path_: str
    models_path_: str
    interim_path_: str
    external_path_: str

    @property
    def proj_root(self) -> Path:
        """Return the directory two levels above this file's resolved path."""
        this_file = Path(__file__).resolve()
        return this_file.parents[1]

    @property
    def raw_path(self) -> Path:
        """Return ``raw_path_`` as a ``Path``."""
        return Path(self.raw_path_)

    @property
    def processed_path(self) -> Path:
        """Return ``processed_path_`` as a ``Path``."""
        return Path(self.processed_path_)

    @property
    def models_path(self) -> Path:
        """Return ``models_path_`` as a ``Path``."""
        return Path(self.models_path_)

    @property
    def interim_path(self) -> Path:
        """Return ``interim_path_`` as a ``Path``."""
        return Path(self.interim_path_)

    @property
    def external_path(self) -> Path:
        """Return ``external_path_`` as a ``Path``."""
        return Path(self.external_path_)


# Module-level instance shared by importers (``from Docker.config import settings``).
settings = Settings()
def load_text_from_file(file_path: "str | os.PathLike[str]") -> str:
    """Read a UTF-8 encoded text file and return its whole content.

    Args:
        file_path: Location of the text file. A plain string or any
            path-like object (for example ``pathlib.Path``) is accepted.

    Returns:
        The decoded content of the file.

    Raises:
        FileNotFoundError: If no file exists at ``file_path``.
        IOError: If the file exists but cannot be opened or read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError as error:
        # Chain explicitly so the original OS error (errno, filename) stays
        # attached as ``__cause__`` instead of being only implicit context.
        raise FileNotFoundError(f"El archivo '{file_path}' no existe.") from error
    except IOError as error:
        raise IOError(f"Error al leer '{file_path}': {error}") from error
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}