added config and count tokens
This commit is contained in:
parent
b01a76e71d
commit
71cb79985c
|
|
@ -0,0 +1,49 @@
|
|||
from pathlib import Path
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from dotenv import load_dotenv
|
||||
from datetime import timedelta
|
||||
import warnings
|
||||
|
||||
# Populate os.environ from a .env file (if python-dotenv finds one) before
# the settings class below is defined and instantiated.
load_dotenv()
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Project data-directory locations, loaded by pydantic-settings.

    Each ``*_path_`` field is a string with no default, so a value must be
    supplied. Per the configuration below, values are looked up using the
    ``mrh_avap_`` prefix without case sensitivity, from the environment or
    from a UTF-8 encoded ``.env`` file; settings that do not correspond to a
    declared field are ignored. The properties named without the trailing
    underscore return the same values wrapped in ``pathlib.Path``.
    """

    model_config = SettingsConfigDict(
        env_prefix="mrh_avap_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # Raw string values exactly as provided by the environment / .env file.
    raw_path_: str
    processed_path_: str
    models_path_: str
    interim_path_: str
    external_path_: str

    # -- Location derived from this module's position on disk --------------

    @property
    def proj_root(self) -> Path:
        """Return the directory one level above this file's own directory."""
        this_file = Path(__file__).resolve()
        return this_file.parents[1]

    # -- Path views over the configured string fields ----------------------

    @property
    def raw_path(self) -> Path:
        """Return ``raw_path_`` as a ``Path``."""
        location = self.raw_path_
        return Path(location)

    @property
    def interim_path(self) -> Path:
        """Return ``interim_path_`` as a ``Path``."""
        location = self.interim_path_
        return Path(location)

    @property
    def processed_path(self) -> Path:
        """Return ``processed_path_`` as a ``Path``."""
        location = self.processed_path_
        return Path(location)

    @property
    def external_path(self) -> Path:
        """Return ``external_path_`` as a ``Path``."""
        location = self.external_path_
        return Path(location)

    @property
    def models_path(self) -> Path:
        """Return ``models_path_`` as a ``Path``."""
        location = self.models_path_
        return Path(location)
|
||||
|
||||
# Module-level instance, created once when this module is imported; a
# missing required field therefore surfaces at import time.
settings = Settings()
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed60d28c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "95cf533e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer\n",
|
||||
"from Docker.config import settings\n",
|
||||
"import os"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c9b7265a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Functions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6fd7de78",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_text_from_file(file_path: str) -> str:\n",
|
||||
" \"\"\"\n",
|
||||
" Load text content from a specified file.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" file_path: Path to the .txt file to load.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" The text content of the file.\n",
|
||||
"\n",
|
||||
" Raises:\n",
|
||||
" FileNotFoundError: If the file does not exist.\n",
|
||||
" IOError: If the file cannot be read.\n",
|
||||
" \"\"\"\n",
|
||||
" try:\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
|
||||
" return file.read()\n",
|
||||
" except FileNotFoundError:\n",
|
||||
" raise FileNotFoundError(f\"El archivo '{file_path}' no existe.\")\n",
|
||||
" except IOError as error:\n",
|
||||
" raise IOError(f\"Error al leer '{file_path}': {error}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "22bcc0fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "19c815e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = os.getenv(\"HF_EMB_MODEL_NAME\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "4ff2484d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Número de tokens: 1073\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
||||
"\n",
|
||||
"text = load_text_from_file(settings.raw_path / '1_Introduction.txt')\n",
|
||||
"\n",
|
||||
"tokens = tokenizer.encode(text)\n",
|
||||
"\n",
|
||||
"print(\"Número de tokens:\", len(tokens))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "assistance-engine",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
Reference in New Issue