assistance-engine/research/embeddings/notebooks/n00 first Analysis.ipynb

290 lines
10 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "096e6224",
"metadata": {},
"source": [
"# Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4b0853e9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2931729/1845255288.py:4: DeprecationWarning: Importing SemanticSimilarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import SemanticSimilarity\n",
" from ragas.metrics import SemanticSimilarity\n"
]
}
],
"source": [
"# Imports for dataset loading, Ragas evaluation, and the Ollama embeddings client.\n",
"from datasets import load_dataset\n",
"from ragas import EvaluationDataset, evaluate\n",
"from ragas.metrics import SemanticSimilarity\n",
"from langchain_community.embeddings import OllamaEmbeddings\n",
"import asyncio\n",
"from typing import Sequence\n",
"from ragas.embeddings.base import BaseRagasEmbedding\n",
"import os\n",
"from transformers import AutoConfig\n",
"import nltk\n",
"# NOTE: importing SemanticSimilarity from `ragas.metrics` is deprecated; Ragas recommends `ragas.metrics.collections`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6bfe1ca0",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the NLTK \"punkt\" resource; quiet=True suppresses the download log.\n",
"nltk.download(\"punkt\", quiet=True)\n",
"\n",
"# Runtime configuration is read from environment variables; os.getenv returns\n",
"# None for any variable that is not set.\n",
"ES_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"ES_INDEX_NAME = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
"HF_EMBEDDING_MODEL_NAME = os.getenv(\"HF_EMBEDDING_MODEL_NAME\")\n",
"BASE_URL = os.getenv(\"LLM_BASE_LOCAL_URL\")\n",
"MODEL_NAME = os.getenv(\"OLLAMA_MODEL_NAME\")\n",
"\n",
"# Fail fast with a clear message instead of handing None to\n",
"# AutoConfig.from_pretrained, which would surface as an opaque library error.\n",
"if not HF_EMBEDDING_MODEL_NAME:\n",
"    raise ValueError(\n",
"        \"Missing environment variable: HF_EMBEDDING_MODEL_NAME.\"\n",
"    )\n",
"\n",
"# The hidden size reported by the model config is used here as the embedding\n",
"# dimensionality.\n",
"config = AutoConfig.from_pretrained(HF_EMBEDDING_MODEL_NAME)\n",
"embedding_dim = config.hidden_size"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ea41ce0f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_2931729/256987240.py:1: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
" embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n"
]
}
],
"source": [
"# LangChain Ollama embeddings client bound to the configured endpoint and model.\n",
"# NOTE(review): `embeddings` is rebound to an OllamaRagasEmbeddingAdapter in a later cell.\n",
"embeddings = OllamaEmbeddings(\n",
"    model=MODEL_NAME,\n",
"    base_url=BASE_URL,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "8eee9390",
"metadata": {},
"source": [
"# Similitud Aleatoria"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7b150e5",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from ragas import EvaluationDataset\n",
"\n",
"\n",
"def _normalize_answer(answer_value: object) -> str:\n",
"    \"\"\"Collapse a raw dataset answer into a single plain string.\n",
"\n",
"    Supported shapes:\n",
"\n",
"    * a dict with a non-None ``text`` entry: that entry is unwrapped first;\n",
"    * a list: its first element is used, or an empty string if the list is empty;\n",
"    * anything else: converted with ``str()``.\n",
"\n",
"    Args:\n",
"        answer_value: Raw value read from the answer column.\n",
"\n",
"    Returns:\n",
"        The normalized answer text. Missing values (``None``, or ``None`` as the\n",
"        first list element) yield an empty string rather than the literal text\n",
"        'None', so no placeholder text is embedded and scored as a real answer.\n",
"    \"\"\"\n",
"    if isinstance(answer_value, dict):\n",
"        text_value = answer_value.get(\"text\")\n",
"        # A dict without a usable \"text\" entry is kept whole and stringified below.\n",
"        if text_value is not None:\n",
"            answer_value = text_value\n",
"\n",
"    if isinstance(answer_value, list):\n",
"        answer_value = answer_value[0] if answer_value else None\n",
"\n",
"    return \"\" if answer_value is None else str(answer_value)\n",
"\n",
"\n",
"def _first_existing_key(candidates: list[str], keys: set[str]) -> str | None:\n",
"    \"\"\"Pick the earliest entry of ``candidates`` that is a member of ``keys``.\n",
"\n",
"    Args:\n",
"        candidates: Names to try, in priority order.\n",
"        keys: Names that are actually available.\n",
"\n",
"    Returns:\n",
"        The first matching candidate, or ``None`` when none of them is in ``keys``.\n",
"    \"\"\"\n",
"    matches = (name for name in candidates if name in keys)\n",
"    return next(matches, None)\n",
"\n",
"\n",
"# Load the dataset and work on a capped sample of its train split\n",
"# (at most 100 rows, fewer if the split is smaller).\n",
"ds = load_dataset(\"sentence-transformers/natural-questions\")\n",
"max_questions = min(100, len(ds[\"train\"]))\n",
"train_ds = ds[\"train\"].select(range(max_questions))\n",
"\n",
"# Resolve which columns hold the question-like and answer-like fields.\n",
"available_keys = set(train_ds.column_names)\n",
"reference_key = _first_existing_key(\n",
"    [\"question\", \"query\", \"text\", \"input\"],\n",
"    available_keys,\n",
")\n",
"response_key = _first_existing_key(\n",
"    [\"answer\", \"answers\", \"response\", \"output\"],\n",
"    available_keys,\n",
")\n",
"\n",
"if None in (reference_key, response_key):\n",
"    raise KeyError(\n",
"        \"Expected question/answer-like columns not found. \"\n",
"        f\"Available columns: {train_ds.column_names}\"\n",
"    )\n",
"\n",
"# The question-like column fills the `reference` field and the normalized\n",
"# answer fills the `response` field of each evaluation row.\n",
"rows = [\n",
"    {\n",
"        \"reference\": str(row[reference_key]),\n",
"        \"response\": _normalize_answer(row[response_key]),\n",
"    }\n",
"    for row in train_ds\n",
"]\n",
"\n",
"eval_ds = EvaluationDataset.from_list(rows)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "753aab30",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['query', 'answer'],\n",
" num_rows: 100231\n",
" })\n",
"})\n",
"['query', 'answer']\n",
"{'query': 'when did richmond last play in a preliminary final', 'answer': \"Richmond Football Club Richmond began 2017 with 5 straight wins, a feat it had not achieved since 1995. A series of close losses hampered the Tigers throughout the middle of the season, including a 5-point loss to the Western Bulldogs, 2-point loss to Fremantle, and a 3-point loss to the Giants. Richmond ended the season strongly with convincing victories over Fremantle and St Kilda in the final two rounds, elevating the club to 3rd on the ladder. Richmond's first final of the season against the Cats at the MCG attracted a record qualifying final crowd of 95,028; the Tigers won by 51 points. Having advanced to the first preliminary finals for the first time since 2001, Richmond defeated Greater Western Sydney by 36 points in front of a crowd of 94,258 to progress to the Grand Final against Adelaide, their first Grand Final appearance since 1982. The attendance was 100,021, the largest crowd to a grand final since 1986. The Crows led at quarter time and led by as many as 13, but the Tigers took over the game as it progressed and scored seven straight goals at one point. They eventually would win by 48 points 16.12 (108) to Adelaide's 8.12 (60) to end their 37-year flag drought.[22] Dustin Martin also became the first player to win a Premiership medal, the Brownlow Medal and the Norm Smith Medal in the same season, while Damien Hardwick was named AFL Coaches Association Coach of the Year. Richmond's jump from 13th to premiers also marked the biggest jump from one AFL season to the next.\"}\n"
]
}
],
"source": [
"# Show the full DatasetDict, the train split's column names, and its first record.\n",
"print(ds, ds[\"train\"].column_names, ds[\"train\"][0], sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6c3d4235",
"metadata": {},
"outputs": [],
"source": [
"# Ragas-compatible wrapper around the LangChain Ollama embeddings client.\n",
"class OllamaRagasEmbeddingAdapter(BaseRagasEmbedding):\n",
"    \"\"\"Adapter from LangChain Ollama to the modern Ragas embeddings API.\n",
"\n",
"    Synchronous methods delegate to the wrapped ``OllamaEmbeddings`` client;\n",
"    asynchronous methods run their synchronous counterpart in a worker thread\n",
"    through ``asyncio.to_thread``.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, base_url: str, model_name: str) -> None:\n",
"        \"\"\"Create the wrapped client for the given endpoint and model name.\"\"\"\n",
"        self._client = OllamaEmbeddings(\n",
"            base_url=base_url,\n",
"            model=model_name,\n",
"        )\n",
"\n",
"    # --- synchronous API ---\n",
"\n",
"    def embed_text(self, text: str) -> list[float]:\n",
"        \"\"\"Embed one text via the client's ``embed_query``.\"\"\"\n",
"        vector = self._client.embed_query(text)\n",
"        return vector\n",
"\n",
"    def embed_query(self, text: str) -> list[float]:\n",
"        \"\"\"Embed one query; same as ``embed_text``.\"\"\"\n",
"        return self.embed_text(text)\n",
"\n",
"    def embed_documents(self, texts: Sequence[str]) -> list[list[float]]:\n",
"        \"\"\"Embed several texts; the sequence is copied into a list for the client.\"\"\"\n",
"        batch = list(texts)\n",
"        return self._client.embed_documents(batch)\n",
"\n",
"    # --- asynchronous API (thread-offloaded) ---\n",
"\n",
"    async def aembed_text(self, text: str) -> list[float]:\n",
"        \"\"\"Run ``embed_text`` in a worker thread and await its result.\"\"\"\n",
"        vector = await asyncio.to_thread(self.embed_text, text)\n",
"        return vector\n",
"\n",
"    async def aembed_query(self, text: str) -> list[float]:\n",
"        \"\"\"Async counterpart of ``embed_query``; awaits ``aembed_text``.\"\"\"\n",
"        return await self.aembed_text(text)\n",
"\n",
"    async def aembed_documents(\n",
"        self, texts: Sequence[str]\n",
"    ) -> list[list[float]]:\n",
"        \"\"\"Run ``embed_documents`` in a worker thread and await its result.\"\"\"\n",
"        vectors = await asyncio.to_thread(self.embed_documents, texts)\n",
"        return vectors\n",
"\n",
"\n",
"# Both the Ollama endpoint and the model name are required to build the adapter.\n",
"if not (BASE_URL and MODEL_NAME):\n",
"    raise ValueError(\n",
"        \"Faltan variables de entorno: LLM_BASE_LOCAL_URL u OLLAMA_MODEL_NAME.\"\n",
"    )\n",
"\n",
"embeddings = OllamaRagasEmbeddingAdapter(base_url=BASE_URL, model_name=MODEL_NAME)\n",
"\n",
"# Metric instance handed to the evaluation run.\n",
"semantic_sim = SemanticSimilarity()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54aacf01",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a4b6e91c71d4849922f36d45f3e9f7f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/100231 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Score each (reference, response) row of eval_ds with the semantic-similarity\n",
"# metric, using `embeddings` to produce the vectors.\n",
"# NOTE(review): the saved cell output shows a progress total of 100231, while\n",
"# eval_ds is built from at most 100 rows -- presumably stale; re-run to refresh.\n",
"result = evaluate(\n",
"    dataset=eval_ds,\n",
"    metrics=[semantic_sim],\n",
"    embeddings=embeddings,\n",
")\n",
"\n",
"print(result)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}