assistance-engine/research/embeddings/notebooks/n00 Beir Analysis_cosqa.ipynb

333 lines
9.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "66cbbaf8",
"metadata": {},
"source": [
"# Libraries"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c01c19dc",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from typing import Dict, List, Union\n",
"import numpy as np\n",
"from datasets import load_dataset\n",
"from langchain_ollama import OllamaEmbeddings\n",
"from beir.datasets.data_loader import GenericDataLoader\n",
"from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
"from beir.retrieval.evaluation import EvaluateRetrieval\n",
"from beir import util"
]
},
{
"cell_type": "markdown",
"id": "ac011c1c",
"metadata": {},
"source": [
"# Utils"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b83e7900",
"metadata": {},
"outputs": [],
"source": [
"class BEIROllamaEmbeddings:\n",
"    \"\"\"\n",
"    Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.\n",
"\n",
"    It exposes ``encode_queries`` and ``encode_corpus`` and returns the\n",
"    vectors produced by ``OllamaEmbeddings.embed_documents`` as a float32\n",
"    NumPy array.\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self,\n",
"        base_url: str,\n",
"        model: str,\n",
"        batch_size: int = 64,\n",
"    ) -> None:\n",
"        \"\"\"\n",
"        Args:\n",
"            base_url: URL of the Ollama server, forwarded to OllamaEmbeddings.\n",
"            model: Ollama model tag, forwarded to OllamaEmbeddings.\n",
"            batch_size: Number of texts sent per ``embed_documents`` call.\n",
"\n",
"        Raises:\n",
"            ValueError: If ``batch_size`` is smaller than 1.\n",
"        \"\"\"\n",
"        # Fail early: a step of 0 makes range() raise an opaque error and a\n",
"        # negative step would silently embed nothing.\n",
"        if batch_size < 1:\n",
"            raise ValueError(f\"batch_size must be >= 1, got {batch_size}\")\n",
"\n",
"        self.batch_size = batch_size\n",
"        self.embeddings = OllamaEmbeddings(\n",
"            base_url=base_url,\n",
"            model=model,\n",
"        )\n",
"\n",
"    def _batch_embed(self, texts: List[str]) -> np.ndarray:\n",
"        \"\"\"\n",
"        Embed ``texts`` in slices of ``self.batch_size`` and stack the results.\n",
"        \"\"\"\n",
"        vectors = []\n",
"\n",
"        for start in range(0, len(texts), self.batch_size):\n",
"            batch = texts[start : start + self.batch_size]\n",
"            vectors.extend(self.embeddings.embed_documents(batch))\n",
"\n",
"        return np.asarray(vectors, dtype=np.float32)\n",
"\n",
"    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:\n",
"        \"\"\"\n",
"        BEIR query encoder.\n",
"\n",
"        Extra keyword arguments are accepted for call-signature compatibility\n",
"        and ignored.\n",
"        \"\"\"\n",
"        return self._batch_embed(queries)\n",
"\n",
"    def encode_corpus(\n",
"        self,\n",
"        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],\n",
"        **kwargs,\n",
"    ) -> np.ndarray:\n",
"        \"\"\"\n",
"        BEIR corpus encoder.\n",
"\n",
"        Accepts either a list of documents or a mapping whose values are the\n",
"        documents. Each document is embedded as ``title + newline + text``\n",
"        when it has a non-empty title, otherwise as its text alone. Extra\n",
"        keyword arguments are accepted and ignored.\n",
"        \"\"\"\n",
"        docs = list(corpus.values()) if isinstance(corpus, dict) else corpus\n",
"\n",
"        texts = []\n",
"        for doc in docs:\n",
"            # Missing or None fields are treated as empty strings.\n",
"            title = (doc.get(\"title\") or \"\").strip()\n",
"            text = (doc.get(\"text\") or \"\").strip()\n",
"            texts.append(f\"{title}\\n{text}\" if title else text)\n",
"\n",
"        return self._batch_embed(texts)"
]
},
{
"cell_type": "markdown",
"id": "c9528fb6",
"metadata": {},
"source": [
"# Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "230aae25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Descargando datos de Hugging Face...\n",
"Cargando con BEIR GenericDataLoader...\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0e67479e959248f598db3415efbb13ae",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/20604 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_name = \"cosqa\"\n",
"# The data root defaults to the original workstation path; set\n",
"# ASSISTANCE_ENGINE_DATA_DIR to run the notebook from another checkout.\n",
"data_root = os.environ.get(\n",
"    \"ASSISTANCE_ENGINE_DATA_DIR\",\n",
"    \"/home/acano/PycharmProjects/assistance-engine/data\",\n",
")\n",
"data_path = f\"{data_root}/external/{dataset_name}\"\n",
"\n",
"# Creating the qrels sub-folder also creates data_path itself.\n",
"os.makedirs(f\"{data_path}/qrels\", exist_ok=True)\n",
"\n",
"# 1. Load from Hugging Face using the matching configuration names\n",
"print(\"Descargando datos de Hugging Face...\")\n",
"hf_corpus = load_dataset(\"CoIR-Retrieval/cosqa\", \"corpus\", split=\"corpus\")\n",
"hf_queries = load_dataset(\"CoIR-Retrieval/cosqa\", \"queries\", split=\"queries\")\n",
"# The qrels live in the 'default' config\n",
"hf_qrels = load_dataset(\"CoIR-Retrieval/cosqa\", \"default\", split=\"test\")\n",
"\n",
"# 2. Save the corpus as JSON Lines (one document per line, empty title)\n",
"# Explicit UTF-8 so the files do not depend on the platform default encoding.\n",
"with open(f\"{data_path}/corpus.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
"    for item in hf_corpus:\n",
"        f.write(json.dumps({\"_id\": str(item[\"_id\"]), \"text\": item[\"text\"], \"title\": \"\"}) + \"\\n\")\n",
"\n",
"# 3. Save the queries as JSON Lines\n",
"with open(f\"{data_path}/queries.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
"    for item in hf_queries:\n",
"        f.write(json.dumps({\"_id\": str(item[\"_id\"]), \"text\": item[\"text\"]}) + \"\\n\")\n",
"\n",
"# 4. Save the qrels (TSV with a header row, as read by BEIR)\n",
"with open(f\"{data_path}/qrels/test.tsv\", \"w\", encoding=\"utf-8\") as f:\n",
"    f.write(\"query-id\\tcorpus-id\\tscore\\n\")\n",
"    for item in hf_qrels:\n",
"        # In the 'default' config the fields are usually named 'query-id' and 'corpus-id'\n",
"        f.write(f\"{item['query-id']}\\t{item['corpus-id']}\\t{item['score']}\\n\")\n",
"\n",
"print(\"Cargando con BEIR GenericDataLoader...\")\n",
"corpus, queries, qrels = GenericDataLoader(data_path).load(split=\"test\")"
]
},
{
"cell_type": "markdown",
"id": "13050d31",
"metadata": {},
"source": [
"# Test qwen3-0.6B-emb:latest"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "514540af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NDCG: {'NDCG@1': 0.174, 'NDCG@3': 0.27374, 'NDCG@5': 0.33509, 'NDCG@10': 0.39086, 'NDCG@100': 0.45099}\n",
"MAP: {'MAP@1': 0.174, 'MAP@3': 0.247, 'MAP@5': 0.2808, 'MAP@10': 0.30466, 'MAP@100': 0.31702}\n",
"Recall: {'Recall@1': 0.174, 'Recall@3': 0.352, 'Recall@5': 0.502, 'Recall@10': 0.67, 'Recall@100': 0.952}\n",
"Precision: {'P@1': 0.174, 'P@3': 0.11733, 'P@5': 0.1004, 'P@10': 0.067, 'P@100': 0.00952}\n"
]
}
],
"source": [
"# Wrap the Ollama-served embedding model in the BEIR adapter.\n",
"model = BEIROllamaEmbeddings(\n",
"    model=\"qwen3-0.6B-emb:latest\",\n",
"    base_url=\"http://localhost:11434\",\n",
"    batch_size=64,\n",
")\n",
"\n",
"# Dense retrieval over the whole corpus, scored with \"cos_sim\".\n",
"retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
"evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
"results = evaluator.retrieve(corpus, queries)\n",
"\n",
"# evaluate() returns the four metric dicts in the order unpacked below.\n",
"metrics = evaluator.evaluate(qrels, results, [1, 3, 5, 10, 100])\n",
"ndcg, _map, recall, precision = metrics\n",
"\n",
"for label, scores in zip((\"NDCG:\", \"MAP:\", \"Recall:\", \"Precision:\"), metrics):\n",
"    print(label, scores)"
]
},
{
"cell_type": "markdown",
"id": "c4e643ca",
"metadata": {},
"source": [
"# Test qwen2.5:1.5b"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "5ced1c25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NDCG: {'NDCG@1': 0.0, 'NDCG@3': 0.0, 'NDCG@5': 0.0, 'NDCG@10': 0.0, 'NDCG@100': 0.0021}\n",
"MAP: {'MAP@1': 0.0, 'MAP@3': 0.0, 'MAP@5': 0.0, 'MAP@10': 0.0, 'MAP@100': 0.00043}\n",
"Recall: {'Recall@1': 0.0, 'Recall@3': 0.0, 'Recall@5': 0.0, 'Recall@10': 0.0, 'Recall@100': 0.01}\n",
"Precision: {'P@1': 0.0, 'P@3': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'P@100': 0.0001}\n"
]
}
],
"source": [
"# NOTE(review): the output stored with this cell shows near-zero scores at\n",
"# every cutoff - confirm this model tag is meant to be used for embeddings.\n",
"model_qwen2 = BEIROllamaEmbeddings(\n",
"    model=\"qwen2.5:1.5b\",\n",
"    base_url=\"http://localhost:11434\",\n",
"    batch_size=64,\n",
")\n",
"\n",
"# Dense retrieval over the whole corpus, scored with \"cos_sim\".\n",
"retriever_qwen_2 = DenseRetrievalExactSearch(model_qwen2, batch_size=64)\n",
"evaluator_qwen_2 = EvaluateRetrieval(retriever_qwen_2, score_function=\"cos_sim\")\n",
"results_qwen_2 = evaluator_qwen_2.retrieve(corpus, queries)\n",
"\n",
"# evaluate() returns the four metric dicts in the order unpacked below.\n",
"metrics_qwen_2 = evaluator_qwen_2.evaluate(qrels, results_qwen_2, [1, 3, 5, 10, 100])\n",
"ndcg_qwen_2, _map_qwen_2, recall_qwen_2, precision_qwen_2 = metrics_qwen_2\n",
"\n",
"for label, scores in zip((\"NDCG:\", \"MAP:\", \"Recall:\", \"Precision:\"), metrics_qwen_2):\n",
"    print(label, scores)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1db7d110",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Resultados guardados en /home/acano/PycharmProjects/assistance-engine/data/interim/beir_cosqa_results.json\n"
]
}
],
"source": [
"# Collect the metric dicts of both evaluated models, keyed by Ollama model tag.\n",
"results_data = {\n",
"    \"qwen3-0.6B-emb:latest\": {\n",
"        \"NDCG\": ndcg,\n",
"        \"MAP\": _map,\n",
"        \"Recall\": recall,\n",
"        \"Precision\": precision,\n",
"    },\n",
"    \"qwen2.5:1.5b\": {\n",
"        \"NDCG\": ndcg_qwen_2,\n",
"        \"MAP\": _map_qwen_2,\n",
"        \"Recall\": recall_qwen_2,\n",
"        \"Precision\": precision_qwen_2,\n",
"    },\n",
"}\n",
"\n",
"# The data root defaults to the original workstation path; set\n",
"# ASSISTANCE_ENGINE_DATA_DIR to write the results somewhere else.\n",
"data_root = os.environ.get(\n",
"    \"ASSISTANCE_ENGINE_DATA_DIR\",\n",
"    \"/home/acano/PycharmProjects/assistance-engine/data\",\n",
")\n",
"output_file = f\"{data_root}/interim/beir_cosqa_results.json\"\n",
"\n",
"# Create the target folder first so a fresh checkout does not fail on open().\n",
"os.makedirs(os.path.dirname(output_file), exist_ok=True)\n",
"with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
"    json.dump(results_data, f, indent=2)\n",
"\n",
"print(f\"Resultados guardados en {output_file}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4f8d78b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}