class BEIROllamaEmbeddings:
    """
    Adapter exposing LangChain's ``OllamaEmbeddings`` through the encoder
    interface BEIR's dense retrieval expects (``encode_queries`` /
    ``encode_corpus``).
    """

    def __init__(
        self,
        base_url: str,
        model: str,
        batch_size: int = 64,
    ) -> None:
        # Number of texts sent to the Ollama server per request.
        self.batch_size = batch_size
        self.embeddings = OllamaEmbeddings(
            base_url=base_url,
            model=model,
        )

    def _batch_embed(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` in chunks of ``self.batch_size`` and stack the
        resulting vectors into a single float32 matrix."""
        all_vectors: List[List[float]] = []

        for start in range(0, len(texts), self.batch_size):
            chunk = texts[start : start + self.batch_size]
            all_vectors += self.embeddings.embed_documents(chunk)

        return np.asarray(all_vectors, dtype=np.float32)

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """
        BEIR query encoder: embed the raw query strings.
        """
        return self._batch_embed(queries)

    def encode_corpus(
        self,
        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],
        **kwargs,
    ) -> np.ndarray:
        """
        BEIR corpus encoder: flatten each document's title/text into one
        string ("title\\ntext" when a title is present) and embed them.
        """
        # BEIR may hand the corpus over as {doc_id: doc} or as a plain list.
        docs = list(corpus.values()) if isinstance(corpus, dict) else corpus

        texts: List[str] = []
        for doc in docs:
            title = (doc.get("title") or "").strip()
            body = (doc.get("text") or "").strip()
            texts.append(f"{title}\n{body}" if title else body)

        return self._batch_embed(texts)
def convert_codexglue_to_beir(input_file):
    """
    Convert a CodeXGLUE code-search JSONL file into BEIR's
    (corpus, queries, qrels) triple.

    Each non-empty line ``i`` of the input must be a JSON object with at
    least ``'code'`` and ``'docstring'`` keys. The code becomes document
    ``doc_i`` (the corpus), the docstring becomes query ``q_i``, and qrels
    records them as the single relevant pair — in CodeXGLUE, query i
    corresponds to code i.

    Parameters
    ----------
    input_file : str or path-like
        Path to the JSONL file (e.g. the AdvTest test split).

    Returns
    -------
    tuple of (dict, dict, dict)
        corpus:  {doc_id: {"title": "", "text": code}}
        queries: {query_id: docstring}
        qrels:   {query_id: {doc_id: 1}}
    """
    corpus, queries, qrels = {}, {}, {}
    # Explicit UTF-8: JSONL corpora are UTF-8-encoded, and relying on the
    # platform's locale default breaks on non-ASCII docstrings (e.g. Windows).
    with open(input_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines; json.loads("") would raise.
                continue
            data = json.loads(line)
            docid = f"doc_{i}"
            queryid = f"q_{i}"

            # The code is our document (corpus side).
            corpus[docid] = {"title": "", "text": data['code']}
            # The docstring is our query.
            queries[queryid] = data['docstring']
            # In CodeXGLUE, query i corresponds to code i.
            qrels[queryid] = {docid: 1}

    return corpus, queries, qrels