update makefile: add .PHONY declarations and S3 data sync targets; add embeddings evaluation notebook

2026-02-24 14:52:48 +01:00 · 2026-02-24 14:52:48 +01:00 · ff438ea6c4
parent 9b6726c232
commit ff438ea6c4
3 changed files with 300 additions and 2 deletions
--- a/14
+++ b/14
@ -6,15 +6,18 @@ help:
 	@echo "  make tunnels_up                - Start tunnels"
 	@echo "  make compose_up                - Run tunnels script and start Docker Compose"

+.PHONY: sync_requirements
 sync_requirements:
 	@echo "Exporting dependencies from pyproject.toml to requirements.txt..."
 	uv export --format requirements-txt --no-hashes --no-dev -o Docker/requirements.txt
 	@echo "✓ requirements.txt updated successfully"

+.PHONY: tunnels_up
 tunnels_up:
 	bash ./scripts/start-tunnels.sh < /dev/null &
 	@echo "✓ Tunnels started!"

+.PHONY: compose_up
 compose_up:
 	bash ./scripts/start-tunnels.sh < /dev/null &
 	sleep 2
@ -27,3 +30,14 @@ tunnels_down:
 	@echo "Killing all kubectl port-forward tunnels..."
 	-pkill -f 'kubectl port-forward' || true
 	@echo "✓ All tunnels killed!"
+
+## Download Data from storage system (S3 prefix -> local data/)
+.PHONY: sync_data_down
+sync_data_down:
+	aws s3 sync s3://mrh-avap/data/ \
+		data/
+
+## Upload Data to storage system (local data/ -> S3 prefix, skipping *.gitkeep placeholders)
+.PHONY: sync_data_up
+sync_data_up:
+	aws s3 sync --exclude "*.gitkeep" data/ \
+		s3://mrh-avap/data/
--- a/scratches/pseco/evaluation/embeddings/n00
+++ b/scratches/pseco/evaluation/embeddings/n00
@ -0,0 +1,255 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "66cbbaf8",
+   "metadata": {},
+   "source": [
+    "# Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c01c19dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict, List, Union\n",
+    "import numpy as np\n",
+    "from langchain_ollama import OllamaEmbeddings\n",
+    "from beir.datasets.data_loader import GenericDataLoader\n",
+    "from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
+    "from beir.retrieval.evaluation import EvaluateRetrieval\n",
+    "from beir import util\n",
+    "import json\n",
+    "from datasets import load_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ac011c1c",
+   "metadata": {},
+   "source": [
+    "# Utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b83e7900",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class BEIROllamaEmbeddings:\n",
+    "    \"\"\"\n",
+    "    Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        base_url: str,\n",
+    "        model: str,\n",
+    "        batch_size: int = 64,\n",
+    "    ) -> None:\n",
+    "        self.batch_size = batch_size\n",
+    "        self.embeddings = OllamaEmbeddings(\n",
+    "            base_url=base_url,\n",
+    "            model=model,\n",
+    "        )\n",
+    "\n",
+    "    def _batch_embed(self, texts: List[str]) -> np.ndarray:\n",
+    "        vectors = []\n",
+    "\n",
+    "        for i in range(0, len(texts), self.batch_size):\n",
+    "            batch = texts[i : i + self.batch_size]\n",
+    "            batch_vectors = self.embeddings.embed_documents(batch)\n",
+    "            vectors.extend(batch_vectors)\n",
+    "\n",
+    "        return np.asarray(vectors, dtype=np.float32)\n",
+    "\n",
+    "    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:\n",
+    "        \"\"\"\n",
+    "        BEIR query encoder\n",
+    "        \"\"\"\n",
+    "        return self._batch_embed(queries)\n",
+    "\n",
+    "    def encode_corpus(\n",
+    "        self,\n",
+    "        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],\n",
+    "        **kwargs,\n",
+    "    ) -> np.ndarray:\n",
+    "        \"\"\"\n",
+    "        BEIR corpus encoder\n",
+    "        \"\"\"\n",
+    "        if isinstance(corpus, dict):\n",
+    "            corpus = list(corpus.values())\n",
+    "\n",
+    "        texts = []\n",
+    "        for doc in corpus:\n",
+    "            title = (doc.get(\"title\") or \"\").strip()\n",
+    "            text = (doc.get(\"text\") or \"\").strip()\n",
+    "\n",
+    "            if title:\n",
+    "                texts.append(f\"{title}\\n{text}\")\n",
+    "            else:\n",
+    "                texts.append(text)\n",
+    "\n",
+    "        return self._batch_embed(texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "af3eb66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_hf_to_beir(hf_dataset):\n",
+    "    corpus, queries, qrels = {}, {}, {}\n",
+    "    \n",
+    "    for i, data in enumerate(hf_dataset):\n",
+    "        docid = f\"doc_{i}\"\n",
+    "        queryid = f\"q_{i}\"\n",
+    "        \n",
+    "        # El código es el documento (lo que el agente debe recuperar)\n",
+    "        corpus[docid] = {\"title\": data.get(\"func_name\", \"\"), \"text\": data['code']}\n",
+    "        \n",
+    "        # El docstring es la consulta (lo que el usuario pide)\n",
+    "        queries[queryid] = data['docstring']\n",
+    "        \n",
+    "        # Relación 1 a 1: la query i busca el código i\n",
+    "        qrels[queryid] = {docid: 1}\n",
+    "            \n",
+    "    return corpus, queries, qrels"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9528fb6",
+   "metadata": {},
+   "source": [
+    "# Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "230aae25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_dataset = load_dataset(\"google/code_x_glue_tc_nl_code_search_adv\", split=\"test\")\n",
+    "corpus, queries, qrels = convert_hf_to_beir(raw_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13050d31",
+   "metadata": {},
+   "source": [
+    "# Test qwen3-0.6B-emb:latest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "514540af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = BEIROllamaEmbeddings(\n",
+    "    base_url=\"http://localhost:11434\",\n",
+    "    model=\"qwen3-0.6B-emb:latest\",\n",
+    "    batch_size=64,\n",
+    ")\n",
+    "\n",
+    "# Inicializar buscador y evaluador\n",
+    "retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
+    "evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
+    "\n",
+    "# Ejecutar recuperación\n",
+    "results = evaluator.retrieve(corpus, queries)\n",
+    "\n",
+    "# Evaluar métricas (NDCG, MAP, Recall, Precision)\n",
+    "ndcg, _map, recall, precision = evaluator.evaluate(\n",
+    "    qrels, results, [1, 3, 5, 10]\n",
+    ")\n",
+    "\n",
+    "print(f\"Resultados para CodeXGLUE:\")\n",
+    "print(\"NDCG@10:\", ndcg[\"NDCG@10\"])\n",
+    "print(\"Recall@10:\", recall[\"Recall@10\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4e643ca",
+   "metadata": {},
+   "source": [
+    "# Test qwen2.5:1.5b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ced1c25",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "NDCG: {'NDCG@1': 0.02333, 'NDCG@3': 0.03498, 'NDCG@5': 0.0404, 'NDCG@10': 0.04619, 'NDCG@100': 0.07768}\n",
+      "MAP: {'MAP@1': 0.02083, 'MAP@3': 0.03083, 'MAP@5': 0.03375, 'MAP@10': 0.03632, 'MAP@100': 0.04123}\n",
+      "Recall: {'Recall@1': 0.02083, 'Recall@3': 0.04417, 'Recall@5': 0.0575, 'Recall@10': 0.07417, 'Recall@100': 0.23144}\n",
+      "Precision: {'P@1': 0.02333, 'P@3': 0.01556, 'P@5': 0.01267, 'P@10': 0.00833, 'P@100': 0.00277}\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = BEIROllamaEmbeddings(\n",
+    "    base_url=\"http://localhost:11434\",\n",
+    "    model=\"qwen2.5:1.5b\",\n",
+    "    batch_size=64,\n",
+    ")\n",
+    "\n",
+    "# Inicializar buscador y evaluador\n",
+    "retriever = DenseRetrievalExactSearch(model, batch_size=64)\n",
+    "evaluator = EvaluateRetrieval(retriever, score_function=\"cos_sim\")\n",
+    "\n",
+    "# Ejecutar recuperación\n",
+    "results = evaluator.retrieve(corpus, queries)\n",
+    "\n",
+    "# Evaluar métricas (NDCG, MAP, Recall, Precision)\n",
+    "ndcg, _map, recall, precision = evaluator.evaluate(\n",
+    "    qrels, results, [1, 3, 5, 10]\n",
+    ")\n",
+    "\n",
+    "print(f\"Resultados para CodeXGLUE:\")\n",
+    "print(\"NDCG@10:\", ndcg[\"NDCG@10\"])\n",
+    "print(\"Recall@10:\", recall[\"Recall@10\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "assistance-engine",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/scratches/pseco/evaluation/embeddings/n00
+++ b/scratches/pseco/evaluation/embeddings/n00
@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "id": "c01c19dc",
   "metadata": {},
   "outputs": [],
@ -21,7 +21,8 @@
    "from beir.datasets.data_loader import GenericDataLoader\n",
    "from beir.retrieval.search.dense import DenseRetrievalExactSearch\n",
    "from beir.retrieval.evaluation import EvaluateRetrieval\n",
-    "from beir import util"
+    "from beir import util\n",
+    "import json"
   ]
  },
  {
@ -96,6 +97,34 @@
    "        return self._batch_embed(texts)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af3eb66d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_codexglue_to_beir(input_file):\n",
+    "    corpus, queries, qrels = {}, {}, {}\n",
+    "    with open(input_file, 'r') as f:\n",
+    "        for i, line in enumerate(f):\n",
+    "            data = json.loads(line)\n",
+    "            docid = f\"doc_{i}\"\n",
+    "            queryid = f\"q_{i}\"\n",
+    "            \n",
+    "            # El código es nuestro documento (Corpus)\n",
+    "            corpus[docid] = {\"title\": \"\", \"text\": data['code']}\n",
+    "            # El docstring es nuestra consulta (Query)\n",
+    "            queries[queryid] = data['docstring']\n",
+    "            # En CodeXGLUE, la consulta i corresponde al código i\n",
+    "            qrels[queryid] = {docid: 1}\n",
+    "            \n",
+    "    return corpus, queries, qrels\n",
+    "\n",
+    "# Carga tus datos (ejemplo con el set de test de AdvTest)\n",
+    "corpus, queries, qrels = convert_codexglue_to_beir(\"test.jsonl\")\n"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "c9528fb6",