# Remote S3 prefix synchronised with the local data/ directory by the two
# targets below. Override per invocation, e.g.
#   make sync_data_up DATA_REMOTE=s3://other-bucket/data/
DATA_REMOTE ?= s3://mrh-avap/data/

## Download Data from storage system
.PHONY: sync_data_down
sync_data_down:
	aws s3 sync $(DATA_REMOTE) \
	data/

## Upload Data to storage system
.PHONY: sync_data_up
sync_data_up:
	aws s3 sync --exclude "*.gitkeep" data/ \
	$(DATA_REMOTE)
class BEIROllamaEmbeddings:
    """
    Adapter that makes LangChain's OllamaEmbeddings compatible with BEIR.

    It exposes ``encode_queries`` and ``encode_corpus``, both of which send
    texts to ``OllamaEmbeddings.embed_documents`` in fixed-size batches and
    return the vectors as a ``float32`` NumPy array (one row per text).
    """

    def __init__(
        self,
        base_url: str,
        model: str,
        batch_size: int = 64,
    ) -> None:
        """
        Create the adapter.

        Args:
            base_url: URL of the Ollama server.
            model: Name of the Ollama model used to embed texts.
            batch_size: Number of texts sent per ``embed_documents`` call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero step makes range() raise an unrelated-looking error and a
        # negative step silently yields no batches, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.batch_size = batch_size
        self.embeddings = OllamaEmbeddings(
            base_url=base_url,
            model=model,
        )

    def _batch_embed(self, texts: List[str]) -> np.ndarray:
        """
        Embed ``texts`` in chunks of ``self.batch_size``.

        Returns:
            A ``float32`` array with one row per input text. For empty input
            an empty two-dimensional ``(0, 0)`` array is returned so callers
            always receive a matrix.
        """
        vectors: List[List[float]] = []

        for start in range(0, len(texts), self.batch_size):
            chunk = texts[start : start + self.batch_size]
            vectors.extend(self.embeddings.embed_documents(chunk))

        if not vectors:
            return np.empty((0, 0), dtype=np.float32)

        return np.asarray(vectors, dtype=np.float32)

    @staticmethod
    def _doc_to_text(doc: Dict[str, str]) -> str:
        """Join a document's stripped title and text; title is optional."""
        title = (doc.get("title") or "").strip()
        text = (doc.get("text") or "").strip()
        return f"{title}\n{text}" if title else text

    def encode_queries(self, queries: List[str], **kwargs) -> np.ndarray:
        """
        BEIR query encoder.

        Args:
            queries: Query strings to embed.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            ``float32`` array with one row per query.
        """
        return self._batch_embed(queries)

    def encode_corpus(
        self,
        corpus: Union[List[Dict[str, str]], Dict[str, Dict[str, str]]],
        **kwargs,
    ) -> np.ndarray:
        """
        BEIR corpus encoder.

        Args:
            corpus: Either a list of documents or a mapping of document id to
                document. Each document is a dict that may carry ``"title"``
                and ``"text"``; missing or ``None`` values count as empty.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            ``float32`` array with one row per document, in iteration order.
        """
        if isinstance(corpus, dict):
            corpus = list(corpus.values())

        return self._batch_embed([self._doc_to_text(doc) for doc in corpus])
def convert_hf_to_beir(hf_dataset):
    """
    Convert an iterable of code-search records into BEIR-style structures.

    Each record is read through ``data["code"]``, ``data["docstring"]`` and
    ``data.get("func_name")``. Record ``i`` produces document ``doc_{i}``
    and query ``q_{i}``, linked one-to-one with relevance 1.

    Args:
        hf_dataset: Iterable of mapping-like records.

    Returns:
        A ``(corpus, queries, qrels)`` tuple:
        ``corpus`` maps doc id -> ``{"title": ..., "text": ...}``,
        ``queries`` maps query id -> docstring,
        ``qrels`` maps query id -> ``{doc id: 1}``.

    Raises:
        KeyError: If a record lacks ``"code"`` or ``"docstring"``.
    """
    corpus, queries, qrels = {}, {}, {}

    for i, data in enumerate(hf_dataset):
        docid = f"doc_{i}"
        queryid = f"q_{i}"

        # The code is the document (what the retriever must find).
        # `or ""` also covers a func_name that is present but None, so the
        # title is always a string.
        corpus[docid] = {
            "title": data.get("func_name") or "",
            "text": data["code"],
        }

        # The docstring is the query (what the user asks for).
        queries[queryid] = data["docstring"]

        # One-to-one relation: query i looks for code i.
        qrels[queryid] = {docid: 1}

    return corpus, queries, qrels
# Evaluate the `qwen2.5:1.5b` Ollama model on the CodeXGLUE corpus/queries
# built earlier in the notebook.
# NOTE(review): `qwen2.5:1.5b` looks like a general-purpose model tag rather
# than a dedicated embedding model — confirm it is the intended baseline.
# NOTE(review): the output saved with this cell shows full NDCG/MAP/Recall/P
# dictionaries including @100 cut-offs, which this code (k_values
# [1, 3, 5, 10], two printed metrics) does not produce; the saved output
# appears stale — re-run before relying on those numbers.
model = BEIROllamaEmbeddings(
    base_url="http://localhost:11434",
    model="qwen2.5:1.5b",
    batch_size=64,
)

# Initialise the retriever and the evaluator
retriever = DenseRetrievalExactSearch(model, batch_size=64)
evaluator = EvaluateRetrieval(retriever, score_function="cos_sim")

# Run retrieval
results = evaluator.retrieve(corpus, queries)

# Compute metrics (NDCG, MAP, Recall, Precision)
ndcg, _map, recall, precision = evaluator.evaluate(
    qrels, results, [1, 3, 5, 10]
)

# Plain string: the original f-string had no placeholders.
print("Resultados para CodeXGLUE:")
print("NDCG@10:", ndcg["NDCG@10"])
print("Recall@10:", recall["Recall@10"])
def convert_codexglue_to_beir(input_file):
    """
    Read a JSON Lines file and build BEIR-style corpus, queries and qrels.

    Every non-blank line is parsed with ``json.loads`` and must contain
    ``"code"`` and ``"docstring"``. The i-th record becomes document
    ``doc_{i}`` (empty title, code as text) and query ``q_{i}`` (the
    docstring), linked one-to-one with relevance 1.

    Args:
        input_file: Path of the ``.jsonl`` file to read.

    Returns:
        A ``(corpus, queries, qrels)`` tuple of dictionaries.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"code"`` or ``"docstring"``.
    """
    corpus, queries, qrels = {}, {}, {}

    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would otherwise reject.
        records = (json.loads(line) for line in f if line.strip())

        for i, data in enumerate(records):
            docid = f"doc_{i}"
            queryid = f"q_{i}"

            # The code is the document (corpus).
            corpus[docid] = {"title": "", "text": data["code"]}
            # The docstring is the query.
            queries[queryid] = data["docstring"]
            # Query i corresponds to code i.
            qrels[queryid] = {docid: 1}

    return corpus, queries, qrels