From 21bc6fc3f0384735bedcb2fdbfeae6da084dfd9b Mon Sep 17 00:00:00 2001 From: acano Date: Wed, 25 Mar 2026 10:40:49 +0100 Subject: [PATCH] feat: add embedding evaluation results and task processing notebook --- .../emb_models_results.json} | 0 scratches/acano/test_multipl_e_agent.ipynb | 462 ++++++++++++++++++ 2 files changed, 462 insertions(+) rename research/{embedding_eval_results/results.json => embeddings/embedding_eval_results/emb_models_results.json} (100%) create mode 100644 scratches/acano/test_multipl_e_agent.ipynb diff --git a/research/embedding_eval_results/results.json b/research/embeddings/embedding_eval_results/emb_models_results.json similarity index 100% rename from research/embedding_eval_results/results.json rename to research/embeddings/embedding_eval_results/emb_models_results.json diff --git a/scratches/acano/test_multipl_e_agent.ipynb b/scratches/acano/test_multipl_e_agent.ipynb new file mode 100644 index 0000000..136ab6c --- /dev/null +++ b/scratches/acano/test_multipl_e_agent.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b15c29f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 30 tasks. 'code' fields cleared.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'task_id': 1,\n", + " 'text': \"Captura el parámetro 'username' de la petición HTTP y devuélvelo como resultado. Si no existe, la variable será None.\",\n", + " 'code': '',\n", + " 'test_inputs': {'username': 'alice'},\n", + " 'test_list': [\"re.match(r'^alice$', str(username))\"]}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import copy\n", + "\n", + "from src.config import settings\n", + "\n", + "INPUT_PATH = settings.proj_root / \"synthetic_datasets/synthetic_data_generated_bedrock.json\"\n", + "OUTPUT_PATH = settings.proj_root / \"synthetic_datasets/multipl_e_synthetic_dataset.json\"\n", + "\n", + "with open(INPUT_PATH) as f:\n", + " dataset = json.load(f)\n", + "\n", + "# Deep copy with code emptied\n", + "tasks = copy.deepcopy(dataset)\n", + "for task in tasks:\n", + " task[\"code\"] = \"\"\n", + "\n", + "print(f\"Loaded {len(tasks)} tasks. 'code' fields cleared.\")\n", + "tasks[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d469eaa5", + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import time\n", + "import re\n", + "\n", + "GRPC_HOST = \"localhost:50052\"\n", + "SERVICE = \"brunix.AssistanceEngine/AskAgent\"\n", + "SESSION_ID = \"dev-test-123\"\n", + "\n", + "AVAP_BLOCK_RE = re.compile(r\"```avap\\s*\\n(.*?)```\", re.DOTALL)\n", + "\n", + "\n", + "def ask_agent(query: str) -> str:\n", + " \"\"\"Call gRPC AskAgent and extract code from ```avap``` blocks in the response.\"\"\"\n", + " payload = json.dumps({\"query\": query, \"session_id\": SESSION_ID})\n", + " cmd = [\n", + " \"grpcurl\", \"-plaintext\",\n", + " \"-d\", payload,\n", + " GRPC_HOST,\n", + " SERVICE,\n", + " ]\n", + " result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)\n", + " if result.returncode != 0:\n", + " raise RuntimeError(f\"grpcurl failed: {result.stderr}\")\n", + "\n", + " # Collect all text fragments from the streamed responses\n", + " raw = result.stdout.strip()\n", + " full_text = \"\"\n", + " for block in raw.split(\"\\n}\\n\"):\n", + " block = block.strip()\n", + " if not block:\n", + " continue\n", + " if not block.endswith(\"}\"):\n", + " block += \"}\"\n", + " try:\n", + " msg = json.loads(block)\n", + " full_text += msg.get(\"text\", \"\")\n", + " except json.JSONDecodeError:\n", + " continue\n", + "\n", + " # Extract code from ```avap ... ``` blocks\n", + " matches = AVAP_BLOCK_RE.findall(full_text)\n", + " return \"\\n\".join(m.strip() for m in matches) if matches else \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9d2dc8c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1/30] Task 1: Captura el parámetro 'username' de la petición HTTP y devuélvelo como resultado....\n", + " -> Got 188 chars of code\n", + "[2/30] Task 2: Recibe el parámetro 'email' y establece el código de estado HTTP en 200. Devuelv...\n", + " -> Got 79 chars of code\n", + "[3/30] Task 3: Recibe el parámetro 'password', genera su hash SHA-256 y devuelve el hash como r...\n", + " -> Got 73 chars of code\n", + "[4/30] Task 4: Recibe el parámetro 'text', reemplaza todos los espacios por guiones bajos y dev...\n", + " -> Got 63 chars of code\n", + "[5/30] Task 5: Genera un token aleatorio de 32 caracteres alfanuméricos y devuélvelo como resul...\n", + " -> Got 90 chars of code\n", + "[6/30] Task 6: Recibe el parámetro 'age'. Si age es mayor que 18, devuelve 'adulto'; de lo cont...\n", + " -> Got 131 chars of code\n", + "[7/30] Task 7: Recibe el parámetro 'score'. Si score es igual a 100, establece _status en 200 y...\n", + " -> Got 134 chars of code\n", + "[8/30] Task 8: Crea una lista con el elemento 'item1', obtén su longitud y devuelve la longitud...\n", + " -> Got 78 chars of code\n", + "[9/30] Task 9: Recibe el parámetro 'items' como lista de query params, obtén su longitud y devu...\n", + " -> Got 85 chars of code\n", + "[10/30] Task 10: Recibe el parámetro 'data' como JSON, extrae el campo 'name' y devuélvelo como r...\n", + " -> Got 66 chars of code\n", + "[11/30] Task 11: Crea un objeto JSON vacío, agrega el campo 'status' con valor 'ok' y devuelve el...\n", + " -> Got 61 chars of code\n", + "[12/30] Task 12: Recibe el parámetro 'password', genera su hash MD5 y devuelve el hash como resul...\n", + " -> Got 44 chars of code\n", + "[13/30] Task 13: Obtén la fecha y hora actual en formato 'YYYY-MM-DD' en la zona horaria 'UTC' y ...\n", + " -> Got 85 chars of code\n", + "[14/30] Task 14: Recibe el parámetro 'epoch', conviértelo a string de fecha en formato 'YYYY-MM-D...\n", + " -> Got 94 chars of code\n", + "[15/30] Task 15: Recibe el parámetro 'date_str' en formato 'YYYY-MM-DD', conviértelo a epoch y de...\n", + " -> Got 102 chars of code\n", + "[16/30] Task 16: Define una función que recibe un número y devuelve su cuadrado. Llama a la funci...\n", + " -> Got 89 chars of code\n", + "[17/30] Task 17: Define una función que recibe dos números y devuelve su suma. Llama a la función...\n", + " -> Got 89 chars of code\n", + "[18/30] Task 18: Usa un bloque try/exception para intentar dividir el parámetro 'num' entre 0. Si...\n", + " -> Got 116 chars of code\n", + "[19/30] Task 19: Recibe el parámetro 'url', realiza una petición GET a esa URL con timeout de 500...\n", + " -> Got 86 chars of code\n", + "[20/30] Task 20: Recibe los parámetros 'url' y 'body', realiza una petición POST con timeout de 3...\n", + " -> Got 115 chars of code\n", + "[21/30] Task 21: Instancia un conector externo con UUID '20908e93260147acb2636967021fbf5d', llama...\n", + " -> Got 131 chars of code\n", + "[22/30] Task 22: Lanza una función 'fetchData' de forma asíncrona con go, espera el resultado con...\n", + " -> Got 81 chars of code\n", + "[23/30] Task 23: Recibe el parámetro 'n', itera desde 0 hasta n acumulando la suma y devuelve la ...\n", + " -> Got 126 chars of code\n", + "[24/30] Task 24: Recibe el parámetro 'value'. Usando if Modo 2, si value es mayor que 0 y menor q...\n", + " -> Got 180 chars of code\n", + "[25/30] Task 25: Realiza una consulta ORM a la tabla 'users' seleccionando todos los campos sin f...\n", + " -> Got 69 chars of code\n", + "[26/30] Task 26: Recibe los parámetros 'username' y 'email', inserta un registro en la tabla 'use...\n", + " -> Got 74 chars of code\n", + "[27/30] Task 27: Recibe el parámetro 'user_id', actualiza el campo 'active' a 1 en la tabla 'user...\n", + " -> Got 82 chars of code\n", + "[28/30] Task 28: Importa la librería nativa 'math', calcula el cuadrado de 9 usando una función y...\n", + " -> Got 48 chars of code\n", + "[29/30] Task 29: Recibe el parámetro 'items_json' como JSON con una lista bajo la clave 'items'. ...\n", + " -> Got 81 chars of code\n", + "[30/30] Task 30: Recibe el parámetro 'token'. Si el token tiene exactamente 32 caracteres (usando...\n", + " -> Got 145 chars of code\n", + "\n", + "Done. 30 succeeded, 0 errors.\n" + ] + } + ], + "source": [ + "# Process all tasks – call the agent for each one\n", + "errors = []\n", + "\n", + "for i, task in enumerate(tasks):\n", + " query = task[\"text\"]\n", + " task_id = task[\"task_id\"]\n", + " print(f\"[{i + 1}/{len(tasks)}] Task {task_id}: {query[:80]}...\")\n", + "\n", + " try:\n", + " code = ask_agent(query)\n", + " task[\"code\"] = code\n", + " print(f\" -> Got {len(code)} chars of code\")\n", + " except Exception as e:\n", + " errors.append({\"task_id\": task_id, \"error\": str(e)})\n", + " print(f\" -> ERROR: {e}\")\n", + "\n", + " time.sleep(0.5) # small delay between requests\n", + "\n", + "print(f\"\\nDone. {len(tasks) - len(errors)} succeeded, {len(errors)} errors.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3ce3ef4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task 1:\n", + " text: Captura el parámetro 'username' de la petición HTTP y devuélvelo como resultado.\n", + " code: addParam(\"username\", targetUsername) # or: targetUsername = \"username\"\n", + "targetUsername = addVar(targetUsername, \"None\")\n", + "\n", + "Task 2:\n", + " text: Recibe el parámetro 'email' y establece el código de estado HTTP en 200. Devuelv\n", + " code: addVar(_status, 200) # OK\n", + "addParam(\"email\", targetEmail)\n", + "addResult(targetEmail)\n", + "\n", + "Task 3:\n", + " text: Recibe el parámetro 'password', genera su hash SHA-256 y devuelve el hash como r\n", + " code: hash = generateSHA256(password)\n", + "addVar(_status, 200) # OK\n", + "addResult(hash)\n", + "\n", + "Task 4:\n", + " text: Recibe el parámetro 'text', reemplaza todos los espacios por guiones bajos y dev\n", + " code: replaceSpacesWithDashes(text)\n", + "addResult(text.replace(\" \", \"-\"))\n", + "\n", + "Task 5:\n", + " text: Genera un token aleatorio de 32 caracteres alfanuméricos y devuélvelo como resul\n", + " code: randomString(\"[0-9a-zA-Z]\", 32, \"dash\")\n", + "addResult(randomString(\"[0-9a-zA-Z]\", 32, \"dash\"))\n", + "\n", + "Task 6:\n", + " text: Recibe el parámetro 'age'. Si age es mayor que 18, devuelve 'adulto'; de lo cont\n", + " code: if(age > 18):\n", + " addVar(_status, 200) # OK\n", + "else:\n", + " addVar(_status, 403) # Forbidden\n", + "addResult(\"adulto\" if age > 18 el\n", + "\n", + "Task 7:\n", + " text: Recibe el parámetro 'score'. Si score es igual a 100, establece _status en 200 y\n", + " code: if(score == 100):\n", + " addVar(_status, 200) # OK\n", + "else:\n", + " addVar(_status, 400)\n", + "addResult(\"perfecto\" if score == 100 else\n", + "\n", + "Task 8:\n", + " text: Crea una lista con el elemento 'item1', obtén su longitud y devuelve la longitud\n", + " code: variableToList(\"item1\", targetList)\n", + "getListLen(targetList, len)\n", + "addResult(len)\n", + "\n", + "Task 9:\n", + " text: Recibe el parámetro 'items' como lista de query params, obtén su longitud y devu\n", + " code: getQueryParamList(\"paramName\", targetList)\n", + "getListLen(targetList, len)\n", + "addResult(len)\n", + "\n", + "Task 10:\n", + " text: Recibe el parámetro 'data' como JSON, extrae el campo 'name' y devuélvelo como r\n", + " code: variableFromJSON(\"data\", \"name\", targetName)\n", + "addResult(targetName)\n", + "\n", + "Task 11:\n", + " text: Crea un objeto JSON vacío, agrega el campo 'status' con valor 'ok' y devuelve el\n", + " code: emptyObject()\n", + "addVar(\"status\", \"ok\")\n", + "addResult(emptyObject())\n", + "\n", + "Task 12:\n", + " text: Recibe el parámetro 'password', genera su hash MD5 y devuelve el hash como resul\n", + " code: hash = generateMD5(password)\n", + "addResult(hash)\n", + "\n", + "Task 13:\n", + " text: Obtén la fecha y hora actual en formato 'YYYY-MM-DD' en la zona horaria 'UTC' y \n", + " code: getDateTime(\"UTC\", \"local\", 0, targetDate)\n", + "addResult(targetDate.strftime(\"%Y-%m-%d\"))\n", + "\n", + "Task 14:\n", + " text: Recibe el parámetro 'epoch', conviértelo a string de fecha en formato 'YYYY-MM-D\n", + " code: getDateTime(\"UTC\", \"local\", 0, targetDate)\n", + "addResult(targetDate.strftime(\"%Y-%m-%d %H:%M:%S\"))\n", + "\n", + "Task 15:\n", + " text: Recibe el parámetro 'date_str' en formato 'YYYY-MM-DD', conviértelo a epoch y de\n", + " code: getDateTime(\"UTC\", \"local\", 0, targetDate)\n", + "addResult(targetDate.strftime(\"%Y-%m-%d\").replace(\"-\", \"\"))\n", + "\n", + "Task 16:\n", + " text: Define una función que recibe un número y devuelve su cuadrado. Llama a la funci\n", + " code: def square(n):\n", + " result = n * n\n", + " return result\n", + "\n", + "result = square(5)\n", + "addResult(result)\n", + "\n", + "Task 17:\n", + " text: Define una función que recibe dos números y devuelve su suma. Llama a la función\n", + " code: def add(a, b):\n", + " result = a + b\n", + " return result\n", + "\n", + "result = add(5, 3)\n", + "addResult(result)\n", + "\n", + "Task 18:\n", + " text: Usa un bloque try/exception para intentar dividir el parámetro 'num' entre 0. Si\n", + " code: try:\n", + " result = num / 0\n", + "except ZeroDivisionError:\n", + " addVar(_status, 403) # Forbidden\n", + "addResult(\"error_division\")\n", + "\n", + "Task 19:\n", + " text: Recibe el parámetro 'url', realiza una petición GET a esa URL con timeout de 500\n", + " code: addVar(_status, 200) # OK\n", + "addParam(\"url\", targetUrl)\n", + "addResult(getResponse(targetUrl))\n", + "\n", + "Task 20:\n", + " text: Recibe los parámetros 'url' y 'body', realiza una petición POST con timeout de 3\n", + " code: addVar(_status, 200) # OK\n", + "addParam(\"url\", targetUrl)\n", + "addParam(\"body\", targetBody)\n", + "addResult(getResponse(targetUrl))\n", + "\n", + "Task 21:\n", + " text: Instancia un conector externo con UUID '20908e93260147acb2636967021fbf5d', llama\n", + " code: belvo_connector = avapConnector(\"20908e93260147acb2636967021fbf5d\")\n", + "addVar(_status, 200) # OK\n", + "addResult(getStatus(belvo_\n", + "\n", + "Task 22:\n", + " text: Lanza una función 'fetchData' de forma asíncrona con go, espera el resultado con\n", + " code: go fetchData()\n", + "resultado = gather(\"fetchData\", timeout=2000)\n", + "addResult(resultado)\n", + "\n", + "Task 23:\n", + " text: Recibe el parámetro 'n', itera desde 0 hasta n acumulando la suma y devuelve la \n", + " code: def sum(n):\n", + " result = 0\n", + " for i in range(n + 1):\n", + " result += i\n", + " return result\n", + "\n", + "result = sum(5)\n", + "addResult(r\n", + "\n", + "Task 24:\n", + " text: Recibe el parámetro 'value'. Usando if Modo 2, si value es mayor que 0 y menor q\n", + " code: if(value > 0 and value < 100):\n", + " addVar(_status, 200) # OK\n", + "else:\n", + " addVar(_status, 403) # Forbidden\n", + "addResult(\"rango\n", + "\n", + "Task 25:\n", + " text: Realiza una consulta ORM a la tabla 'users' seleccionando todos los campos sin f\n", + " code: ormAccessSelect(\"*\", \"users\", \"\", targetUsers)\n", + "addResult(targetUsers)\n", + "\n", + "Task 26:\n", + " text: Recibe los parámetros 'username' y 'email', inserta un registro en la tabla 'use\n", + " code: ormAccessInsert(\"username\", \"email\", \"\", targetUser)\n", + "addResult(targetUser)\n", + "\n", + "Task 27:\n", + " text: Recibe el parámetro 'user_id', actualiza el campo 'active' a 1 en la tabla 'user\n", + " code: ormAccessUpdate(\"active = 1\", \"users\", \"id = ?\", targetUser)\n", + "addResult(targetUser)\n", + "\n", + "Task 28:\n", + " text: Importa la librería nativa 'math', calcula el cuadrado de 9 usando una función y\n", + " code: import math\n", + "result = square(9)\n", + "addResult(result)\n", + "\n", + "Task 29:\n", + " text: Recibe el parámetro 'items_json' como JSON con una lista bajo la clave 'items'. \n", + " code: getQueryParamList(\"items\", targetList)\n", + "getListLen(targetList, len)\n", + "addResult(len)\n", + "\n", + "Task 30:\n", + " text: Recibe el parámetro 'token'. Si el token tiene exactamente 32 caracteres (usando\n", + " code: if(len(token) == 32):\n", + " addVar(_status, 200)\n", + "else:\n", + " addVar(_status, 401)\n", + "addResult(\"token_valido\" if len(token) == \n", + "\n" + ] + } + ], + "source": [ + "# Preview a few results\n", + "for task in tasks:\n", + " print(f\"Task {task['task_id']}:\")\n", + " print(f\" text: {task['text'][:80]}\")\n", + " print(f\" code: {task['code'][:120]}\")\n", + " print()\n", + "\n", + "if errors:\n", + " print(\"Errors:\")\n", + " for e in errors:\n", + " print(f\" Task {e['task_id']}: {e['error']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d19a6325", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved to /home/acano/PycharmProjects/assistance-engine/synthetic_datasets/multipl_e_synthetic_dataset.json\n" + ] + } + ], + "source": [ + "# Save the completed dataset\n", + "with open(OUTPUT_PATH, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(tasks, f, ensure_ascii=False, indent=2)\n", + "\n", + "print(f\"Saved to {OUTPUT_PATH.resolve()}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "assistance-engine", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}