assistance-engine/scratches/acano/evaluate_retrieve.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8fed4518",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
      "  from ragas.metrics import (\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Ensure the project root is on the path so `src` is importable\n",
    "_project_root = str(Path(__file__).resolve().parents[2]) if \"__file__\" in dir() else str(Path.cwd().parents[1])\n",
    "if _project_root not in sys.path:\n",
    "    sys.path.insert(0, _project_root)\n",
    "\n",
    "from langchain_core.documents import Document\n",
    "from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
    "from langchain_elasticsearch import ElasticsearchStore\n",
    "from ragas import evaluate, SingleTurnSample\n",
    "from ragas.llms import LangchainLLMWrapper\n",
    "from ragas.embeddings import LangchainEmbeddingsWrapper\n",
    "from ragas.testset import TestsetGenerator\n",
    "from ragas.testset.persona import Persona\n",
    "from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
    "from ragas.metrics import (\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    context_recall,\n",
    "    context_precision,\n",
    "    context_entity_recall,\n",
    "    answer_similarity,\n",
    "    answer_correctness,\n",
    "    NonLLMContextRecall,\n",
    "    NonLLMContextPrecisionWithReference\n",
    ")\n",
    "\n",
    "from src.llm_factory import create_chat_model\n",
    "from src.emb_factory import create_embedding_model\n",
    "from src.config import (\n",
    "    ELASTICSEARCH_LOCAL_URL,\n",
    "    ELASTICSEARCH_INDEX,\n",
    "    OLLAMA_MODEL_NAME,\n",
    "    OLLAMA_EMB_MODEL_NAME,\n",
    "    RAW_DIR,\n",
    "    INTERIM_DIR\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4426d6c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = create_chat_model(\n",
    "    provider=\"bedrock\",\n",
    "    model=\"global.anthropic.claude-opus-4-6-v1\",\n",
    "    temperature=0,\n",
    ")\n",
    "embeddings = create_embedding_model(\n",
    "    provider=\"ollama\",\n",
    "    model=OLLAMA_EMB_MODEL_NAME,\n",
    ")\n",
    "agent_llm = create_chat_model(\n",
    "    provider=\"ollama\",\n",
    "    model=OLLAMA_MODEL_NAME,\n",
    "    temperature=0,\n",
    "    validate_model_on_init=True,\n",
    ")\n",
    "vector_store = ElasticsearchStore(\n",
    "    es_url=ELASTICSEARCH_LOCAL_URL,\n",
    "    index_name=ELASTICSEARCH_INDEX,\n",
    "    embedding=embeddings,\n",
    "    query_field=\"text\",\n",
    "    vector_query_field=\"vector\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fe524d14",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 24 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
     ]
    }
   ],
   "source": [
    "docs: list[Document] = []\n",
    "for txt_file in sorted(RAW_DIR.glob(\"*.txt\")):\n",
    "    text = txt_file.read_text(encoding=\"utf-8\")\n",
    "    docs.append(Document(page_content=text, metadata={\"source\": txt_file.name}))\n",
    "\n",
    "print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab1932b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
    "\n",
    "generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
    "synthetic_dataset = generator.generate_with_chunks(\n",
    "    chunks=docs,\n",
    "    testset_size=100,\n",
    "    query_distribution=[(synth, 1.0)]\n",
    ")\n",
    "synthetic_dataset = synthetic_dataset.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d15cea12",
   "metadata": {},
   "outputs": [],
   "source": [
    "synthetic_dataset.to_csv(INTERIM_DIR / \"retrieve_eval_results/synthetic_dataset.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "18ceb119",
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = vector_store.as_retriever(\n",
    "        search_type=\"similarity\",\n",
    "        search_kwargs={\"k\": 3},\n",
    "    )\n",
    "\n",
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=agent_llm, retriever=retriever, return_source_documents=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "344a1266",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "questions = synthetic_dataset[\"user_input\"]\n",
    "ground_truths = synthetic_dataset[\"reference\"]\n",
    "\n",
    "answers = []\n",
    "contexts = []\n",
    "\n",
    "for query in questions:\n",
    "  answers.append(qa_chain.invoke(query)[\"result\"])\n",
    "  contexts.append([docs.page_content for docs in retriever.invoke(query)])\n",
    "\n",
    "# To dict\n",
    "data = {\n",
    "    \"question\": questions,\n",
    "    \"answer\": answers,\n",
    "    \"contexts\": contexts,\n",
    "    \"ground_truth\": ground_truths\n",
    "}\n",
    "\n",
    "# Convert dict to dataset\n",
    "dataset = Dataset.from_dict(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a9011f94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3239c7c9d6254330b9b079a249a74c60",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/700 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception in callback Task.__step()\n",
      "handle: <Handle Task.__step()>\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
      "    self._context.run(self._callback, *self._args)\n",
      "RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
      "Task was destroyed but it is pending!\n",
      "task: <Task pending name='Task-3487' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
      "/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/json_schema.py:335: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
      "  mapping[key] = getattr(self, method_name)\n",
      "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
      "Task was destroyed but it is pending!\n",
      "task: <Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n",
      "Exception in callback Task.__step()\n",
      "handle: <Handle Task.__step()>\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
      "    self._context.run(self._callback, *self._args)\n",
      "RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
      "Task was destroyed but it is pending!\n",
      "task: <Task pending name='Task-5636' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
      "/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/main.py:716: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
      "  return cls.__pydantic_validator__.validate_python(\n",
      "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
      "Task was destroyed but it is pending!\n",
      "task: <Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>retrieved_contexts</th>\n",
       "      <th>response</th>\n",
       "      <th>reference</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_precision</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>context_entity_recall</th>\n",
       "      <th>answer_similarity</th>\n",
       "      <th>answer_correctness</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
       "      <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
       "      <td>AVAP (Advanced Virtual Application Platform) i...</td>\n",
       "      <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
       "      <td>0.083333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.300000</td>\n",
       "      <td>0.833670</td>\n",
       "      <td>0.363590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>As a backend developer who is learning AVAP an...</td>\n",
       "      <td>[SECTION III: Control Logic and Decision Struc...</td>\n",
       "      <td>I can provide information on the if statement ...</td>\n",
       "      <td>In AVAP, control flow structures include condi...</td>\n",
       "      <td>0.904762</td>\n",
       "      <td>0.837564</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.157895</td>\n",
       "      <td>0.809311</td>\n",
       "      <td>0.531596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>hey so in AVAP when i do division by zero what...</td>\n",
       "      <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
       "      <td>In AVAP, when a division operation is attempte...</td>\n",
       "      <td>In AVAP, when you perform a division by zero, ...</td>\n",
       "      <td>0.312500</td>\n",
       "      <td>0.892255</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.923880</td>\n",
       "      <td>0.472905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>what happen if file not found when i do import...</td>\n",
       "      <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
       "      <td>In AVAP, the equivalent of a `FileNotFoundErro...</td>\n",
       "      <td>When an import statement is executed in AVAP, ...</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.912031</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.642044</td>\n",
       "      <td>0.248746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
       "      <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
       "      <td>I don't have information on the specific circu...</td>\n",
       "      <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.806848</td>\n",
       "      <td>0.201712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>How does the data model in AVAP™ compare to Py...</td>\n",
       "      <td>[Introduction The data model in AVAP™ defines ...</td>\n",
       "      <td>AVAP (Advanced Vector Analysis Platform) is a ...</td>\n",
       "      <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
       "      <td>0.289474</td>\n",
       "      <td>0.925546</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.736816</td>\n",
       "      <td>0.422299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>What data types are available in AVAP™?</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td>I don't know the specific details about the da...</td>\n",
       "      <td>In AVAP™, the most common data types include i...</td>\n",
       "      <td>0.818182</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.856402</td>\n",
       "      <td>0.658545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>AVAP strings Unicode</td>\n",
       "      <td>[Chapter 2: Notation in AVAP™ Introduction Not...</td>\n",
       "      <td>AVAP (African Vertebrate Anatomy Project) uses...</td>\n",
       "      <td>In AVAP™, strings (str) represent sequences of...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.918740</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.686281</td>\n",
       "      <td>0.271570</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>AVAP data model comparison with Python data ty...</td>\n",
       "      <td>[Introduction The data model in AVAP™ defines ...</td>\n",
       "      <td>Here's a comparison of the AVAP data model wit...</td>\n",
       "      <td>The data model in AVAP is similar to Python in...</td>\n",
       "      <td>0.343750</td>\n",
       "      <td>0.954994</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.824449</td>\n",
       "      <td>0.587930</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>AVAP™ data types and data structures overview</td>\n",
       "      <td>[Introduction The data model in AVAP™ defines ...</td>\n",
       "      <td>AVAP (Advanced Visual Analytics Platform) is a...</td>\n",
       "      <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.855719</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.100000</td>\n",
       "      <td>0.856107</td>\n",
       "      <td>0.323783</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           user_input  \\\n",
       "0   How does AVAP handel a ZeroDivisionError when ...   \n",
       "1   As a backend developer who is learning AVAP an...   \n",
       "2   hey so in AVAP when i do division by zero what...   \n",
       "3   what happen if file not found when i do import...   \n",
       "4   In AVAP, under what circumstances is a TypeErr...   \n",
       "..                                                ...   \n",
       "95  How does the data model in AVAP™ compare to Py...   \n",
       "96            What data types are available in AVAP™?   \n",
       "97                               AVAP strings Unicode   \n",
       "98  AVAP data model comparison with Python data ty...   \n",
       "99      AVAP™ data types and data structures overview   \n",
       "\n",
       "                                   retrieved_contexts  \\\n",
       "0   [Execution Model in AVAP 4.1. Structure of a P...   \n",
       "1   [SECTION III: Control Logic and Decision Struc...   \n",
       "2   [Execution Model in AVAP 4.1. Structure of a P...   \n",
       "3   [Execution Model in AVAP 4.1. Structure of a P...   \n",
       "4   [Execution Model in AVAP 4.1. Structure of a P...   \n",
       "..                                                ...   \n",
       "95  [Introduction The data model in AVAP™ defines ...   \n",
       "96  [Chapter 5: Data Types In this chapter, we wil...   \n",
       "97  [Chapter 2: Notation in AVAP™ Introduction Not...   \n",
       "98  [Introduction The data model in AVAP™ defines ...   \n",
       "99  [Introduction The data model in AVAP™ defines ...   \n",
       "\n",
       "                                             response  \\\n",
       "0   AVAP (Advanced Virtual Application Platform) i...   \n",
       "1   I can provide information on the if statement ...   \n",
       "2   In AVAP, when a division operation is attempte...   \n",
       "3   In AVAP, the equivalent of a `FileNotFoundErro...   \n",
       "4   I don't have information on the specific circu...   \n",
       "..                                                ...   \n",
       "95  AVAP (Advanced Vector Analysis Platform) is a ...   \n",
       "96  I don't know the specific details about the da...   \n",
       "97  AVAP (African Vertebrate Anatomy Project) uses...   \n",
       "98  Here's a comparison of the AVAP data model wit...   \n",
       "99  AVAP (Advanced Visual Analytics Platform) is a...   \n",
       "\n",
       "                                            reference  faithfulness  \\\n",
       "0   In AVAP, when a division by zero occurs—whethe...      0.083333   \n",
       "1   In AVAP, control flow structures include condi...      0.904762   \n",
       "2   In AVAP, when you perform a division by zero, ...      0.312500   \n",
       "3   When an import statement is executed in AVAP, ...      0.142857   \n",
       "4   In AVAP, a TypeError exception is raised in tw...      0.000000   \n",
       "..                                                ...           ...   \n",
       "95  The data model in AVAP™ is very similar to Pyt...      0.289474   \n",
       "96  In AVAP™, the most common data types include i...      0.818182   \n",
       "97  In AVAP™, strings (str) represent sequences of...      0.000000   \n",
       "98  The data model in AVAP is similar to Python in...      0.343750   \n",
       "99  AVAP™ uses a flexible and dynamic data model s...      0.000000   \n",
       "\n",
       "    answer_relevancy  context_precision  context_recall  \\\n",
       "0           0.000000           1.000000        1.000000   \n",
       "1           0.837564           1.000000        0.454545   \n",
       "2           0.892255           1.000000        1.000000   \n",
       "3           0.912031           1.000000        1.000000   \n",
       "4           0.000000           1.000000        1.000000   \n",
       "..               ...                ...             ...   \n",
       "95          0.925546           0.833333        0.800000   \n",
       "96          0.000000           0.500000        1.000000   \n",
       "97          0.918740           0.000000        0.000000   \n",
       "98          0.954994           0.833333        1.000000   \n",
       "99          0.855719           1.000000        1.000000   \n",
       "\n",
       "    context_entity_recall  answer_similarity  answer_correctness  \n",
       "0                0.300000           0.833670            0.363590  \n",
       "1                0.157895           0.809311            0.531596  \n",
       "2                0.200000           0.923880            0.472905  \n",
       "3                1.000000           0.642044            0.248746  \n",
       "4                1.000000           0.806848            0.201712  \n",
       "..                    ...                ...                 ...  \n",
       "95               0.500000           0.736816            0.422299  \n",
       "96               0.500000           0.856402            0.658545  \n",
       "97               0.500000           0.686281            0.271570  \n",
       "98               0.555556           0.824449            0.587930  \n",
       "99               0.100000           0.856107            0.323783  \n",
       "\n",
       "[100 rows x 11 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metrics = [\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    context_precision,\n",
    "    context_recall,\n",
    "    context_entity_recall,\n",
    "    answer_similarity,\n",
    "    answer_correctness\n",
    "]\n",
    "\n",
    "result = evaluate(\n",
    "    dataset=dataset, \n",
    "    metrics=metrics,\n",
    "    llm=llm,\n",
    "    embeddings=embeddings,\n",
    ")\n",
    "\n",
    "result_df = result.to_pandas()\n",
    "result_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "20c3fa64",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "faithfulness             0.254643\n",
       "answer_relevancy         0.609250\n",
       "context_precision        0.862500\n",
       "context_recall           0.906242\n",
       "context_entity_recall    0.354178\n",
       "answer_similarity        0.781973\n",
       "answer_correctness       0.359654\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result_df.mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "350755fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "result_df.to_csv(INTERIM_DIR + \"retrieve_eval_results/ragas_eval.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1ff60103",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>reference_contexts</th>\n",
       "      <th>reference</th>\n",
       "      <th>persona_name</th>\n",
       "      <th>query_style</th>\n",
       "      <th>query_length</th>\n",
       "      <th>synthesizer_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Hey, I'm trying to understand how AVAP handels...</td>\n",
       "      <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
       "      <td>In AVAP, a ZeroDivisionError is raised in two ...</td>\n",
       "      <td>Carlos Medina</td>\n",
       "      <td>MISSPELLED</td>\n",
       "      <td>LONG</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>How AVAP handle name resolution different from...</td>\n",
       "      <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
       "      <td>In AVAP, when a name is used in a code block, ...</td>\n",
       "      <td>Carlos Medina</td>\n",
       "      <td>POOR_GRAMMAR</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How does AVAP handle name resoltuion and scopi...</td>\n",
       "      <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
       "      <td>In AVAP, name resolution works differently fro...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>MISSPELLED</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AVAP how does import statement work and what a...</td>\n",
       "      <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
       "      <td>In AVAP, the import statement is the only way ...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>WEB_SEARCH_LIKE</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>what happen with StopIteration when generator ...</td>\n",
       "      <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
       "      <td>In generator functions, the return statement i...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>POOR_GRAMMAR</td>\n",
       "      <td>SHORT</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>Hey so I been learning AVAP and I wanna know, ...</td>\n",
       "      <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
       "      <td>In AVAP™, the data type that uses Unicode is t...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>POOR_GRAMMAR</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>Hey so I been trying to learn AVAP™ and I want...</td>\n",
       "      <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
       "      <td>In AVAP™, just like in Python, data types are ...</td>\n",
       "      <td>Carlos Medina</td>\n",
       "      <td>POOR_GRAMMAR</td>\n",
       "      <td>LONG</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>How are Unicde characters related to strings i...</td>\n",
       "      <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
       "      <td>In AVAP™, strings (str) represent sequences of...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>MISSPELLED</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>How does the data model in AVAP compare to Pyt...</td>\n",
       "      <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
       "      <td>Similar to Python, AVAP uses a flexible and dy...</td>\n",
       "      <td>Carlos Medina</td>\n",
       "      <td>PERFECT_GRAMMAR</td>\n",
       "      <td>SHORT</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>hey so i been learning AVAP™ and i wanna know ...</td>\n",
       "      <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
       "      <td>In AVAP™, the most common data types include: ...</td>\n",
       "      <td>Carlos Méndez</td>\n",
       "      <td>POOR_GRAMMAR</td>\n",
       "      <td>MEDIUM</td>\n",
       "      <td>single_hop_specific_query_synthesizer</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           user_input  \\\n",
       "0   Hey, I'm trying to understand how AVAP handels...   \n",
       "1   How AVAP handle name resolution different from...   \n",
       "2   How does AVAP handle name resoltuion and scopi...   \n",
       "3   AVAP how does import statement work and what a...   \n",
       "4   what happen with StopIteration when generator ...   \n",
       "..                                                ...   \n",
       "95  Hey so I been learning AVAP and I wanna know, ...   \n",
       "96  Hey so I been trying to learn AVAP™ and I want...   \n",
       "97  How are Unicde characters related to strings i...   \n",
       "98  How does the data model in AVAP compare to Pyt...   \n",
       "99  hey so i been learning AVAP™ and i wanna know ...   \n",
       "\n",
       "                                   reference_contexts  \\\n",
       "0   [Execution Model in AVAP\\n4.1. Structure of a ...   \n",
       "1   [Execution Model in AVAP\\n4.1. Structure of a ...   \n",
       "2   [Execution Model in AVAP\\n4.1. Structure of a ...   \n",
       "3   [Execution Model in AVAP\\n4.1. Structure of a ...   \n",
       "4   [Execution Model in AVAP\\n4.1. Structure of a ...   \n",
       "..                                                ...   \n",
       "95  [Introduction\\nThe data model in AVAP™ defines...   \n",
       "96  [Introduction\\nThe data model in AVAP™ defines...   \n",
       "97  [Introduction\\nThe data model in AVAP™ defines...   \n",
       "98  [Introduction\\nThe data model in AVAP™ defines...   \n",
       "99  [Introduction\\nThe data model in AVAP™ defines...   \n",
       "\n",
       "                                            reference   persona_name  \\\n",
       "0   In AVAP, a ZeroDivisionError is raised in two ...  Carlos Medina   \n",
       "1   In AVAP, when a name is used in a code block, ...  Carlos Medina   \n",
       "2   In AVAP, name resolution works differently fro...  Carlos Méndez   \n",
       "3   In AVAP, the import statement is the only way ...  Carlos Méndez   \n",
       "4   In generator functions, the return statement i...  Carlos Méndez   \n",
       "..                                                ...            ...   \n",
       "95  In AVAP™, the data type that uses Unicode is t...  Carlos Méndez   \n",
       "96  In AVAP™, just like in Python, data types are ...  Carlos Medina   \n",
       "97  In AVAP™, strings (str) represent sequences of...  Carlos Méndez   \n",
       "98  Similar to Python, AVAP uses a flexible and dy...  Carlos Medina   \n",
       "99  In AVAP™, the most common data types include: ...  Carlos Méndez   \n",
       "\n",
       "        query_style query_length                       synthesizer_name  \n",
       "0        MISSPELLED         LONG  single_hop_specific_query_synthesizer  \n",
       "1      POOR_GRAMMAR       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "2        MISSPELLED       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "3   WEB_SEARCH_LIKE       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "4      POOR_GRAMMAR        SHORT  single_hop_specific_query_synthesizer  \n",
       "..              ...          ...                                    ...  \n",
       "95     POOR_GRAMMAR       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "96     POOR_GRAMMAR         LONG  single_hop_specific_query_synthesizer  \n",
       "97       MISSPELLED       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "98  PERFECT_GRAMMAR        SHORT  single_hop_specific_query_synthesizer  \n",
       "99     POOR_GRAMMAR       MEDIUM  single_hop_specific_query_synthesizer  \n",
       "\n",
       "[100 rows x 7 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "synthetic_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "71743384",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from evidently import Dataset\n",
    "from evidently import DataDefinition\n",
    "from evidently.descriptors import *\n",
    "\n",
    "from evidently import Report\n",
    "from evidently.presets import TextEvals\n",
    "from evidently.metrics import *\n",
    "from evidently.tests import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1ac1a41",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from evidently.llm.utils.wrapper import OllamaOptions\n",
    "\n",
    "# The ollama provider is configured through OllamaOptions, whose `api_url` field is\n",
    "# required. Without it evidently raises a pydantic ValidationError before any LLM\n",
    "# call is made, so the endpoint is passed explicitly via `options`.\n",
    "# Defaults to a local Ollama server; override with the OLLAMA_API_URL env variable.\n",
    "OLLAMA_API_URL = os.getenv(\"OLLAMA_API_URL\", \"http://localhost:11434\")\n",
    "\n",
    "# NOTE(review): `reference_contexts` appears to hold lists of strings rather than\n",
    "# plain text - confirm the descriptor handles that, or join each list beforehand.\n",
    "context_based_evals = Dataset.from_pandas(\n",
    "    synthetic_dataset,\n",
    "    data_definition=DataDefinition(text_columns=[\"user_input\", \"reference_contexts\", \"reference\"]),\n",
    "    descriptors=[\n",
    "        ContextQualityLLMEval(\n",
    "            \"reference_contexts\",\n",
    "            question=\"user_input\",\n",
    "            provider=\"ollama\",\n",
    "            model=OLLAMA_MODEL_NAME,\n",
    "        )\n",
    "    ],\n",
    "    options=[OllamaOptions(api_url=OLLAMA_API_URL)],\n",
    ")\n",
    "context_based_evals.as_dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d127ad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}