assistance-engine/scratches/acano/evaluate_retrieve.ipynb

905 lines
56 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "8fed4518",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
" from ragas.metrics import (\n"
]
}
],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"# Put the repository root on sys.path so the `src` package can be imported\n",
"if \"__file__\" in globals():\n",
"    # Executed as a script: the root sits three levels above this file\n",
"    _project_root = str(Path(__file__).resolve().parents[2])\n",
"else:\n",
"    # Notebook kernel (no __file__): the root is the grandparent of the working directory\n",
"    _project_root = str(Path.cwd().parents[1])\n",
"\n",
"if _project_root not in sys.path:\n",
"    sys.path.insert(0, _project_root)\n",
"\n",
"from langchain_core.documents import Document\n",
"from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from ragas import evaluate, SingleTurnSample\n",
"from ragas.llms import LangchainLLMWrapper\n",
"from ragas.embeddings import LangchainEmbeddingsWrapper\n",
"from ragas.testset import TestsetGenerator\n",
"from ragas.testset.persona import Persona\n",
"from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
"from ragas.metrics import (\n",
" faithfulness,\n",
" answer_relevancy,\n",
" context_recall,\n",
" context_precision,\n",
" context_entity_recall,\n",
" answer_similarity,\n",
" answer_correctness,\n",
" NonLLMContextRecall,\n",
" NonLLMContextPrecisionWithReference\n",
")\n",
"\n",
"from src.llm_factory import create_chat_model\n",
"from src.emb_factory import create_embedding_model\n",
"from src.config import (\n",
" ELASTICSEARCH_LOCAL_URL,\n",
" ELASTICSEARCH_INDEX,\n",
" OLLAMA_MODEL_NAME,\n",
" OLLAMA_EMB_MODEL_NAME,\n",
" RAW_DIR,\n",
" INTERIM_DIR\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4426d6c0",
"metadata": {},
"outputs": [],
"source": [
"# Evaluator model: used for ragas test-set synthesis and metric scoring\n",
"llm = create_chat_model(provider=\"bedrock\", model=\"global.anthropic.claude-opus-4-6-v1\", temperature=0)\n",
"\n",
"# Embeddings shared by the vector store and the ragas evaluation\n",
"embeddings = create_embedding_model(provider=\"ollama\", model=OLLAMA_EMB_MODEL_NAME)\n",
"\n",
"# Model under test: generates the answers inside the RetrievalQA chain\n",
"agent_llm = create_chat_model(\n",
"    provider=\"ollama\", model=OLLAMA_MODEL_NAME, temperature=0, validate_model_on_init=True,\n",
")\n",
"\n",
"# Elasticsearch index that the retriever searches\n",
"vector_store = ElasticsearchStore(\n",
"    es_url=ELASTICSEARCH_LOCAL_URL, index_name=ELASTICSEARCH_INDEX, embedding=embeddings,\n",
"    query_field=\"text\", vector_query_field=\"vector\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fe524d14",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 24 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
]
}
],
"source": [
"# One Document per raw .txt file (sorted by path), tagged with its file name\n",
"docs: list[Document] = [\n",
"    Document(page_content=txt_file.read_text(encoding=\"utf-8\"), metadata={\"source\": txt_file.name})\n",
"    for txt_file in sorted(RAW_DIR.glob(\"*.txt\"))\n",
"]\n",
"\n",
"print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab1932b7",
"metadata": {},
"outputs": [],
"source": [
"# Single-hop question synthesizer and test-set generator, both driven by the evaluator LLM\n",
"synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
"generator = TestsetGenerator(\n",
"    llm=LangchainLLMWrapper(llm),\n",
"    embedding_model=LangchainEmbeddingsWrapper(embeddings),\n",
")\n",
"\n",
"# Request 100 samples, all drawn from the single synthesizer (weight 1.0)\n",
"testset = generator.generate_with_chunks(chunks=docs, testset_size=100, query_distribution=[(synth, 1.0)])\n",
"\n",
"# Downstream cells work on the pandas view of the generated test set\n",
"synthetic_dataset = testset.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d15cea12",
"metadata": {},
"outputs": [],
"source": [
"# Create the output folder first: DataFrame.to_csv does not create missing parent directories\n",
"retrieve_eval_dir = INTERIM_DIR / \"retrieve_eval_results\"\n",
"retrieve_eval_dir.mkdir(parents=True, exist_ok=True)\n",
"synthetic_dataset.to_csv(retrieve_eval_dir / \"synthetic_dataset.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "18ceb119",
"metadata": {},
"outputs": [],
"source": [
"# Similarity search over the vector store, keeping the 3 best hits per query\n",
"retriever = vector_store.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 3})\n",
"\n",
"# QA chain: agent_llm answers from the retrieved documents and also returns them\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
"    llm=agent_llm,\n",
"    retriever=retriever,\n",
"    return_source_documents=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "344a1266",
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset\n",
"\n",
"# Plain Python lists, so Dataset.from_dict does not depend on the pandas index\n",
"questions = synthetic_dataset[\"user_input\"].tolist()\n",
"ground_truths = synthetic_dataset[\"reference\"].tolist()\n",
"\n",
"answers = []\n",
"contexts = []\n",
"\n",
"for query in questions:\n",
"    # A single chain call yields the answer and the documents it was generated from\n",
"    # (the chain is built with return_source_documents=True). Reading the contexts\n",
"    # from the same response keeps them in sync with the answer and avoids querying\n",
"    # the retriever a second time for every question.\n",
"    response = qa_chain.invoke(query)\n",
"    answers.append(response[\"result\"])\n",
"    contexts.append([doc.page_content for doc in response[\"source_documents\"]])\n",
"\n",
"# Column names expected by the ragas evaluation cell\n",
"data = {\n",
"    \"question\": questions,\n",
"    \"answer\": answers,\n",
"    \"contexts\": contexts,\n",
"    \"ground_truth\": ground_truths,\n",
"}\n",
"\n",
"# Convert dict to dataset\n",
"dataset = Dataset.from_dict(data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a9011f94",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3239c7c9d6254330b9b079a249a74c60",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/700 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception in callback Task.__step()\n",
"handle: <Handle Task.__step()>\n",
"Traceback (most recent call last):\n",
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
" self._context.run(self._callback, *self._args)\n",
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-3487' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/json_schema.py:335: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
" mapping[key] = getattr(self, method_name)\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n",
"Exception in callback Task.__step()\n",
"handle: <Handle Task.__step()>\n",
"Traceback (most recent call last):\n",
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
" self._context.run(self._callback, *self._args)\n",
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-5636' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/main.py:716: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
" return cls.__pydantic_validator__.validate_python(\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>retrieved_contexts</th>\n",
" <th>response</th>\n",
" <th>reference</th>\n",
" <th>faithfulness</th>\n",
" <th>answer_relevancy</th>\n",
" <th>context_precision</th>\n",
" <th>context_recall</th>\n",
" <th>context_entity_recall</th>\n",
" <th>answer_similarity</th>\n",
" <th>answer_correctness</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>AVAP (Advanced Virtual Application Platform) i...</td>\n",
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
" <td>0.083333</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.300000</td>\n",
" <td>0.833670</td>\n",
" <td>0.363590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>As a backend developer who is learning AVAP an...</td>\n",
" <td>[SECTION III: Control Logic and Decision Struc...</td>\n",
" <td>I can provide information on the if statement ...</td>\n",
" <td>In AVAP, control flow structures include condi...</td>\n",
" <td>0.904762</td>\n",
" <td>0.837564</td>\n",
" <td>1.000000</td>\n",
" <td>0.454545</td>\n",
" <td>0.157895</td>\n",
" <td>0.809311</td>\n",
" <td>0.531596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>hey so in AVAP when i do division by zero what...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>In AVAP, when a division operation is attempte...</td>\n",
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
" <td>0.312500</td>\n",
" <td>0.892255</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.200000</td>\n",
" <td>0.923880</td>\n",
" <td>0.472905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>what happen if file not found when i do import...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>In AVAP, the equivalent of a `FileNotFoundErro...</td>\n",
" <td>When an import statement is executed in AVAP, ...</td>\n",
" <td>0.142857</td>\n",
" <td>0.912031</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.642044</td>\n",
" <td>0.248746</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>I don't have information on the specific circu...</td>\n",
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.806848</td>\n",
" <td>0.201712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>AVAP (Advanced Vector Analysis Platform) is a ...</td>\n",
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
" <td>0.289474</td>\n",
" <td>0.925546</td>\n",
" <td>0.833333</td>\n",
" <td>0.800000</td>\n",
" <td>0.500000</td>\n",
" <td>0.736816</td>\n",
" <td>0.422299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>What data types are available in AVAP™?</td>\n",
" <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
" <td>I don't know the specific details about the da...</td>\n",
" <td>In AVAP™, the most common data types include i...</td>\n",
" <td>0.818182</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>1.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.856402</td>\n",
" <td>0.658545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>AVAP strings Unicode</td>\n",
" <td>[Chapter 2: Notation in AVAP™ Introduction Not...</td>\n",
" <td>AVAP (African Vertebrate Anatomy Project) uses...</td>\n",
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
" <td>0.000000</td>\n",
" <td>0.918740</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.686281</td>\n",
" <td>0.271570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>AVAP data model comparison with Python data ty...</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>Here's a comparison of the AVAP data model wit...</td>\n",
" <td>The data model in AVAP is similar to Python in...</td>\n",
" <td>0.343750</td>\n",
" <td>0.954994</td>\n",
" <td>0.833333</td>\n",
" <td>1.000000</td>\n",
" <td>0.555556</td>\n",
" <td>0.824449</td>\n",
" <td>0.587930</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>AVAP™ data types and data structures overview</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>AVAP (Advanced Visual Analytics Platform) is a...</td>\n",
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
" <td>0.000000</td>\n",
" <td>0.855719</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.100000</td>\n",
" <td>0.856107</td>\n",
" <td>0.323783</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 How does AVAP handel a ZeroDivisionError when ... \n",
"1 As a backend developer who is learning AVAP an... \n",
"2 hey so in AVAP when i do division by zero what... \n",
"3 what happen if file not found when i do import... \n",
"4 In AVAP, under what circumstances is a TypeErr... \n",
".. ... \n",
"95 How does the data model in AVAP™ compare to Py... \n",
"96 What data types are available in AVAP™? \n",
"97 AVAP strings Unicode \n",
"98 AVAP data model comparison with Python data ty... \n",
"99 AVAP™ data types and data structures overview \n",
"\n",
" retrieved_contexts \\\n",
"0 [Execution Model in AVAP 4.1. Structure of a P... \n",
"1 [SECTION III: Control Logic and Decision Struc... \n",
"2 [Execution Model in AVAP 4.1. Structure of a P... \n",
"3 [Execution Model in AVAP 4.1. Structure of a P... \n",
"4 [Execution Model in AVAP 4.1. Structure of a P... \n",
".. ... \n",
"95 [Introduction The data model in AVAP™ defines ... \n",
"96 [Chapter 5: Data Types In this chapter, we wil... \n",
"97 [Chapter 2: Notation in AVAP™ Introduction Not... \n",
"98 [Introduction The data model in AVAP™ defines ... \n",
"99 [Introduction The data model in AVAP™ defines ... \n",
"\n",
" response \\\n",
"0 AVAP (Advanced Virtual Application Platform) i... \n",
"1 I can provide information on the if statement ... \n",
"2 In AVAP, when a division operation is attempte... \n",
"3 In AVAP, the equivalent of a `FileNotFoundErro... \n",
"4 I don't have information on the specific circu... \n",
".. ... \n",
"95 AVAP (Advanced Vector Analysis Platform) is a ... \n",
"96 I don't know the specific details about the da... \n",
"97 AVAP (African Vertebrate Anatomy Project) uses... \n",
"98 Here's a comparison of the AVAP data model wit... \n",
"99 AVAP (Advanced Visual Analytics Platform) is a... \n",
"\n",
" reference faithfulness \\\n",
"0 In AVAP, when a division by zero occurs—whethe... 0.083333 \n",
"1 In AVAP, control flow structures include condi... 0.904762 \n",
"2 In AVAP, when you perform a division by zero, ... 0.312500 \n",
"3 When an import statement is executed in AVAP, ... 0.142857 \n",
"4 In AVAP, a TypeError exception is raised in tw... 0.000000 \n",
".. ... ... \n",
"95 The data model in AVAP™ is very similar to Pyt... 0.289474 \n",
"96 In AVAP™, the most common data types include i... 0.818182 \n",
"97 In AVAP™, strings (str) represent sequences of... 0.000000 \n",
"98 The data model in AVAP is similar to Python in... 0.343750 \n",
"99 AVAP™ uses a flexible and dynamic data model s... 0.000000 \n",
"\n",
" answer_relevancy context_precision context_recall \\\n",
"0 0.000000 1.000000 1.000000 \n",
"1 0.837564 1.000000 0.454545 \n",
"2 0.892255 1.000000 1.000000 \n",
"3 0.912031 1.000000 1.000000 \n",
"4 0.000000 1.000000 1.000000 \n",
".. ... ... ... \n",
"95 0.925546 0.833333 0.800000 \n",
"96 0.000000 0.500000 1.000000 \n",
"97 0.918740 0.000000 0.000000 \n",
"98 0.954994 0.833333 1.000000 \n",
"99 0.855719 1.000000 1.000000 \n",
"\n",
" context_entity_recall answer_similarity answer_correctness \n",
"0 0.300000 0.833670 0.363590 \n",
"1 0.157895 0.809311 0.531596 \n",
"2 0.200000 0.923880 0.472905 \n",
"3 1.000000 0.642044 0.248746 \n",
"4 1.000000 0.806848 0.201712 \n",
".. ... ... ... \n",
"95 0.500000 0.736816 0.422299 \n",
"96 0.500000 0.856402 0.658545 \n",
"97 0.500000 0.686281 0.271570 \n",
"98 0.555556 0.824449 0.587930 \n",
"99 0.100000 0.856107 0.323783 \n",
"\n",
"[100 rows x 11 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Metrics to score, kept in the original order\n",
"metrics = [\n",
"    faithfulness, answer_relevancy,\n",
"    context_precision, context_recall, context_entity_recall,\n",
"    answer_similarity, answer_correctness,\n",
"]\n",
"\n",
"# Score `dataset` using the evaluator LLM and the shared embeddings\n",
"result = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embeddings)\n",
"\n",
"# Per-sample scores as a DataFrame; the bare name below renders it as the cell output\n",
"result_df = result.to_pandas()\n",
"result_df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "20c3fa64",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"faithfulness 0.254643\n",
"answer_relevancy 0.609250\n",
"context_precision 0.862500\n",
"context_recall 0.906242\n",
"context_entity_recall 0.354178\n",
"answer_similarity 0.781973\n",
"answer_correctness 0.359654\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column-wise mean of every numeric score column\n",
"result_df.mean(axis=0, numeric_only=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "350755fd",
"metadata": {},
"outputs": [],
"source": [
"# INTERIM_DIR is a pathlib.Path (it is joined with `/` when saving the synthetic dataset),\n",
"# so `INTERIM_DIR + \"...\"` raises TypeError; build the path with `/` instead.\n",
"retrieve_eval_dir = INTERIM_DIR / \"retrieve_eval_results\"\n",
"# to_csv does not create missing parent directories\n",
"retrieve_eval_dir.mkdir(parents=True, exist_ok=True)\n",
"result_df.to_csv(retrieve_eval_dir / \"ragas_eval.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1ff60103",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>reference_contexts</th>\n",
" <th>reference</th>\n",
" <th>persona_name</th>\n",
" <th>query_style</th>\n",
" <th>query_length</th>\n",
" <th>synthesizer_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hey, I'm trying to understand how AVAP handels...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, a ZeroDivisionError is raised in two ...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>MISSPELLED</td>\n",
" <td>LONG</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>How AVAP handle name resolution different from...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, when a name is used in a code block, ...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How does AVAP handle name resoltuion and scopi...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, name resolution works differently fro...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>MISSPELLED</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AVAP how does import statement work and what a...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, the import statement is the only way ...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>WEB_SEARCH_LIKE</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>what happen with StopIteration when generator ...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In generator functions, the return statement i...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>Hey so I been learning AVAP and I wanna know, ...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, the data type that uses Unicode is t...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>Hey so I been trying to learn AVAP™ and I want...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, just like in Python, data types are ...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>LONG</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>How are Unicde characters related to strings i...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>MISSPELLED</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>How does the data model in AVAP compare to Pyt...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>Similar to Python, AVAP uses a flexible and dy...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>PERFECT_GRAMMAR</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>hey so i been learning AVAP™ and i wanna know ...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, the most common data types include: ...</td>\n",
" <td>Carlos Méndez</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 Hey, I'm trying to understand how AVAP handels... \n",
"1 How AVAP handle name resolution different from... \n",
"2 How does AVAP handle name resoltuion and scopi... \n",
"3 AVAP how does import statement work and what a... \n",
"4 what happen with StopIteration when generator ... \n",
".. ... \n",
"95 Hey so I been learning AVAP and I wanna know, ... \n",
"96 Hey so I been trying to learn AVAP™ and I want... \n",
"97 How are Unicde characters related to strings i... \n",
"98 How does the data model in AVAP compare to Pyt... \n",
"99 hey so i been learning AVAP™ and i wanna know ... \n",
"\n",
" reference_contexts \\\n",
"0 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"1 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"2 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"3 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"4 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
".. ... \n",
"95 [Introduction\\nThe data model in AVAP™ defines... \n",
"96 [Introduction\\nThe data model in AVAP™ defines... \n",
"97 [Introduction\\nThe data model in AVAP™ defines... \n",
"98 [Introduction\\nThe data model in AVAP™ defines... \n",
"99 [Introduction\\nThe data model in AVAP™ defines... \n",
"\n",
" reference persona_name \\\n",
"0 In AVAP, a ZeroDivisionError is raised in two ... Carlos Medina \n",
"1 In AVAP, when a name is used in a code block, ... Carlos Medina \n",
"2 In AVAP, name resolution works differently fro... Carlos Méndez \n",
"3 In AVAP, the import statement is the only way ... Carlos Méndez \n",
"4 In generator functions, the return statement i... Carlos Méndez \n",
".. ... ... \n",
"95 In AVAP™, the data type that uses Unicode is t... Carlos Méndez \n",
"96 In AVAP™, just like in Python, data types are ... Carlos Medina \n",
"97 In AVAP™, strings (str) represent sequences of... Carlos Méndez \n",
"98 Similar to Python, AVAP uses a flexible and dy... Carlos Medina \n",
"99 In AVAP™, the most common data types include: ... Carlos Méndez \n",
"\n",
" query_style query_length synthesizer_name \n",
"0 MISSPELLED LONG single_hop_specific_query_synthesizer \n",
"1 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
"2 MISSPELLED MEDIUM single_hop_specific_query_synthesizer \n",
"3 WEB_SEARCH_LIKE MEDIUM single_hop_specific_query_synthesizer \n",
"4 POOR_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
".. ... ... ... \n",
"95 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
"96 POOR_GRAMMAR LONG single_hop_specific_query_synthesizer \n",
"97 MISSPELLED MEDIUM single_hop_specific_query_synthesizer \n",
"98 PERFECT_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
"99 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
"\n",
"[100 rows x 7 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"synthetic_dataset"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "71743384",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from evidently import Dataset\n",
"from evidently import DataDefinition\n",
"from evidently.descriptors import *\n",
"\n",
"from evidently import Report\n",
"from evidently.presets import TextEvals\n",
"from evidently.metrics import *\n",
"from evidently.tests import *"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "e1ac1a41",
"metadata": {},
"outputs": [
{
"ename": "ValidationError",
"evalue": "1 validation error for OllamaOptions\napi_url\n field required (type=value_error.missing)",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValidationError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[24]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m context_based_evals = \u001b[43mDataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pandas\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[43m \u001b[49m\u001b[43msynthetic_dataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_definition\u001b[49m\u001b[43m=\u001b[49m\u001b[43mDataDefinition\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser_input\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreference_contexts\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreference\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[43mdescriptors\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mContextQualityLLMEval\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mreference_contexts\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser_input\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprovider\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mollama\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43mOLLAMA_MODEL_NAME\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 5\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 6\u001b[39m context_based_evals.as_dataframe()\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/core/datasets.py:1271\u001b[39m, in \u001b[36mDataset.from_pandas\u001b[39m\u001b[34m(cls, data, data_definition, descriptors, options, metadata, tags)\u001b[39m\n\u001b[32m 1269\u001b[39m dataset = PandasDataset(data, data_definition, metadata=metadata, tags=tags)\n\u001b[32m 1270\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m descriptors \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1271\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43madd_descriptors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdescriptors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1272\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m dataset\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/core/datasets.py:1382\u001b[39m, in \u001b[36mDataset.add_descriptors\u001b[39m\u001b[34m(self, descriptors, options)\u001b[39m\n\u001b[32m 1375\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Add multiple descriptors to the dataset.\u001b[39;00m\n\u001b[32m 1376\u001b[39m \n\u001b[32m 1377\u001b[39m \u001b[33;03mArgs:\u001b[39;00m\n\u001b[32m 1378\u001b[39m \u001b[33;03m* `descriptors`: List of `Descriptor` objects to compute\u001b[39;00m\n\u001b[32m 1379\u001b[39m \u001b[33;03m* `options`: Optional options for descriptor computation\u001b[39;00m\n\u001b[32m 1380\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 1381\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m descriptor \u001b[38;5;129;01min\u001b[39;00m descriptors:\n\u001b[32m-> \u001b[39m\u001b[32m1382\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43madd_descriptor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdescriptor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/core/datasets.py:1688\u001b[39m, in \u001b[36mPandasDataset.add_descriptor\u001b[39m\u001b[34m(self, descriptor, options)\u001b[39m\n\u001b[32m 1686\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34madd_descriptor\u001b[39m(\u001b[38;5;28mself\u001b[39m, descriptor: Descriptor, options: AnyOptions = \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m 1687\u001b[39m descriptor.validate_input(\u001b[38;5;28mself\u001b[39m._data_definition)\n\u001b[32m-> \u001b[39m\u001b[32m1688\u001b[39m new_columns = \u001b[43mdescriptor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgenerate_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mOptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_any_options\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1689\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(new_columns, DatasetColumn):\n\u001b[32m 1690\u001b[39m new_columns = {descriptor.alias: new_columns}\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/core/datasets.py:1099\u001b[39m, in \u001b[36mFeatureDescriptor.generate_data\u001b[39m\u001b[34m(self, dataset, options)\u001b[39m\n\u001b[32m 1096\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mgenerate_data\u001b[39m(\n\u001b[32m 1097\u001b[39m \u001b[38;5;28mself\u001b[39m, dataset: \u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m, options: Options\n\u001b[32m 1098\u001b[39m ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:\n\u001b[32m-> \u001b[39m\u001b[32m1099\u001b[39m feature = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m.\u001b[49m\u001b[43mgenerate_features_renamed\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1100\u001b[39m \u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mas_dataframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1101\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_data_definition\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mas_dataframe\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mColumnMapping\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1102\u001b[39m \u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1103\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1104\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m 1105\u001b[39m col.display_name: \u001b[38;5;28mself\u001b[39m.get_dataset_column(col.name, feature[col.name])\n\u001b[32m 1106\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m.feature.list_columns()\n\u001b[32m 1107\u001b[39m }\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/legacy/features/generated_features.py:56\u001b[39m, in \u001b[36mGeneratedFeatures.generate_features_renamed\u001b[39m\u001b[34m(self, data, data_definition, options)\u001b[39m\n\u001b[32m 53\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mgenerate_features_renamed\u001b[39m(\n\u001b[32m 54\u001b[39m \u001b[38;5;28mself\u001b[39m, data: pd.DataFrame, data_definition: DataDefinition, options: Options\n\u001b[32m 55\u001b[39m ) -> pd.DataFrame:\n\u001b[32m---> \u001b[39m\u001b[32m56\u001b[39m features = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgenerate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_definition\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 57\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m features.rename(columns={col: \u001b[38;5;28mself\u001b[39m._create_column_name(col) \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m features.columns}).set_index(\n\u001b[32m 58\u001b[39m data.index\n\u001b[32m 59\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/legacy/features/llm_judge.py:54\u001b[39m, in \u001b[36mLLMJudge.generate_features\u001b[39m\u001b[34m(self, data, data_definition, options)\u001b[39m\n\u001b[32m 53\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mgenerate_features\u001b[39m(\u001b[38;5;28mself\u001b[39m, data: pd.DataFrame, data_definition: DataDefinition, options: Options) -> pd.DataFrame:\n\u001b[32m---> \u001b[39m\u001b[32m54\u001b[39m result: Union[List, Dict] = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_llm_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m.run_batch_sync(\n\u001b[32m 55\u001b[39m requests=\u001b[38;5;28mself\u001b[39m.template.iterate_messages(data, \u001b[38;5;28mself\u001b[39m.get_input_columns())\n\u001b[32m 56\u001b[39m )\n\u001b[32m 57\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, \u001b[38;5;28mlist\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(o, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m o \u001b[38;5;129;01min\u001b[39;00m result):\n\u001b[32m 58\u001b[39m result = {\u001b[38;5;28mself\u001b[39m.display_name \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m.template.get_main_output_column(): result}\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/legacy/features/llm_judge.py:43\u001b[39m, in \u001b[36mLLMJudge.get_llm_wrapper\u001b[39m\u001b[34m(self, options)\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_llm_wrapper\u001b[39m(\u001b[38;5;28mself\u001b[39m, options: Options) -> LLMWrapper:\n\u001b[32m 42\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._llm_wrapper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m43\u001b[39m \u001b[38;5;28mself\u001b[39m._llm_wrapper = \u001b[43mget_llm_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mprovider\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._llm_wrapper\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/llm/utils/wrapper.py:437\u001b[39m, in \u001b[36mget_llm_wrapper\u001b[39m\u001b[34m(provider, model, options)\u001b[39m\n\u001b[32m 435\u001b[39m key = (provider, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 436\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m _wrappers:\n\u001b[32m--> \u001b[39m\u001b[32m437\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_wrappers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 438\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m find_spec(\u001b[33m\"\u001b[39m\u001b[33mlitellm\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 439\u001b[39m litellm_wrapper = get_litellm_wrapper(provider, model, options)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/llm/utils/wrapper.py:583\u001b[39m, in \u001b[36mLiteLLMWrapper.__init__\u001b[39m\u001b[34m(self, model, options)\u001b[39m\n\u001b[32m 581\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, model: \u001b[38;5;28mstr\u001b[39m, options: Options):\n\u001b[32m 582\u001b[39m \u001b[38;5;28mself\u001b[39m.model = model\n\u001b[32m--> \u001b[39m\u001b[32m583\u001b[39m \u001b[38;5;28mself\u001b[39m.options: LLMOptions = \u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__llm_options_type__\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/legacy/options/base.py:51\u001b[39m, in \u001b[36mOptions.get\u001b[39m\u001b[34m(self, option_type)\u001b[39m\n\u001b[32m 49\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(possible_subclass, option_type):\n\u001b[32m 50\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.custom[possible_subclass] \u001b[38;5;66;03m# type: ignore[return-value]\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43moption_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/llm/utils/wrapper.py:472\u001b[39m, in \u001b[36mLLMOptions.__init__\u001b[39m\u001b[34m(self, api_key, rpm_limit, **data)\u001b[39m\n\u001b[32m 465\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Initialize LLM options.\u001b[39;00m\n\u001b[32m 466\u001b[39m \n\u001b[32m 467\u001b[39m \u001b[33;03mArgs:\u001b[39;00m\n\u001b[32m 468\u001b[39m \u001b[33;03m* `api_key`: Optional API key for the provider.\u001b[39;00m\n\u001b[32m 469\u001b[39m \u001b[33;03m* `rpm_limit`: Optional requests per minute limit (backward compatibility).\u001b[39;00m\n\u001b[32m 470\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 471\u001b[39m \u001b[38;5;28mself\u001b[39m.api_key = SecretStr(api_key) \u001b[38;5;28;01mif\u001b[39;00m api_key \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m472\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 473\u001b[39m \u001b[38;5;66;03m# backward comp\u001b[39;00m\n\u001b[32m 474\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m rpm_limit \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/evidently/pydantic_utils.py:89\u001b[39m, in \u001b[36mFrozenBaseModel.__init__\u001b[39m\u001b[34m(self, **data)\u001b[39m\n\u001b[32m 88\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, **data: Any):\n\u001b[32m---> \u001b[39m\u001b[32m89\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m__init_values__\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 90\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m private_attr \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__private_attributes__:\n\u001b[32m 91\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m private_attr \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.__init_values__:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/v1/main.py:347\u001b[39m, in \u001b[36mBaseModel.__init__\u001b[39m\u001b[34m(__pydantic_self__, **data)\u001b[39m\n\u001b[32m 345\u001b[39m values, fields_set, validation_error = validate_model(__pydantic_self__.\u001b[34m__class__\u001b[39m, data)\n\u001b[32m 346\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m validation_error:\n\u001b[32m--> \u001b[39m\u001b[32m347\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m validation_error\n\u001b[32m 348\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 349\u001b[39m object_setattr(__pydantic_self__, \u001b[33m'\u001b[39m\u001b[33m__dict__\u001b[39m\u001b[33m'\u001b[39m, values)\n",
"\u001b[31mValidationError\u001b[39m: 1 validation error for OllamaOptions\napi_url\n field required (type=value_error.missing)"
]
}
],
"source": [
"import os\n",
"\n",
"from evidently.llm.utils.wrapper import OllamaOptions\n",
"\n",
"# Score each row's reference contexts against its question with an LLM judge served by Ollama.\n",
"#\n",
"# The \"ollama\" provider reads its settings from OllamaOptions, whose `api_url` field is\n",
"# required. When no options are passed, Evidently instantiates OllamaOptions() with no\n",
"# arguments and fails with `ValidationError: api_url field required`, so the endpoint\n",
"# is supplied explicitly here.\n",
"# The fallback below is Ollama's default local address; set OLLAMA_API_BASE to override it.\n",
"ollama_options = OllamaOptions(\n",
"    api_url=os.environ.get(\"OLLAMA_API_BASE\", \"http://localhost:11434\")\n",
")\n",
"\n",
"context_based_evals = Dataset.from_pandas(\n",
"    synthetic_dataset,\n",
"    data_definition=DataDefinition(text_columns=[\"user_input\", \"reference_contexts\", \"reference\"]),\n",
"    descriptors=[\n",
"        ContextQualityLLMEval(\n",
"            \"reference_contexts\",\n",
"            question=\"user_input\",\n",
"            provider=\"ollama\",\n",
"            model=OLLAMA_MODEL_NAME,\n",
"        )\n",
"    ],\n",
"    options=ollama_options,\n",
")\n",
"context_based_evals.as_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2d127ad",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}