905 lines
56 KiB
Plaintext
905 lines
56 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "8fed4518",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_782131/4243561678.py:18: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
|
||
" from ragas.metrics import (\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import sys\n",
|
||
"from pathlib import Path\n",
|
||
"\n",
|
||
"# Ensure the project root is on the path so `src` is importable\n",
|
||
"_project_root = str(Path(__file__).resolve().parents[2]) if \"__file__\" in dir() else str(Path.cwd().parents[1])\n",
|
||
"if _project_root not in sys.path:\n",
|
||
" sys.path.insert(0, _project_root)\n",
|
||
"\n",
|
||
"from langchain_core.documents import Document\n",
|
||
"from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
|
||
"from langchain_elasticsearch import ElasticsearchStore\n",
|
||
"from ragas import evaluate, SingleTurnSample\n",
|
||
"from ragas.llms import LangchainLLMWrapper\n",
|
||
"from ragas.embeddings import LangchainEmbeddingsWrapper\n",
|
||
"from ragas.testset import TestsetGenerator\n",
|
||
"from ragas.testset.persona import Persona\n",
|
||
"from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
|
||
"from ragas.metrics import (\n",
|
||
" faithfulness,\n",
|
||
" answer_relevancy,\n",
|
||
" context_recall,\n",
|
||
" context_precision,\n",
|
||
" context_entity_recall,\n",
|
||
" answer_similarity,\n",
|
||
" answer_correctness,\n",
|
||
" NonLLMContextRecall,\n",
|
||
" NonLLMContextPrecisionWithReference\n",
|
||
")\n",
|
||
"\n",
|
||
"from src.llm_factory import create_chat_model\n",
|
||
"from src.emb_factory import create_embedding_model\n",
|
||
"from src.config import (\n",
|
||
" ELASTICSEARCH_LOCAL_URL,\n",
|
||
" ELASTICSEARCH_INDEX,\n",
|
||
" OLLAMA_MODEL_NAME,\n",
|
||
" OLLAMA_EMB_MODEL_NAME,\n",
|
||
" RAW_DIR,\n",
|
||
" INTERIM_DIR\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "4426d6c0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"llm = create_chat_model(\n",
|
||
" provider=\"bedrock\",\n",
|
||
" model=\"global.anthropic.claude-opus-4-6-v1\",\n",
|
||
" temperature=0,\n",
|
||
")\n",
|
||
"embeddings = create_embedding_model(\n",
|
||
" provider=\"ollama\",\n",
|
||
" model=OLLAMA_EMB_MODEL_NAME,\n",
|
||
")\n",
|
||
"agent_llm = create_chat_model(\n",
|
||
" provider=\"ollama\",\n",
|
||
" model=OLLAMA_MODEL_NAME,\n",
|
||
" temperature=0,\n",
|
||
" validate_model_on_init=True,\n",
|
||
")\n",
|
||
"vector_store = ElasticsearchStore(\n",
|
||
" es_url=ELASTICSEARCH_LOCAL_URL,\n",
|
||
" index_name=ELASTICSEARCH_INDEX,\n",
|
||
" embedding=embeddings,\n",
|
||
" query_field=\"text\",\n",
|
||
" vector_query_field=\"vector\",\n",
|
||
")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "fe524d14",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Loaded 24 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"docs: list[Document] = []\n",
|
||
"for txt_file in sorted(RAW_DIR.glob(\"*.txt\")):\n",
|
||
" text = txt_file.read_text(encoding=\"utf-8\")\n",
|
||
" docs.append(Document(page_content=text, metadata={\"source\": txt_file.name}))\n",
|
||
"\n",
|
||
"print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ab1932b7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
|
||
"\n",
|
||
"generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
|
||
"synthetic_dataset = generator.generate_with_chunks(\n",
|
||
" chunks=docs,\n",
|
||
" testset_size=100,\n",
|
||
" query_distribution=[(synth, 1.0)]\n",
|
||
")\n",
|
||
"synthetic_dataset = synthetic_dataset.to_pandas()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "d15cea12",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"synthetic_dataset.to_csv(INTERIM_DIR / \"retrieve_eval_results/synthetic_dataset.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "18ceb119",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"retriever = vector_store.as_retriever(\n",
|
||
" search_type=\"similarity\",\n",
|
||
" search_kwargs={\"k\": 3},\n",
|
||
" )\n",
|
||
"\n",
|
||
"qa_chain = RetrievalQA.from_chain_type(\n",
|
||
" llm=agent_llm, retriever=retriever, return_source_documents=True\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "344a1266",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datasets import Dataset\n",
|
||
"questions = synthetic_dataset[\"user_input\"]\n",
|
||
"ground_truths = synthetic_dataset[\"reference\"]\n",
|
||
"\n",
|
||
"answers = []\n",
|
||
"contexts = []\n",
|
||
"\n",
|
||
"for query in questions:\n",
|
||
" answers.append(qa_chain.invoke(query)[\"result\"])\n",
|
||
" contexts.append([docs.page_content for docs in retriever.invoke(query)])\n",
|
||
"\n",
|
||
"# To dict\n",
|
||
"data = {\n",
|
||
" \"question\": questions,\n",
|
||
" \"answer\": answers,\n",
|
||
" \"contexts\": contexts,\n",
|
||
" \"ground_truth\": ground_truths\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Convert dict to dataset\n",
|
||
"dataset = Dataset.from_dict(data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "a9011f94",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "3239c7c9d6254330b9b079a249a74c60",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Evaluating: 0%| | 0/700 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Exception in callback Task.__step()\n",
|
||
"handle: <Handle Task.__step()>\n",
|
||
"Traceback (most recent call last):\n",
|
||
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
|
||
" self._context.run(self._callback, *self._args)\n",
|
||
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-3487' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
|
||
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/json_schema.py:335: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
|
||
" mapping[key] = getattr(self, method_name)\n",
|
||
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n",
|
||
"Exception in callback Task.__step()\n",
|
||
"handle: <Handle Task.__step()>\n",
|
||
"Traceback (most recent call last):\n",
|
||
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
|
||
" self._context.run(self._callback, *self._args)\n",
|
||
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-5636' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
|
||
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/main.py:716: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
|
||
" return cls.__pydantic_validator__.validate_python(\n",
|
||
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>user_input</th>\n",
|
||
" <th>retrieved_contexts</th>\n",
|
||
" <th>response</th>\n",
|
||
" <th>reference</th>\n",
|
||
" <th>faithfulness</th>\n",
|
||
" <th>answer_relevancy</th>\n",
|
||
" <th>context_precision</th>\n",
|
||
" <th>context_recall</th>\n",
|
||
" <th>context_entity_recall</th>\n",
|
||
" <th>answer_similarity</th>\n",
|
||
" <th>answer_correctness</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>AVAP (Advanced Virtual Application Platform) i...</td>\n",
|
||
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
|
||
" <td>0.083333</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.300000</td>\n",
|
||
" <td>0.833670</td>\n",
|
||
" <td>0.363590</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>As a backend developer who is learning AVAP an...</td>\n",
|
||
" <td>[SECTION III: Control Logic and Decision Struc...</td>\n",
|
||
" <td>I can provide information on the if statement ...</td>\n",
|
||
" <td>In AVAP, control flow structures include condi...</td>\n",
|
||
" <td>0.904762</td>\n",
|
||
" <td>0.837564</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.454545</td>\n",
|
||
" <td>0.157895</td>\n",
|
||
" <td>0.809311</td>\n",
|
||
" <td>0.531596</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>hey so in AVAP when i do division by zero what...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>In AVAP, when a division operation is attempte...</td>\n",
|
||
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
|
||
" <td>0.312500</td>\n",
|
||
" <td>0.892255</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.200000</td>\n",
|
||
" <td>0.923880</td>\n",
|
||
" <td>0.472905</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>what happen if file not found when i do import...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>In AVAP, the equivalent of a `FileNotFoundErro...</td>\n",
|
||
" <td>When an import statement is executed in AVAP, ...</td>\n",
|
||
" <td>0.142857</td>\n",
|
||
" <td>0.912031</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.642044</td>\n",
|
||
" <td>0.248746</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>I don't have information on the specific circu...</td>\n",
|
||
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.806848</td>\n",
|
||
" <td>0.201712</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>95</th>\n",
|
||
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>AVAP (Advanced Vector Analysis Platform) is a ...</td>\n",
|
||
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
|
||
" <td>0.289474</td>\n",
|
||
" <td>0.925546</td>\n",
|
||
" <td>0.833333</td>\n",
|
||
" <td>0.800000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.736816</td>\n",
|
||
" <td>0.422299</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>96</th>\n",
|
||
" <td>What data types are available in AVAP™?</td>\n",
|
||
" <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
|
||
" <td>I don't know the specific details about the da...</td>\n",
|
||
" <td>In AVAP™, the most common data types include i...</td>\n",
|
||
" <td>0.818182</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.856402</td>\n",
|
||
" <td>0.658545</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>AVAP strings Unicode</td>\n",
|
||
" <td>[Chapter 2: Notation in AVAP™ Introduction Not...</td>\n",
|
||
" <td>AVAP (African Vertebrate Anatomy Project) uses...</td>\n",
|
||
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.918740</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.686281</td>\n",
|
||
" <td>0.271570</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>98</th>\n",
|
||
" <td>AVAP data model comparison with Python data ty...</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>Here's a comparison of the AVAP data model wit...</td>\n",
|
||
" <td>The data model in AVAP is similar to Python in...</td>\n",
|
||
" <td>0.343750</td>\n",
|
||
" <td>0.954994</td>\n",
|
||
" <td>0.833333</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.555556</td>\n",
|
||
" <td>0.824449</td>\n",
|
||
" <td>0.587930</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99</th>\n",
|
||
" <td>AVAP™ data types and data structures overview</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>AVAP (Advanced Visual Analytics Platform) is a...</td>\n",
|
||
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.855719</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.100000</td>\n",
|
||
" <td>0.856107</td>\n",
|
||
" <td>0.323783</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>100 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" user_input \\\n",
|
||
"0 How does AVAP handel a ZeroDivisionError when ... \n",
|
||
"1 As a backend developer who is learning AVAP an... \n",
|
||
"2 hey so in AVAP when i do division by zero what... \n",
|
||
"3 what happen if file not found when i do import... \n",
|
||
"4 In AVAP, under what circumstances is a TypeErr... \n",
|
||
".. ... \n",
|
||
"95 How does the data model in AVAP™ compare to Py... \n",
|
||
"96 What data types are available in AVAP™? \n",
|
||
"97 AVAP strings Unicode \n",
|
||
"98 AVAP data model comparison with Python data ty... \n",
|
||
"99 AVAP™ data types and data structures overview \n",
|
||
"\n",
|
||
" retrieved_contexts \\\n",
|
||
"0 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"1 [SECTION III: Control Logic and Decision Struc... \n",
|
||
"2 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"3 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"4 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
".. ... \n",
|
||
"95 [Introduction The data model in AVAP™ defines ... \n",
|
||
"96 [Chapter 5: Data Types In this chapter, we wil... \n",
|
||
"97 [Chapter 2: Notation in AVAP™ Introduction Not... \n",
|
||
"98 [Introduction The data model in AVAP™ defines ... \n",
|
||
"99 [Introduction The data model in AVAP™ defines ... \n",
|
||
"\n",
|
||
" response \\\n",
|
||
"0 AVAP (Advanced Virtual Application Platform) i... \n",
|
||
"1 I can provide information on the if statement ... \n",
|
||
"2 In AVAP, when a division operation is attempte... \n",
|
||
"3 In AVAP, the equivalent of a `FileNotFoundErro... \n",
|
||
"4 I don't have information on the specific circu... \n",
|
||
".. ... \n",
|
||
"95 AVAP (Advanced Vector Analysis Platform) is a ... \n",
|
||
"96 I don't know the specific details about the da... \n",
|
||
"97 AVAP (African Vertebrate Anatomy Project) uses... \n",
|
||
"98 Here's a comparison of the AVAP data model wit... \n",
|
||
"99 AVAP (Advanced Visual Analytics Platform) is a... \n",
|
||
"\n",
|
||
" reference faithfulness \\\n",
|
||
"0 In AVAP, when a division by zero occurs—whethe... 0.083333 \n",
|
||
"1 In AVAP, control flow structures include condi... 0.904762 \n",
|
||
"2 In AVAP, when you perform a division by zero, ... 0.312500 \n",
|
||
"3 When an import statement is executed in AVAP, ... 0.142857 \n",
|
||
"4 In AVAP, a TypeError exception is raised in tw... 0.000000 \n",
|
||
".. ... ... \n",
|
||
"95 The data model in AVAP™ is very similar to Pyt... 0.289474 \n",
|
||
"96 In AVAP™, the most common data types include i... 0.818182 \n",
|
||
"97 In AVAP™, strings (str) represent sequences of... 0.000000 \n",
|
||
"98 The data model in AVAP is similar to Python in... 0.343750 \n",
|
||
"99 AVAP™ uses a flexible and dynamic data model s... 0.000000 \n",
|
||
"\n",
|
||
" answer_relevancy context_precision context_recall \\\n",
|
||
"0 0.000000 1.000000 1.000000 \n",
|
||
"1 0.837564 1.000000 0.454545 \n",
|
||
"2 0.892255 1.000000 1.000000 \n",
|
||
"3 0.912031 1.000000 1.000000 \n",
|
||
"4 0.000000 1.000000 1.000000 \n",
|
||
".. ... ... ... \n",
|
||
"95 0.925546 0.833333 0.800000 \n",
|
||
"96 0.000000 0.500000 1.000000 \n",
|
||
"97 0.918740 0.000000 0.000000 \n",
|
||
"98 0.954994 0.833333 1.000000 \n",
|
||
"99 0.855719 1.000000 1.000000 \n",
|
||
"\n",
|
||
" context_entity_recall answer_similarity answer_correctness \n",
|
||
"0 0.300000 0.833670 0.363590 \n",
|
||
"1 0.157895 0.809311 0.531596 \n",
|
||
"2 0.200000 0.923880 0.472905 \n",
|
||
"3 1.000000 0.642044 0.248746 \n",
|
||
"4 1.000000 0.806848 0.201712 \n",
|
||
".. ... ... ... \n",
|
||
"95 0.500000 0.736816 0.422299 \n",
|
||
"96 0.500000 0.856402 0.658545 \n",
|
||
"97 0.500000 0.686281 0.271570 \n",
|
||
"98 0.555556 0.824449 0.587930 \n",
|
||
"99 0.100000 0.856107 0.323783 \n",
|
||
"\n",
|
||
"[100 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"metrics = [\n",
|
||
" faithfulness,\n",
|
||
" answer_relevancy,\n",
|
||
" context_precision,\n",
|
||
" context_recall,\n",
|
||
" context_entity_recall,\n",
|
||
" answer_similarity,\n",
|
||
" answer_correctness\n",
|
||
"]\n",
|
||
"\n",
|
||
"result = evaluate(\n",
|
||
" dataset=dataset, \n",
|
||
" metrics=metrics,\n",
|
||
" llm=llm,\n",
|
||
" embeddings=embeddings,\n",
|
||
")\n",
|
||
"\n",
|
||
"result_df = result.to_pandas()\n",
|
||
"result_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "20c3fa64",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"faithfulness 0.254643\n",
|
||
"answer_relevancy 0.609250\n",
|
||
"context_precision 0.862500\n",
|
||
"context_recall 0.906242\n",
|
||
"context_entity_recall 0.354178\n",
|
||
"answer_similarity 0.781973\n",
|
||
"answer_correctness 0.359654\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"result_df.mean(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "350755fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"result_df.to_csv(INTERIM_DIR + \"retrieve_eval_results/ragas_eval.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "1ff60103",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>user_input</th>\n",
|
||
" <th>reference_contexts</th>\n",
|
||
" <th>reference</th>\n",
|
||
" <th>persona_name</th>\n",
|
||
" <th>query_style</th>\n",
|
||
" <th>query_length</th>\n",
|
||
" <th>synthesizer_name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Hey, I'm trying to understand how AVAP handels...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, a ZeroDivisionError is raised in two ...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>MISSPELLED</td>\n",
|
||
" <td>LONG</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>How AVAP handle name resolution different from...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, when a name is used in a code block, ...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>How does AVAP handle name resoltuion and scopi...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, name resolution works differently fro...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>MISSPELLED</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>AVAP how does import statement work and what a...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, the import statement is the only way ...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>WEB_SEARCH_LIKE</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>what happen with StopIteration when generator ...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In generator functions, the return statement i...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>95</th>\n",
|
||
" <td>Hey so I been learning AVAP and I wanna know, ...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, the data type that uses Unicode is t...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>96</th>\n",
|
||
" <td>Hey so I been trying to learn AVAP™ and I want...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, just like in Python, data types are ...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>LONG</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>How are Unicde characters related to strings i...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>MISSPELLED</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>98</th>\n",
|
||
" <td>How does the data model in AVAP compare to Pyt...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>Similar to Python, AVAP uses a flexible and dy...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>PERFECT_GRAMMAR</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99</th>\n",
|
||
" <td>hey so i been learning AVAP™ and i wanna know ...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, the most common data types include: ...</td>\n",
|
||
" <td>Carlos Méndez</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>100 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" user_input \\\n",
|
||
"0 Hey, I'm trying to understand how AVAP handels... \n",
|
||
"1 How AVAP handle name resolution different from... \n",
|
||
"2 How does AVAP handle name resoltuion and scopi... \n",
|
||
"3 AVAP how does import statement work and what a... \n",
|
||
"4 what happen with StopIteration when generator ... \n",
|
||
".. ... \n",
|
||
"95 Hey so I been learning AVAP and I wanna know, ... \n",
|
||
"96 Hey so I been trying to learn AVAP™ and I want... \n",
|
||
"97 How are Unicde characters related to strings i... \n",
|
||
"98 How does the data model in AVAP compare to Pyt... \n",
|
||
"99 hey so i been learning AVAP™ and i wanna know ... \n",
|
||
"\n",
|
||
" reference_contexts \\\n",
|
||
"0 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"1 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"2 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"3 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"4 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
".. ... \n",
|
||
"95 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"96 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"97 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"98 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"99 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"\n",
|
||
" reference persona_name \\\n",
|
||
"0 In AVAP, a ZeroDivisionError is raised in two ... Carlos Medina \n",
|
||
"1 In AVAP, when a name is used in a code block, ... Carlos Medina \n",
|
||
"2 In AVAP, name resolution works differently fro... Carlos Méndez \n",
|
||
"3 In AVAP, the import statement is the only way ... Carlos Méndez \n",
|
||
"4 In generator functions, the return statement i... Carlos Méndez \n",
|
||
".. ... ... \n",
|
||
"95 In AVAP™, the data type that uses Unicode is t... Carlos Méndez \n",
|
||
"96 In AVAP™, just like in Python, data types are ... Carlos Medina \n",
|
||
"97 In AVAP™, strings (str) represent sequences of... Carlos Méndez \n",
|
||
"98 Similar to Python, AVAP uses a flexible and dy... Carlos Medina \n",
|
||
"99 In AVAP™, the most common data types include: ... Carlos Méndez \n",
|
||
"\n",
|
||
" query_style query_length synthesizer_name \n",
|
||
"0 MISSPELLED LONG single_hop_specific_query_synthesizer \n",
|
||
"1 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"2 MISSPELLED MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"3 WEB_SEARCH_LIKE MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"4 POOR_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
|
||
".. ... ... ... \n",
|
||
"95 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"96 POOR_GRAMMAR LONG single_hop_specific_query_synthesizer \n",
|
||
"97 MISSPELLED MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"98 PERFECT_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
|
||
"99 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"\n",
|
||
"[100 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"synthetic_dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "71743384",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"from evidently import Dataset\n",
|
||
"from evidently import DataDefinition\n",
|
||
"from evidently.descriptors import *\n",
|
||
"\n",
|
||
"from evidently import Report\n",
|
||
"from evidently.presets import TextEvals\n",
|
||
"from evidently.metrics import *\n",
|
||
"from evidently.tests import *"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e1ac1a41",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"\n",
|
||
"from evidently.llm.utils.wrapper import OllamaOptions\n",
|
||
"\n",
|
||
"# The \"ollama\" provider has no built-in endpoint: OllamaOptions declares `api_url` as a\n",
|
||
"# required field, so running the descriptor without options raises\n",
|
||
"# `ValidationError: 1 validation error for OllamaOptions / api_url field required`.\n",
|
||
"# Default to Ollama's standard local address; override with the OLLAMA_API_URL env var.\n",
|
||
"OLLAMA_API_URL = os.getenv(\"OLLAMA_API_URL\", \"http://localhost:11434\")\n",
|
||
"\n",
|
||
"# Score each row's reference contexts against its question with an LLM judge.\n",
|
||
"context_based_evals = Dataset.from_pandas(\n",
|
||
"    synthetic_dataset,\n",
|
||
"    data_definition=DataDefinition(text_columns=[\"user_input\", \"reference_contexts\", \"reference\"]),\n",
|
||
"    descriptors=[\n",
|
||
"        ContextQualityLLMEval(\n",
|
||
"            \"reference_contexts\",\n",
|
||
"            question=\"user_input\",\n",
|
||
"            provider=\"ollama\",\n",
|
||
"            model=OLLAMA_MODEL_NAME,\n",
|
||
"        )\n",
|
||
"    ],\n",
|
||
"    # Provider options are looked up by type when the LLM wrapper is built.\n",
|
||
"    options=[OllamaOptions(api_url=OLLAMA_API_URL)],\n",
|
||
")\n",
|
||
"context_based_evals.as_dataframe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c2d127ad",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "assistance-engine",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|