assistance-engine/scratches/acano/evaluate_retrieve.ipynb

1010 lines
44 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8fed4518",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
" from ragas.metrics import (\n",
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
" from ragas.metrics import (\n"
]
}
],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"# Put the repository root on sys.path so the `src` package can be imported.\n",
"if \"__file__\" in dir():\n",
"    _project_root = str(Path(__file__).resolve().parents[2])\n",
"else:\n",
"    # No __file__ in this namespace: derive the root from the working directory.\n",
"    _project_root = str(Path.cwd().parents[1])\n",
"\n",
"if sys.path.count(_project_root) == 0:\n",
"    sys.path.insert(0, _project_root)\n",
"\n",
"from datasets import Dataset\n",
"from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from ragas import evaluate, SingleTurnSample\n",
"from ragas.embeddings import LangchainEmbeddingsWrapper\n",
"from ragas.llms import LangchainLLMWrapper\n",
"from ragas.testset import TestsetGenerator\n",
"from ragas.testset.persona import Persona\n",
"from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
"from ragas.metrics import (\n",
" faithfulness,\n",
" answer_relevancy,\n",
" context_recall,\n",
" context_precision,\n",
" context_entity_recall,\n",
" answer_similarity,\n",
" answer_correctness,\n",
" NonLLMContextRecall,\n",
" NonLLMContextPrecisionWithReference\n",
")\n",
"\n",
"from src.llm_factory import create_chat_model\n",
"from src.emb_factory import create_embedding_model\n",
"from src.config import (\n",
" ELASTICSEARCH_LOCAL_URL,\n",
" ELASTICSEARCH_INDEX,\n",
" OLLAMA_MODEL_NAME,\n",
" OLLAMA_EMB_MODEL_NAME,\n",
" RAW_DIR\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4426d6c0",
"metadata": {},
"outputs": [],
"source": [
"# Bedrock-hosted chat model; used below for test-set generation and as the ragas judge.\n",
"llm = create_chat_model(provider=\"bedrock\", model=\"global.anthropic.claude-opus-4-6-v1\", temperature=0)\n",
"\n",
"# Ollama embedding model shared by the vector store and the ragas calls.\n",
"embeddings = create_embedding_model(provider=\"ollama\", model=OLLAMA_EMB_MODEL_NAME)\n",
"\n",
"# Ollama chat model that produces the answers inside the RetrievalQA chain.\n",
"agent_llm = create_chat_model(\n",
"    provider=\"ollama\", model=OLLAMA_MODEL_NAME, temperature=0, validate_model_on_init=True\n",
")\n",
"\n",
"# Vector store over the Elasticsearch index named in the project config.\n",
"_es_field_names = {\"query_field\": \"text\", \"vector_query_field\": \"vector\"}\n",
"vector_store = ElasticsearchStore(\n",
"    es_url=ELASTICSEARCH_LOCAL_URL,\n",
"    index_name=ELASTICSEARCH_INDEX,\n",
"    embedding=embeddings,\n",
"    **_es_field_names,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fe524d14",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 24 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
]
}
],
"source": [
"# Wrap each *.txt file found directly under RAW_DIR (in sorted path order) in a\n",
"# Document, keeping the file name as its `source` metadata.\n",
"docs: list[Document] = [\n",
"    Document(page_content=path.read_text(encoding=\"utf-8\"), metadata={\"source\": path.name})\n",
"    for path in sorted(RAW_DIR.glob(\"*.txt\"))\n",
"]\n",
"\n",
"print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "06103178",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>reference_contexts</th>\n",
" <th>reference</th>\n",
" <th>persona_name</th>\n",
" <th>query_style</th>\n",
" <th>query_length</th>\n",
" <th>synthesizer_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
" <td>Carlos Menendez</td>\n",
" <td>MISSPELLED</td>\n",
" <td>LONG</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>As a backend developer who is learning AVAP an...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, control flow structures include condi...</td>\n",
" <td>Carlos Menendez</td>\n",
" <td>PERFECT_GRAMMAR</td>\n",
" <td>LONG</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>hey so in AVAP when i do division by zero what...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>what happen if file not found when i do import...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>When an import statement is executed in AVAP, ...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>POOR_GRAMMAR</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
" <td>Carlos Menendez</td>\n",
" <td>PERFECT_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
" <td>Carlos Menendez</td>\n",
" <td>PERFECT_GRAMMAR</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>What data types are available in AVAP™?</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, the most common data types include i...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>PERFECT_GRAMMAR</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>AVAP strings Unicode</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
" <td>Carlos Medina</td>\n",
" <td>WEB_SEARCH_LIKE</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>AVAP data model comparison with Python data ty...</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>The data model in AVAP is similar to Python in...</td>\n",
" <td>Carlos Mendieta</td>\n",
" <td>WEB_SEARCH_LIKE</td>\n",
" <td>MEDIUM</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>AVAP™ data types and data structures overview</td>\n",
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
" <td>Carlos Mendieta</td>\n",
" <td>WEB_SEARCH_LIKE</td>\n",
" <td>SHORT</td>\n",
" <td>single_hop_specific_query_synthesizer</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 How does AVAP handel a ZeroDivisionError when ... \n",
"1 As a backend developer who is learning AVAP an... \n",
"2 hey so in AVAP when i do division by zero what... \n",
"3 what happen if file not found when i do import... \n",
"4 In AVAP, under what circumstances is a TypeErr... \n",
".. ... \n",
"95 How does the data model in AVAP™ compare to Py... \n",
"96 What data types are available in AVAP™? \n",
"97 AVAP strings Unicode \n",
"98 AVAP data model comparison with Python data ty... \n",
"99 AVAP™ data types and data structures overview \n",
"\n",
" reference_contexts \\\n",
"0 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"1 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"2 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"3 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
"4 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
".. ... \n",
"95 [Introduction\\nThe data model in AVAP™ defines... \n",
"96 [Introduction\\nThe data model in AVAP™ defines... \n",
"97 [Introduction\\nThe data model in AVAP™ defines... \n",
"98 [Introduction\\nThe data model in AVAP™ defines... \n",
"99 [Introduction\\nThe data model in AVAP™ defines... \n",
"\n",
" reference persona_name \\\n",
"0 In AVAP, when a division by zero occurs—whethe... Carlos Menendez \n",
"1 In AVAP, control flow structures include condi... Carlos Menendez \n",
"2 In AVAP, when you perform a division by zero, ... Carlos Medina \n",
"3 When an import statement is executed in AVAP, ... Carlos Medina \n",
"4 In AVAP, a TypeError exception is raised in tw... Carlos Menendez \n",
".. ... ... \n",
"95 The data model in AVAP™ is very similar to Pyt... Carlos Menendez \n",
"96 In AVAP™, the most common data types include i... Carlos Medina \n",
"97 In AVAP™, strings (str) represent sequences of... Carlos Medina \n",
"98 The data model in AVAP is similar to Python in... Carlos Mendieta \n",
"99 AVAP™ uses a flexible and dynamic data model s... Carlos Mendieta \n",
"\n",
" query_style query_length synthesizer_name \n",
"0 MISSPELLED LONG single_hop_specific_query_synthesizer \n",
"1 PERFECT_GRAMMAR LONG single_hop_specific_query_synthesizer \n",
"2 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
"3 POOR_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
"4 PERFECT_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
".. ... ... ... \n",
"95 PERFECT_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
"96 PERFECT_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
"97 WEB_SEARCH_LIKE SHORT single_hop_specific_query_synthesizer \n",
"98 WEB_SEARCH_LIKE MEDIUM single_hop_specific_query_synthesizer \n",
"99 WEB_SEARCH_LIKE SHORT single_hop_specific_query_synthesizer \n",
"\n",
"[100 rows x 7 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE: this preview cell is placed above the cell that builds `synthetic_dataset`,\n",
"# so on a fresh kernel (Restart & Run All) the name does not exist yet.\n",
"# Show the frame only when it is already defined instead of raising NameError;\n",
"# the cleaner long-term fix is to move this cell below the generation cell.\n",
"synthetic_dataset if \"synthetic_dataset\" in globals() else None"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ab1932b7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_716860/244266171.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
" synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
"/tmp/ipykernel_716860/244266171.py:3: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
" generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
"/tmp/ipykernel_716860/244266171.py:3: DeprecationWarning: LangchainEmbeddingsWrapper is deprecated and will be removed in a future version. Use the modern embedding providers instead: embedding_factory('openai', model='text-embedding-3-small', client=openai_client) or from ragas.embeddings import OpenAIEmbeddings, GoogleEmbeddings, HuggingFaceEmbeddings\n",
" generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8ec6ef79b1964c44b78a75ca539f816b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying SummaryExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f583879571cf4c818cbb7321b0839990",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying CustomNodeFilter: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Node 75603049-8ebb-49dc-9e7d-da37fa927eb9 does not have a summary. Skipping filtering.\n",
"Node c2d1e8b0-ca69-47af-9bcd-39cbf8560edb does not have a summary. Skipping filtering.\n",
"Node 24c16f65-02fd-4d80-84c6-d7d1a8a2638c does not have a summary. Skipping filtering.\n",
"Node a0975db8-14b3-44eb-8aa2-e83274fb55ab does not have a summary. Skipping filtering.\n",
"Node 6768ece8-9a13-42b3-9aec-08e828044420 does not have a summary. Skipping filtering.\n",
"Node 54719709-293a-49db-86f2-8f697015e16a does not have a summary. Skipping filtering.\n",
"Node a049eacb-5a3e-404f-83ea-249061fcae0a does not have a summary. Skipping filtering.\n",
"Node eb4ac1be-55ae-487e-936c-ee43513f25e9 does not have a summary. Skipping filtering.\n",
"Node baf6b749-0280-46f0-a47b-8fd82373da1b does not have a summary. Skipping filtering.\n",
"Node 9caa0b62-10ea-4f19-98b7-5f10b2cbc486 does not have a summary. Skipping filtering.\n",
"Node d28505f3-cdd7-44d1-9c45-9741e27e25c3 does not have a summary. Skipping filtering.\n",
"Node f9a234cb-1af1-4f06-8d9a-6921c19ffbf5 does not have a summary. Skipping filtering.\n",
"Node 4f0b355e-81ca-450c-99e3-8458ebd304c6 does not have a summary. Skipping filtering.\n",
"Node 66cf6447-7639-497c-9ae2-e26b0c7443b5 does not have a summary. Skipping filtering.\n",
"Node 722bfb38-b24e-483f-9787-253d71716c1e does not have a summary. Skipping filtering.\n",
"Node ce76bfcc-8cb3-4de2-87e4-74f10ad5c549 does not have a summary. Skipping filtering.\n",
"Node dada2116-28ae-4d7c-a4ad-f8ccc3952eb1 does not have a summary. Skipping filtering.\n",
"Node e6f7360d-4309-453a-aab8-d3015d53dd88 does not have a summary. Skipping filtering.\n",
"Node a73eb1ba-9609-4ad8-80bc-98d9c4993fcd does not have a summary. Skipping filtering.\n",
"Node 004b6ce2-48a7-4bff-9393-67e963ebe7fc does not have a summary. Skipping filtering.\n",
"Node 854676ec-e80f-45ef-a84c-08d527b96813 does not have a summary. Skipping filtering.\n",
"Node 241a936b-3470-41be-8449-7994f3ba5eee does not have a summary. Skipping filtering.\n",
"Node 28f76e87-5e68-4a63-83a8-e7c4addb855a does not have a summary. Skipping filtering.\n",
"Node f7e3d432-5073-4004-af6c-683cc7e7a600 does not have a summary. Skipping filtering.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a3cf82e356d7485fa6ffa54b131d6a18",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying EmbeddingExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b14be1e6a8e74d9592860377b5fa0044",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying ThemesExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "65089eea206341f290cda033732df991",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying NERExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3120af625643421eafc48c78fce57d8d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying CosineSimilarityBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a8e81dbde254d6e82d76f3752e211d2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Applying OverlapScoreBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "59ca74c675f14067a4a665d56b4e29ba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating personas: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fa5aee3db9674f6eb50fef7214cadd92",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating Scenarios: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "63030d2b67984b838c055b29d0443639",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating Samples: 0%| | 0/100 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Single-hop question synthesizer driven by the wrapped judge LLM.\n",
"synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
"\n",
"# ragas-side wrappers around the LangChain LLM and embedding model.\n",
"generator_llm = LangchainLLMWrapper(llm)\n",
"generator_embeddings = LangchainEmbeddingsWrapper(embeddings)\n",
"generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)\n",
"\n",
"# Request 100 samples from the loaded documents, all from the single-hop synthesizer.\n",
"testset = generator.generate_with_chunks(\n",
"    chunks=docs,\n",
"    testset_size=100,\n",
"    query_distribution=[(synth, 1.0)],\n",
")\n",
"\n",
"# Downstream cells work on the pandas view of the generated test set.\n",
"synthetic_dataset = testset.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "18ceb119",
"metadata": {},
"outputs": [],
"source": [
"# Similarity search over the vector store, returning the top 3 hits per query.\n",
"retriever = vector_store.as_retriever(search_kwargs={\"k\": 3}, search_type=\"similarity\")\n",
"\n",
"# RetrievalQA chain answered by the Ollama model; every result also carries\n",
"# the source documents that were retrieved for it.\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
"    retriever=retriever,\n",
"    llm=agent_llm,\n",
"    return_source_documents=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "344a1266",
"metadata": {},
"outputs": [],
"source": [
"# Run every synthetic question through the QA chain and collect, per question,\n",
"# the generated answer and the text of the documents the chain retrieved for it.\n",
"questions = synthetic_dataset[\"user_input\"].tolist()\n",
"ground_truths = synthetic_dataset[\"reference\"].tolist()\n",
"\n",
"answers = []\n",
"contexts = []\n",
"\n",
"for query in questions:\n",
"    # qa_chain was built with return_source_documents=True, so a single call yields\n",
"    # both the answer and the documents it was based on. Reusing them avoids a second\n",
"    # retrieval round-trip and guarantees the scored contexts are the ones the answer saw.\n",
"    qa_output = qa_chain.invoke(query)\n",
"    answers.append(qa_output[\"result\"])\n",
"    contexts.append([doc.page_content for doc in qa_output[\"source_documents\"]])\n",
"\n",
"# Column layout consumed by the evaluation cell; plain Python lists throughout.\n",
"data = {\n",
"    \"question\": questions,\n",
"    \"answer\": answers,\n",
"    \"contexts\": contexts,\n",
"    \"ground_truth\": ground_truths,\n",
"}\n",
"\n",
"# Convert dict to dataset\n",
"dataset = Dataset.from_dict(data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a9011f94",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3239c7c9d6254330b9b079a249a74c60",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/700 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception in callback Task.__step()\n",
"handle: <Handle Task.__step()>\n",
"Traceback (most recent call last):\n",
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
" self._context.run(self._callback, *self._args)\n",
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-3487' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/json_schema.py:335: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
" mapping[key] = getattr(self, method_name)\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n",
"Exception in callback Task.__step()\n",
"handle: <Handle Task.__step()>\n",
"Traceback (most recent call last):\n",
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
" self._context.run(self._callback, *self._args)\n",
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-5636' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/main.py:716: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
" return cls.__pydantic_validator__.validate_python(\n",
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
"Task was destroyed but it is pending!\n",
"task: <Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>user_input</th>\n",
" <th>retrieved_contexts</th>\n",
" <th>response</th>\n",
" <th>reference</th>\n",
" <th>faithfulness</th>\n",
" <th>answer_relevancy</th>\n",
" <th>context_precision</th>\n",
" <th>context_recall</th>\n",
" <th>context_entity_recall</th>\n",
" <th>answer_similarity</th>\n",
" <th>answer_correctness</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>AVAP (Advanced Virtual Application Platform) i...</td>\n",
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
" <td>0.083333</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.300000</td>\n",
" <td>0.833670</td>\n",
" <td>0.363590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>As a backend developer who is learning AVAP an...</td>\n",
" <td>[SECTION III: Control Logic and Decision Struc...</td>\n",
" <td>I can provide information on the if statement ...</td>\n",
" <td>In AVAP, control flow structures include condi...</td>\n",
" <td>0.904762</td>\n",
" <td>0.837564</td>\n",
" <td>1.000000</td>\n",
" <td>0.454545</td>\n",
" <td>0.157895</td>\n",
" <td>0.809311</td>\n",
" <td>0.531596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>hey so in AVAP when i do division by zero what...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>In AVAP, when a division operation is attempte...</td>\n",
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
" <td>0.312500</td>\n",
" <td>0.892255</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.200000</td>\n",
" <td>0.923880</td>\n",
" <td>0.472905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>what happen if file not found when i do import...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>In AVAP, the equivalent of a `FileNotFoundErro...</td>\n",
" <td>When an import statement is executed in AVAP, ...</td>\n",
" <td>0.142857</td>\n",
" <td>0.912031</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.642044</td>\n",
" <td>0.248746</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
" <td>I don't have information on the specific circu...</td>\n",
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.806848</td>\n",
" <td>0.201712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>AVAP (Advanced Vector Analysis Platform) is a ...</td>\n",
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
" <td>0.289474</td>\n",
" <td>0.925546</td>\n",
" <td>0.833333</td>\n",
" <td>0.800000</td>\n",
" <td>0.500000</td>\n",
" <td>0.736816</td>\n",
" <td>0.422299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>What data types are available in AVAP™?</td>\n",
" <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
" <td>I don't know the specific details about the da...</td>\n",
" <td>In AVAP™, the most common data types include i...</td>\n",
" <td>0.818182</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>1.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.856402</td>\n",
" <td>0.658545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>AVAP strings Unicode</td>\n",
" <td>[Chapter 2: Notation in AVAP™ Introduction Not...</td>\n",
" <td>AVAP (African Vertebrate Anatomy Project) uses...</td>\n",
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
" <td>0.000000</td>\n",
" <td>0.918740</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.686281</td>\n",
" <td>0.271570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>AVAP data model comparison with Python data ty...</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>Here's a comparison of the AVAP data model wit...</td>\n",
" <td>The data model in AVAP is similar to Python in...</td>\n",
" <td>0.343750</td>\n",
" <td>0.954994</td>\n",
" <td>0.833333</td>\n",
" <td>1.000000</td>\n",
" <td>0.555556</td>\n",
" <td>0.824449</td>\n",
" <td>0.587930</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>AVAP™ data types and data structures overview</td>\n",
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
" <td>AVAP (Advanced Visual Analytics Platform) is a...</td>\n",
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
" <td>0.000000</td>\n",
" <td>0.855719</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.100000</td>\n",
" <td>0.856107</td>\n",
" <td>0.323783</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" user_input \\\n",
"0 How does AVAP handel a ZeroDivisionError when ... \n",
"1 As a backend developer who is learning AVAP an... \n",
"2 hey so in AVAP when i do division by zero what... \n",
"3 what happen if file not found when i do import... \n",
"4 In AVAP, under what circumstances is a TypeErr... \n",
".. ... \n",
"95 How does the data model in AVAP™ compare to Py... \n",
"96 What data types are available in AVAP™? \n",
"97 AVAP strings Unicode \n",
"98 AVAP data model comparison with Python data ty... \n",
"99 AVAP™ data types and data structures overview \n",
"\n",
" retrieved_contexts \\\n",
"0 [Execution Model in AVAP 4.1. Structure of a P... \n",
"1 [SECTION III: Control Logic and Decision Struc... \n",
"2 [Execution Model in AVAP 4.1. Structure of a P... \n",
"3 [Execution Model in AVAP 4.1. Structure of a P... \n",
"4 [Execution Model in AVAP 4.1. Structure of a P... \n",
".. ... \n",
"95 [Introduction The data model in AVAP™ defines ... \n",
"96 [Chapter 5: Data Types In this chapter, we wil... \n",
"97 [Chapter 2: Notation in AVAP™ Introduction Not... \n",
"98 [Introduction The data model in AVAP™ defines ... \n",
"99 [Introduction The data model in AVAP™ defines ... \n",
"\n",
" response \\\n",
"0 AVAP (Advanced Virtual Application Platform) i... \n",
"1 I can provide information on the if statement ... \n",
"2 In AVAP, when a division operation is attempte... \n",
"3 In AVAP, the equivalent of a `FileNotFoundErro... \n",
"4 I don't have information on the specific circu... \n",
".. ... \n",
"95 AVAP (Advanced Vector Analysis Platform) is a ... \n",
"96 I don't know the specific details about the da... \n",
"97 AVAP (African Vertebrate Anatomy Project) uses... \n",
"98 Here's a comparison of the AVAP data model wit... \n",
"99 AVAP (Advanced Visual Analytics Platform) is a... \n",
"\n",
" reference faithfulness \\\n",
"0 In AVAP, when a division by zero occurs—whethe... 0.083333 \n",
"1 In AVAP, control flow structures include condi... 0.904762 \n",
"2 In AVAP, when you perform a division by zero, ... 0.312500 \n",
"3 When an import statement is executed in AVAP, ... 0.142857 \n",
"4 In AVAP, a TypeError exception is raised in tw... 0.000000 \n",
".. ... ... \n",
"95 The data model in AVAP™ is very similar to Pyt... 0.289474 \n",
"96 In AVAP™, the most common data types include i... 0.818182 \n",
"97 In AVAP™, strings (str) represent sequences of... 0.000000 \n",
"98 The data model in AVAP is similar to Python in... 0.343750 \n",
"99 AVAP™ uses a flexible and dynamic data model s... 0.000000 \n",
"\n",
" answer_relevancy context_precision context_recall \\\n",
"0 0.000000 1.000000 1.000000 \n",
"1 0.837564 1.000000 0.454545 \n",
"2 0.892255 1.000000 1.000000 \n",
"3 0.912031 1.000000 1.000000 \n",
"4 0.000000 1.000000 1.000000 \n",
".. ... ... ... \n",
"95 0.925546 0.833333 0.800000 \n",
"96 0.000000 0.500000 1.000000 \n",
"97 0.918740 0.000000 0.000000 \n",
"98 0.954994 0.833333 1.000000 \n",
"99 0.855719 1.000000 1.000000 \n",
"\n",
" context_entity_recall answer_similarity answer_correctness \n",
"0 0.300000 0.833670 0.363590 \n",
"1 0.157895 0.809311 0.531596 \n",
"2 0.200000 0.923880 0.472905 \n",
"3 1.000000 0.642044 0.248746 \n",
"4 1.000000 0.806848 0.201712 \n",
".. ... ... ... \n",
"95 0.500000 0.736816 0.422299 \n",
"96 0.500000 0.856402 0.658545 \n",
"97 0.500000 0.686281 0.271570 \n",
"98 0.555556 0.824449 0.587930 \n",
"99 0.100000 0.856107 0.323783 \n",
"\n",
"[100 rows x 11 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ragas metrics to compute for every question/answer/context sample.\n",
"metrics = [\n",
"    faithfulness, answer_relevancy,\n",
"    context_precision, context_recall, context_entity_recall,\n",
"    answer_similarity, answer_correctness,\n",
"]\n",
"\n",
"# Score the collected dataset, using the Bedrock model and the Ollama embeddings.\n",
"result = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embeddings)\n",
"\n",
"# Per-sample scores as a DataFrame; the bare name renders it as the cell output.\n",
"result_df = result.to_pandas()\n",
"result_df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "20c3fa64",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"faithfulness 0.254643\n",
"answer_relevancy 0.609250\n",
"context_precision 0.862500\n",
"context_recall 0.906242\n",
"context_entity_recall 0.354178\n",
"answer_similarity 0.781973\n",
"answer_correctness 0.359654\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column-wise mean over the numeric columns, i.e. the average of each metric.\n",
"metric_means = result_df.mean(numeric_only=True)\n",
"metric_means"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "350755fd",
"metadata": {},
"outputs": [],
"source": [
"# Save the per-sample scores under the project's data directory rather than a\n",
"# machine-specific absolute path. The location is derived from RAW_DIR (printed\n",
"# above as <project>/data/raw), so it resolves to\n",
"# <project>/data/interim/embedding_eval_results/retrieve_eval_results/ragas_eval.csv.\n",
"eval_output_path = (\n",
"    RAW_DIR.parent / \"interim\" / \"embedding_eval_results\" / \"retrieve_eval_results\" / \"ragas_eval.csv\"\n",
")\n",
"# Create the output folder on first use so the export does not fail on a clean checkout.\n",
"eval_output_path.parent.mkdir(parents=True, exist_ok=True)\n",
"result_df.to_csv(eval_output_path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}