1010 lines
44 KiB
Plaintext
1010 lines
44 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "8fed4518",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
|
||
" from ragas.metrics import (\n",
|
||
"/tmp/ipykernel_716860/1516785970.py:18: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
|
||
" from ragas.metrics import (\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import sys\n",
|
||
"from pathlib import Path\n",
|
||
"\n",
|
||
"# Ensure the project root is on the path so `src` is importable\n",
|
||
"_project_root = str(Path(__file__).resolve().parents[2]) if \"__file__\" in dir() else str(Path.cwd().parents[1])\n",
|
||
"if _project_root not in sys.path:\n",
|
||
" sys.path.insert(0, _project_root)\n",
|
||
"\n",
|
||
"from langchain_core.documents import Document\n",
|
||
"from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
|
||
"from langchain_elasticsearch import ElasticsearchStore\n",
|
||
"from ragas import evaluate, SingleTurnSample\n",
|
||
"from ragas.llms import LangchainLLMWrapper\n",
|
||
"from ragas.embeddings import LangchainEmbeddingsWrapper\n",
|
||
"from ragas.testset import TestsetGenerator\n",
|
||
"from ragas.testset.persona import Persona\n",
|
||
"from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
|
||
"from ragas.metrics import (\n",
|
||
" faithfulness,\n",
|
||
" answer_relevancy,\n",
|
||
" context_recall,\n",
|
||
" context_precision,\n",
|
||
" context_entity_recall,\n",
|
||
" answer_similarity,\n",
|
||
" answer_correctness,\n",
|
||
" NonLLMContextRecall,\n",
|
||
" NonLLMContextPrecisionWithReference\n",
|
||
")\n",
|
||
"\n",
|
||
"from src.llm_factory import create_chat_model\n",
|
||
"from src.emb_factory import create_embedding_model\n",
|
||
"from src.config import (\n",
|
||
" ELASTICSEARCH_LOCAL_URL,\n",
|
||
" ELASTICSEARCH_INDEX,\n",
|
||
" OLLAMA_MODEL_NAME,\n",
|
||
" OLLAMA_EMB_MODEL_NAME,\n",
|
||
" RAW_DIR\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "4426d6c0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Bedrock chat model; wrapped for ragas test-set generation in a later cell.\n",
|
||
"llm = create_chat_model(provider=\"bedrock\", model=\"global.anthropic.claude-opus-4-6-v1\", temperature=0)\n",
|
||
"\n",
|
||
"# Ollama embedding model, also handed to the Elasticsearch store below.\n",
|
||
"embeddings = create_embedding_model(provider=\"ollama\", model=OLLAMA_EMB_MODEL_NAME)\n",
|
||
"\n",
|
||
"# Ollama chat model that answers the questions inside the RetrievalQA chain.\n",
|
||
"agent_llm = create_chat_model(\n",
|
||
"    provider=\"ollama\", model=OLLAMA_MODEL_NAME, temperature=0, validate_model_on_init=True\n",
|
||
")\n",
|
||
"\n",
|
||
"# Elasticsearch-backed vector store that the retriever is built from.\n",
|
||
"vector_store = ElasticsearchStore(\n",
|
||
"    es_url=ELASTICSEARCH_LOCAL_URL, index_name=ELASTICSEARCH_INDEX, embedding=embeddings,\n",
|
||
"    query_field=\"text\", vector_query_field=\"vector\",\n",
|
||
")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "fe524d14",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Loaded 24 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# One Document per .txt file found in RAW_DIR (paths sorted), tagged with the file name as its source.\n",
|
||
"docs: list[Document] = [\n",
|
||
"    Document(\n",
|
||
"        page_content=txt_file.read_text(encoding=\"utf-8\"),\n",
|
||
"        metadata={\"source\": txt_file.name},\n",
|
||
"    )\n",
|
||
"    for txt_file in sorted(RAW_DIR.glob(\"*.txt\"))\n",
|
||
"]\n",
|
||
"\n",
|
||
"print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "06103178",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>user_input</th>\n",
|
||
" <th>reference_contexts</th>\n",
|
||
" <th>reference</th>\n",
|
||
" <th>persona_name</th>\n",
|
||
" <th>query_style</th>\n",
|
||
" <th>query_length</th>\n",
|
||
" <th>synthesizer_name</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
|
||
" <td>Carlos Menendez</td>\n",
|
||
" <td>MISSPELLED</td>\n",
|
||
" <td>LONG</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>As a backend developer who is learning AVAP an...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, control flow structures include condi...</td>\n",
|
||
" <td>Carlos Menendez</td>\n",
|
||
" <td>PERFECT_GRAMMAR</td>\n",
|
||
" <td>LONG</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>hey so in AVAP when i do division by zero what...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>what happen if file not found when i do import...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>When an import statement is executed in AVAP, ...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>POOR_GRAMMAR</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
|
||
" <td>[Execution Model in AVAP\\n4.1. Structure of a ...</td>\n",
|
||
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
|
||
" <td>Carlos Menendez</td>\n",
|
||
" <td>PERFECT_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>95</th>\n",
|
||
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
|
||
" <td>Carlos Menendez</td>\n",
|
||
" <td>PERFECT_GRAMMAR</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>96</th>\n",
|
||
" <td>What data types are available in AVAP™?</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, the most common data types include i...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>PERFECT_GRAMMAR</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>AVAP strings Unicode</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
|
||
" <td>Carlos Medina</td>\n",
|
||
" <td>WEB_SEARCH_LIKE</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>98</th>\n",
|
||
" <td>AVAP data model comparison with Python data ty...</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>The data model in AVAP is similar to Python in...</td>\n",
|
||
" <td>Carlos Mendieta</td>\n",
|
||
" <td>WEB_SEARCH_LIKE</td>\n",
|
||
" <td>MEDIUM</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99</th>\n",
|
||
" <td>AVAP™ data types and data structures overview</td>\n",
|
||
" <td>[Introduction\\nThe data model in AVAP™ defines...</td>\n",
|
||
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
|
||
" <td>Carlos Mendieta</td>\n",
|
||
" <td>WEB_SEARCH_LIKE</td>\n",
|
||
" <td>SHORT</td>\n",
|
||
" <td>single_hop_specific_query_synthesizer</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>100 rows × 7 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" user_input \\\n",
|
||
"0 How does AVAP handel a ZeroDivisionError when ... \n",
|
||
"1 As a backend developer who is learning AVAP an... \n",
|
||
"2 hey so in AVAP when i do division by zero what... \n",
|
||
"3 what happen if file not found when i do import... \n",
|
||
"4 In AVAP, under what circumstances is a TypeErr... \n",
|
||
".. ... \n",
|
||
"95 How does the data model in AVAP™ compare to Py... \n",
|
||
"96 What data types are available in AVAP™? \n",
|
||
"97 AVAP strings Unicode \n",
|
||
"98 AVAP data model comparison with Python data ty... \n",
|
||
"99 AVAP™ data types and data structures overview \n",
|
||
"\n",
|
||
" reference_contexts \\\n",
|
||
"0 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"1 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"2 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"3 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
"4 [Execution Model in AVAP\\n4.1. Structure of a ... \n",
|
||
".. ... \n",
|
||
"95 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"96 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"97 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"98 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"99 [Introduction\\nThe data model in AVAP™ defines... \n",
|
||
"\n",
|
||
" reference persona_name \\\n",
|
||
"0 In AVAP, when a division by zero occurs—whethe... Carlos Menendez \n",
|
||
"1 In AVAP, control flow structures include condi... Carlos Menendez \n",
|
||
"2 In AVAP, when you perform a division by zero, ... Carlos Medina \n",
|
||
"3 When an import statement is executed in AVAP, ... Carlos Medina \n",
|
||
"4 In AVAP, a TypeError exception is raised in tw... Carlos Menendez \n",
|
||
".. ... ... \n",
|
||
"95 The data model in AVAP™ is very similar to Pyt... Carlos Menendez \n",
|
||
"96 In AVAP™, the most common data types include i... Carlos Medina \n",
|
||
"97 In AVAP™, strings (str) represent sequences of... Carlos Medina \n",
|
||
"98 The data model in AVAP is similar to Python in... Carlos Mendieta \n",
|
||
"99 AVAP™ uses a flexible and dynamic data model s... Carlos Mendieta \n",
|
||
"\n",
|
||
" query_style query_length synthesizer_name \n",
|
||
"0 MISSPELLED LONG single_hop_specific_query_synthesizer \n",
|
||
"1 PERFECT_GRAMMAR LONG single_hop_specific_query_synthesizer \n",
|
||
"2 POOR_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"3 POOR_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
|
||
"4 PERFECT_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
".. ... ... ... \n",
|
||
"95 PERFECT_GRAMMAR MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"96 PERFECT_GRAMMAR SHORT single_hop_specific_query_synthesizer \n",
|
||
"97 WEB_SEARCH_LIKE SHORT single_hop_specific_query_synthesizer \n",
|
||
"98 WEB_SEARCH_LIKE MEDIUM single_hop_specific_query_synthesizer \n",
|
||
"99 WEB_SEARCH_LIKE SHORT single_hop_specific_query_synthesizer \n",
|
||
"\n",
|
||
"[100 rows x 7 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"synthetic_dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "ab1932b7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_716860/244266171.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
|
||
" synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
|
||
"/tmp/ipykernel_716860/244266171.py:3: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
|
||
" generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
|
||
"/tmp/ipykernel_716860/244266171.py:3: DeprecationWarning: LangchainEmbeddingsWrapper is deprecated and will be removed in a future version. Use the modern embedding providers instead: embedding_factory('openai', model='text-embedding-3-small', client=openai_client) or from ragas.embeddings import OpenAIEmbeddings, GoogleEmbeddings, HuggingFaceEmbeddings\n",
|
||
" generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "8ec6ef79b1964c44b78a75ca539f816b",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying SummaryExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "f583879571cf4c818cbb7321b0839990",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying CustomNodeFilter: 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Node 75603049-8ebb-49dc-9e7d-da37fa927eb9 does not have a summary. Skipping filtering.\n",
|
||
"Node c2d1e8b0-ca69-47af-9bcd-39cbf8560edb does not have a summary. Skipping filtering.\n",
|
||
"Node 24c16f65-02fd-4d80-84c6-d7d1a8a2638c does not have a summary. Skipping filtering.\n",
|
||
"Node a0975db8-14b3-44eb-8aa2-e83274fb55ab does not have a summary. Skipping filtering.\n",
|
||
"Node 6768ece8-9a13-42b3-9aec-08e828044420 does not have a summary. Skipping filtering.\n",
|
||
"Node 54719709-293a-49db-86f2-8f697015e16a does not have a summary. Skipping filtering.\n",
|
||
"Node a049eacb-5a3e-404f-83ea-249061fcae0a does not have a summary. Skipping filtering.\n",
|
||
"Node eb4ac1be-55ae-487e-936c-ee43513f25e9 does not have a summary. Skipping filtering.\n",
|
||
"Node baf6b749-0280-46f0-a47b-8fd82373da1b does not have a summary. Skipping filtering.\n",
|
||
"Node 9caa0b62-10ea-4f19-98b7-5f10b2cbc486 does not have a summary. Skipping filtering.\n",
|
||
"Node d28505f3-cdd7-44d1-9c45-9741e27e25c3 does not have a summary. Skipping filtering.\n",
|
||
"Node f9a234cb-1af1-4f06-8d9a-6921c19ffbf5 does not have a summary. Skipping filtering.\n",
|
||
"Node 4f0b355e-81ca-450c-99e3-8458ebd304c6 does not have a summary. Skipping filtering.\n",
|
||
"Node 66cf6447-7639-497c-9ae2-e26b0c7443b5 does not have a summary. Skipping filtering.\n",
|
||
"Node 722bfb38-b24e-483f-9787-253d71716c1e does not have a summary. Skipping filtering.\n",
|
||
"Node ce76bfcc-8cb3-4de2-87e4-74f10ad5c549 does not have a summary. Skipping filtering.\n",
|
||
"Node dada2116-28ae-4d7c-a4ad-f8ccc3952eb1 does not have a summary. Skipping filtering.\n",
|
||
"Node e6f7360d-4309-453a-aab8-d3015d53dd88 does not have a summary. Skipping filtering.\n",
|
||
"Node a73eb1ba-9609-4ad8-80bc-98d9c4993fcd does not have a summary. Skipping filtering.\n",
|
||
"Node 004b6ce2-48a7-4bff-9393-67e963ebe7fc does not have a summary. Skipping filtering.\n",
|
||
"Node 854676ec-e80f-45ef-a84c-08d527b96813 does not have a summary. Skipping filtering.\n",
|
||
"Node 241a936b-3470-41be-8449-7994f3ba5eee does not have a summary. Skipping filtering.\n",
|
||
"Node 28f76e87-5e68-4a63-83a8-e7c4addb855a does not have a summary. Skipping filtering.\n",
|
||
"Node f7e3d432-5073-4004-af6c-683cc7e7a600 does not have a summary. Skipping filtering.\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "a3cf82e356d7485fa6ffa54b131d6a18",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying EmbeddingExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "b14be1e6a8e74d9592860377b5fa0044",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying ThemesExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "65089eea206341f290cda033732df991",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying NERExtractor: 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "3120af625643421eafc48c78fce57d8d",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying CosineSimilarityBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "6a8e81dbde254d6e82d76f3752e211d2",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Applying OverlapScoreBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "59ca74c675f14067a4a665d56b4e29ba",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Generating personas: 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "fa5aee3db9674f6eb50fef7214cadd92",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Generating Scenarios: 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "63030d2b67984b838c055b29d0443639",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Generating Samples: 0%| | 0/100 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Wrap the LangChain models once and share the wrappers, instead of building a\n",
|
||
"# separate LangchainLLMWrapper for the synthesizer and for the generator.\n",
|
||
"ragas_llm = LangchainLLMWrapper(llm)\n",
|
||
"ragas_embeddings = LangchainEmbeddingsWrapper(embeddings)\n",
|
||
"\n",
|
||
"synth = SingleHopSpecificQuerySynthesizer(llm=ragas_llm)\n",
|
||
"generator = TestsetGenerator(llm=ragas_llm, embedding_model=ragas_embeddings)\n",
|
||
"\n",
|
||
"# Keep the generated test set under its own name so that `synthetic_dataset`\n",
|
||
"# is only ever bound to the DataFrame that the other cells read.\n",
|
||
"testset = generator.generate_with_chunks(\n",
|
||
"    chunks=docs,\n",
|
||
"    testset_size=100,\n",
|
||
"    query_distribution=[(synth, 1.0)]\n",
|
||
")\n",
|
||
"synthetic_dataset = testset.to_pandas()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "18ceb119",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Similarity retriever over the vector store, returning the top 3 hits per query.\n",
|
||
"retriever = vector_store.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": 3})\n",
|
||
"\n",
|
||
"# QA chain driven by the agent model; it also returns the documents it retrieved.\n",
|
||
"qa_chain = RetrievalQA.from_chain_type(\n",
|
||
"    llm=agent_llm,\n",
|
||
"    retriever=retriever,\n",
|
||
"    return_source_documents=True,\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "344a1266",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from datasets import Dataset\n",
|
||
"\n",
|
||
"questions = synthetic_dataset[\"user_input\"]\n",
|
||
"ground_truths = synthetic_dataset[\"reference\"]\n",
|
||
"\n",
|
||
"answers = []\n",
|
||
"contexts = []\n",
|
||
"\n",
|
||
"# Run the chain once per question and reuse the documents it returns\n",
|
||
"# (the chain is built with return_source_documents=True). The stored contexts\n",
|
||
"# are then exactly the ones the answer was generated from, and the retriever\n",
|
||
"# is not queried a second time for every question.\n",
|
||
"for query in questions:\n",
|
||
"    qa_output = qa_chain.invoke(query)\n",
|
||
"    answers.append(qa_output[\"result\"])\n",
|
||
"    contexts.append([doc.page_content for doc in qa_output[\"source_documents\"]])\n",
|
||
"\n",
|
||
"# Column-wise payload: one entry per question in every list.\n",
|
||
"data = {\n",
|
||
"    \"question\": questions,\n",
|
||
"    \"answer\": answers,\n",
|
||
"    \"contexts\": contexts,\n",
|
||
"    \"ground_truth\": ground_truths\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Convert dict to dataset\n",
|
||
"dataset = Dataset.from_dict(data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "a9011f94",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "3239c7c9d6254330b9b079a249a74c60",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
"Evaluating: 0%| | 0/700 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Exception in callback Task.__step()\n",
|
||
"handle: <Handle Task.__step()>\n",
|
||
"Traceback (most recent call last):\n",
|
||
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
|
||
" self._context.run(self._callback, *self._args)\n",
|
||
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-3487' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
|
||
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/json_schema.py:335: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
|
||
" mapping[key] = getattr(self, method_name)\n",
|
||
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-3489' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n",
|
||
"Exception in callback Task.__step()\n",
|
||
"handle: <Handle Task.__step()>\n",
|
||
"Traceback (most recent call last):\n",
|
||
" File \"/home/acano/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/asyncio/events.py\", line 84, in _run\n",
|
||
" self._context.run(self._callback, *self._args)\n",
|
||
"RuntimeError: cannot enter context: <_contextvars.Context object at 0x74fe3aa80780> is already entered\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-5636' coro=<_async_in_context.<locals>.run_in_context() done, defined at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/utils.py:57> wait_for=<Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]> cb=[ZMQStream._run_callback.<locals>._log_error() at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py:563]>\n",
|
||
"/home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/pydantic/main.py:716: RuntimeWarning: coroutine 'Kernel.shell_main' was never awaited\n",
|
||
" return cls.__pydantic_validator__.validate_python(\n",
|
||
"RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
|
||
"Task was destroyed but it is pending!\n",
|
||
"task: <Task pending name='Task-5637' coro=<Kernel.shell_main() running at /home/acano/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py:597> cb=[Task.__wakeup()]>\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>user_input</th>\n",
|
||
" <th>retrieved_contexts</th>\n",
|
||
" <th>response</th>\n",
|
||
" <th>reference</th>\n",
|
||
" <th>faithfulness</th>\n",
|
||
" <th>answer_relevancy</th>\n",
|
||
" <th>context_precision</th>\n",
|
||
" <th>context_recall</th>\n",
|
||
" <th>context_entity_recall</th>\n",
|
||
" <th>answer_similarity</th>\n",
|
||
" <th>answer_correctness</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>How does AVAP handel a ZeroDivisionError when ...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>AVAP (Advanced Virtual Application Platform) i...</td>\n",
|
||
" <td>In AVAP, when a division by zero occurs—whethe...</td>\n",
|
||
" <td>0.083333</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.300000</td>\n",
|
||
" <td>0.833670</td>\n",
|
||
" <td>0.363590</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>As a backend developer who is learning AVAP an...</td>\n",
|
||
" <td>[SECTION III: Control Logic and Decision Struc...</td>\n",
|
||
" <td>I can provide information on the if statement ...</td>\n",
|
||
" <td>In AVAP, control flow structures include condi...</td>\n",
|
||
" <td>0.904762</td>\n",
|
||
" <td>0.837564</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.454545</td>\n",
|
||
" <td>0.157895</td>\n",
|
||
" <td>0.809311</td>\n",
|
||
" <td>0.531596</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>hey so in AVAP when i do division by zero what...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>In AVAP, when a division operation is attempte...</td>\n",
|
||
" <td>In AVAP, when you perform a division by zero, ...</td>\n",
|
||
" <td>0.312500</td>\n",
|
||
" <td>0.892255</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.200000</td>\n",
|
||
" <td>0.923880</td>\n",
|
||
" <td>0.472905</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>what happen if file not found when i do import...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>In AVAP, the equivalent of a `FileNotFoundErro...</td>\n",
|
||
" <td>When an import statement is executed in AVAP, ...</td>\n",
|
||
" <td>0.142857</td>\n",
|
||
" <td>0.912031</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.642044</td>\n",
|
||
" <td>0.248746</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>In AVAP, under what circumstances is a TypeErr...</td>\n",
|
||
" <td>[Execution Model in AVAP 4.1. Structure of a P...</td>\n",
|
||
" <td>I don't have information on the specific circu...</td>\n",
|
||
" <td>In AVAP, a TypeError exception is raised in tw...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.806848</td>\n",
|
||
" <td>0.201712</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>95</th>\n",
|
||
" <td>How does the data model in AVAP™ compare to Py...</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>AVAP (Advanced Vector Analysis Platform) is a ...</td>\n",
|
||
" <td>The data model in AVAP™ is very similar to Pyt...</td>\n",
|
||
" <td>0.289474</td>\n",
|
||
" <td>0.925546</td>\n",
|
||
" <td>0.833333</td>\n",
|
||
" <td>0.800000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.736816</td>\n",
|
||
" <td>0.422299</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>96</th>\n",
|
||
" <td>What data types are available in AVAP™?</td>\n",
|
||
" <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
|
||
" <td>I don't know the specific details about the da...</td>\n",
|
||
" <td>In AVAP™, the most common data types include i...</td>\n",
|
||
" <td>0.818182</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.856402</td>\n",
|
||
" <td>0.658545</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>97</th>\n",
|
||
" <td>AVAP strings Unicode</td>\n",
|
||
" <td>[Chapter 2: Notation in AVAP™ Introduction Not...</td>\n",
|
||
" <td>AVAP (African Vertebrate Anatomy Project) uses...</td>\n",
|
||
" <td>In AVAP™, strings (str) represent sequences of...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.918740</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.500000</td>\n",
|
||
" <td>0.686281</td>\n",
|
||
" <td>0.271570</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>98</th>\n",
|
||
" <td>AVAP data model comparison with Python data ty...</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>Here's a comparison of the AVAP data model wit...</td>\n",
|
||
" <td>The data model in AVAP is similar to Python in...</td>\n",
|
||
" <td>0.343750</td>\n",
|
||
" <td>0.954994</td>\n",
|
||
" <td>0.833333</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.555556</td>\n",
|
||
" <td>0.824449</td>\n",
|
||
" <td>0.587930</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>99</th>\n",
|
||
" <td>AVAP™ data types and data structures overview</td>\n",
|
||
" <td>[Introduction The data model in AVAP™ defines ...</td>\n",
|
||
" <td>AVAP (Advanced Visual Analytics Platform) is a...</td>\n",
|
||
" <td>AVAP™ uses a flexible and dynamic data model s...</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.855719</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>1.000000</td>\n",
|
||
" <td>0.100000</td>\n",
|
||
" <td>0.856107</td>\n",
|
||
" <td>0.323783</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>100 rows × 11 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" user_input \\\n",
|
||
"0 How does AVAP handel a ZeroDivisionError when ... \n",
|
||
"1 As a backend developer who is learning AVAP an... \n",
|
||
"2 hey so in AVAP when i do division by zero what... \n",
|
||
"3 what happen if file not found when i do import... \n",
|
||
"4 In AVAP, under what circumstances is a TypeErr... \n",
|
||
".. ... \n",
|
||
"95 How does the data model in AVAP™ compare to Py... \n",
|
||
"96 What data types are available in AVAP™? \n",
|
||
"97 AVAP strings Unicode \n",
|
||
"98 AVAP data model comparison with Python data ty... \n",
|
||
"99 AVAP™ data types and data structures overview \n",
|
||
"\n",
|
||
" retrieved_contexts \\\n",
|
||
"0 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"1 [SECTION III: Control Logic and Decision Struc... \n",
|
||
"2 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"3 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
"4 [Execution Model in AVAP 4.1. Structure of a P... \n",
|
||
".. ... \n",
|
||
"95 [Introduction The data model in AVAP™ defines ... \n",
|
||
"96 [Chapter 5: Data Types In this chapter, we wil... \n",
|
||
"97 [Chapter 2: Notation in AVAP™ Introduction Not... \n",
|
||
"98 [Introduction The data model in AVAP™ defines ... \n",
|
||
"99 [Introduction The data model in AVAP™ defines ... \n",
|
||
"\n",
|
||
" response \\\n",
|
||
"0 AVAP (Advanced Virtual Application Platform) i... \n",
|
||
"1 I can provide information on the if statement ... \n",
|
||
"2 In AVAP, when a division operation is attempte... \n",
|
||
"3 In AVAP, the equivalent of a `FileNotFoundErro... \n",
|
||
"4 I don't have information on the specific circu... \n",
|
||
".. ... \n",
|
||
"95 AVAP (Advanced Vector Analysis Platform) is a ... \n",
|
||
"96 I don't know the specific details about the da... \n",
|
||
"97 AVAP (African Vertebrate Anatomy Project) uses... \n",
|
||
"98 Here's a comparison of the AVAP data model wit... \n",
|
||
"99 AVAP (Advanced Visual Analytics Platform) is a... \n",
|
||
"\n",
|
||
" reference faithfulness \\\n",
|
||
"0 In AVAP, when a division by zero occurs—whethe... 0.083333 \n",
|
||
"1 In AVAP, control flow structures include condi... 0.904762 \n",
|
||
"2 In AVAP, when you perform a division by zero, ... 0.312500 \n",
|
||
"3 When an import statement is executed in AVAP, ... 0.142857 \n",
|
||
"4 In AVAP, a TypeError exception is raised in tw... 0.000000 \n",
|
||
".. ... ... \n",
|
||
"95 The data model in AVAP™ is very similar to Pyt... 0.289474 \n",
|
||
"96 In AVAP™, the most common data types include i... 0.818182 \n",
|
||
"97 In AVAP™, strings (str) represent sequences of... 0.000000 \n",
|
||
"98 The data model in AVAP is similar to Python in... 0.343750 \n",
|
||
"99 AVAP™ uses a flexible and dynamic data model s... 0.000000 \n",
|
||
"\n",
|
||
" answer_relevancy context_precision context_recall \\\n",
|
||
"0 0.000000 1.000000 1.000000 \n",
|
||
"1 0.837564 1.000000 0.454545 \n",
|
||
"2 0.892255 1.000000 1.000000 \n",
|
||
"3 0.912031 1.000000 1.000000 \n",
|
||
"4 0.000000 1.000000 1.000000 \n",
|
||
".. ... ... ... \n",
|
||
"95 0.925546 0.833333 0.800000 \n",
|
||
"96 0.000000 0.500000 1.000000 \n",
|
||
"97 0.918740 0.000000 0.000000 \n",
|
||
"98 0.954994 0.833333 1.000000 \n",
|
||
"99 0.855719 1.000000 1.000000 \n",
|
||
"\n",
|
||
" context_entity_recall answer_similarity answer_correctness \n",
|
||
"0 0.300000 0.833670 0.363590 \n",
|
||
"1 0.157895 0.809311 0.531596 \n",
|
||
"2 0.200000 0.923880 0.472905 \n",
|
||
"3 1.000000 0.642044 0.248746 \n",
|
||
"4 1.000000 0.806848 0.201712 \n",
|
||
".. ... ... ... \n",
|
||
"95 0.500000 0.736816 0.422299 \n",
|
||
"96 0.500000 0.856402 0.658545 \n",
|
||
"97 0.500000 0.686281 0.271570 \n",
|
||
"98 0.555556 0.824449 0.587930 \n",
|
||
"99 0.100000 0.856107 0.323783 \n",
|
||
"\n",
|
||
"[100 rows x 11 columns]"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"metrics = [\n",
|
||
" faithfulness,\n",
|
||
" answer_relevancy,\n",
|
||
" context_precision,\n",
|
||
" context_recall,\n",
|
||
" context_entity_recall,\n",
|
||
" answer_similarity,\n",
|
||
" answer_correctness\n",
|
||
"]\n",
|
||
"\n",
|
||
"result = evaluate(\n",
|
||
" dataset=dataset, \n",
|
||
" metrics=metrics,\n",
|
||
" llm=llm,\n",
|
||
" embeddings=embeddings,\n",
|
||
")\n",
|
||
"\n",
|
||
"result_df = result.to_pandas()\n",
|
||
"result_df"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "20c3fa64",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"faithfulness 0.254643\n",
|
||
"answer_relevancy 0.609250\n",
|
||
"context_precision 0.862500\n",
|
||
"context_recall 0.906242\n",
|
||
"context_entity_recall 0.354178\n",
|
||
"answer_similarity 0.781973\n",
|
||
"answer_correctness 0.359654\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"result_df.mean(numeric_only=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "350755fd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"result_df.to_csv(\"/home/acano/PycharmProjects/assistance-engine/data/interim/embedding_eval_results/retrieve_eval_results/ragas_eval.csv\", index=False)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "assistance-engine",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.13"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|