assistance-engine/scratches/acano/evaluate_retrieve.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fed4518",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_similarity\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n",
      "  from ragas.metrics import (\n",
      "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n",
      "  from ragas.metrics import (\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Ensure the project root is on the path so `src` is importable\n",
    "_project_root = str(Path(__file__).resolve().parents[2]) if \"__file__\" in dir() else str(Path.cwd().parents[1])\n",
    "if _project_root not in sys.path:\n",
    "    sys.path.insert(0, _project_root)\n",
    "\n",
    "import pandas as pd\n",
    "from datasets import Dataset\n",
    "from langchain_core.documents import Document\n",
    "from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n",
    "from langchain_elasticsearch import ElasticsearchStore\n",
    "from langchain_core.messages import HumanMessage\n",
    "from ragas import evaluate, SingleTurnSample\n",
    "from ragas.llms import LangchainLLMWrapper\n",
    "from ragas.embeddings import LangchainEmbeddingsWrapper\n",
    "from ragas.testset import TestsetGenerator\n",
    "from ragas.testset.persona import Persona\n",
    "from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer\n",
    "from ragas.metrics import (\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    context_recall,\n",
    "    context_precision,\n",
    "    context_entity_recall,\n",
    "    answer_similarity,\n",
    "    answer_correctness,\n",
    "    NonLLMContextRecall,\n",
    "    NonLLMContextPrecisionWithReference\n",
    ")\n",
    "\n",
    "from src.utils.llm_factory import create_chat_model\n",
    "from src.utils.emb_factory import create_embedding_model\n",
    "from src.config import (\n",
    "    ELASTICSEARCH_LOCAL_URL,\n",
    "    ELASTICSEARCH_INDEX,\n",
    "    OLLAMA_MODEL_NAME,\n",
    "    OLLAMA_EMB_MODEL_NAME,\n",
    "    OLLAMA_LOCAL_URL,\n",
    "    RAW_DIR,\n",
    "    INTERIM_DIR\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "4426d6c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = create_chat_model(\n",
    "    provider=\"bedrock\",\n",
    "    model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "    temperature=0,\n",
    ")\n",
    "embeddings = create_embedding_model(\n",
    "    provider=\"ollama\",\n",
    "    model=OLLAMA_EMB_MODEL_NAME,\n",
    ")\n",
    "agent_llm = create_chat_model(\n",
    "    provider=\"ollama\",\n",
    "    model=OLLAMA_MODEL_NAME,\n",
    "    temperature=0,\n",
    "    validate_model_on_init=True,\n",
    ")\n",
    "vector_store = ElasticsearchStore(\n",
    "    es_url=ELASTICSEARCH_LOCAL_URL,\n",
    "    index_name=ELASTICSEARCH_INDEX,\n",
    "    embedding=embeddings,\n",
    "    query_field=\"text\",\n",
    "    vector_query_field=\"vector\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2a6ab91",
   "metadata": {},
   "source": [
    "### Build langgraph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7f9fc4de",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import TypedDict, List, Annotated\n",
    "\n",
    "from langchain_core.messages import SystemMessage\n",
    "from langgraph.graph.message import add_messages\n",
    "from langchain_elasticsearch import ElasticsearchStore\n",
    "from langgraph.graph import StateGraph, END\n",
    "\n",
    "from src.llm_factory import create_chat_model\n",
    "from src.emb_factory import create_embedding_model\n",
    "from src.config import (\n",
    "    ELASTICSEARCH_LOCAL_URL,\n",
    "    ELASTICSEARCH_INDEX,\n",
    "    OLLAMA_MODEL_NAME,\n",
    "    OLLAMA_EMB_MODEL_NAME\n",
    ")\n",
    "\n",
    "class AgentState(TypedDict, total=False):\n",
    "    messages: Annotated[list, add_messages]\n",
    "    reformulated_query: str\n",
    "    context: str            \n",
    "    contexts: List[str]\n",
    "\n",
    "REFORMULATE_PROMPT = SystemMessage(\n",
    "    content=(\n",
    "        \"You are a deterministic lexical query rewriter used for vector retrieval.\\n\"\n",
    "        \"Your task is to rewrite user questions into optimized keyword search queries.\\n\\n\"\n",
    "\n",
    "        \"CRITICAL RULES (ABSOLUTE):\\n\"\n",
    "        \"1. NEVER answer the question.\\n\"\n",
    "        \"2. NEVER expand acronyms.\\n\"\n",
    "        \"3. NEVER introduce new terms not present in the original query.\\n\"\n",
    "        \"4. NEVER infer missing information.\\n\"\n",
    "        \"5. NEVER add explanations, definitions, or interpretations.\\n\"\n",
    "        \"6. Preserve all technical tokens exactly as written.\\n\"\n",
    "        \"7. Only remove filler words (e.g., what, does, is, explain, tell me, please).\\n\"\n",
    "        \"8. You may reorder terms for better retrieval.\\n\"\n",
    "        \"9. Output must be a single-line plain keyword query.\\n\"\n",
    "        \"10. If the query is already optimal, return it unchanged.\\n\\n\"\n",
    "        \"11. If you receive something that looks like code, do NOT attempt to rewrite it. Return it verbatim.\\n\\n\"\n",
    "\n",
    "        \"ALLOWED OPERATIONS:\\n\"\n",
    "        \"- Remove interrogative phrasing.\\n\"\n",
    "        \"- Remove stopwords.\\n\"\n",
    "        \"- Reorder words.\\n\"\n",
    "        \"- Convert to noun phrase form.\\n\\n\"\n",
    "\n",
    "        \"FORBIDDEN OPERATIONS:\\n\"\n",
    "        \"- Expanding abbreviations.\\n\"\n",
    "        \"- Paraphrasing into unseen vocabulary.\\n\"\n",
    "        \"- Adding definitions.\\n\"\n",
    "        \"- Answering implicitly.\\n\\n\"\n",
    "\n",
    "        \"Examples:\\n\"\n",
    "        \"Input: What does AVAP stand for?\\n\"\n",
    "        \"Output: AVAP stand for\\n\"\n",
    "\n",
    "        \"Input: Hey, I'm trying to understand how AVAP handels a ZeroDivisionError when doing divison or modulus operatoins. Can you explane what situatoins cause a ZeroDivisionError to be raised and how I can catch it in my AVAP scripts?\\n\"\n",
    "        \"Output: AVAP ZeroDivisionError division / modulus % catch try except\\n\"\n",
    "        \n",
    "        \"Input: What does AVAP stand for?\\n\"\n",
    "        \"Output: AVAP stand for\\n\"\n",
    "\n",
    "        \"Input: Please explain how the import statement works in AVAP scripts.\\n\"\n",
    "        \"Output: AVAP import statement syntax behavior\\n\\n\"\n",
    "\n",
    "        \"Return only the rewritten query.\"\n",
    "    )\n",
    ")\n",
    "\n",
    "GENERATE_PROMPT = SystemMessage(\n",
    "    content=(\n",
    "        \"\"\"You are a retrieval-grounded assistant.\n",
    "\n",
    "        CRITICAL SCOPE RULES\n",
    "        - AVAP (Advanced Virtual API Programming) is a new/unknown programming language.\n",
    "        - If the user message is AVAP-related (explicitly mentions \"AVAP\" case-insensitive OR the provided context is AVAP code/docs), you MUST answer using ONLY <CONTEXT>.\n",
    "        - You MUST NOT use general programming knowledge to interpret AVAP.\n",
    "        - You MUST NOT assume AVAP behaves like any other language even if syntax looks similar.\n",
    "        - You MUST NOT infer missing details. Only state what is explicitly present in <CONTEXT>.\n",
    "\n",
    "        GROUNDING REQUIREMENTS (AVAP-RELATED)\n",
    "        1) Every non-trivial factual claim MUST be directly supported by an EXACT QUOTE from <CONTEXT>.\n",
    "        2) If a claim is not supported by a quote, DO NOT include it.\n",
    "        3) If <CONTEXT> does not contain enough information to answer, reply with EXACTLY:\n",
    "        \"I don't have enough information in the provided context to answer that.\"\n",
    "\n",
    "        WORKFLOW (AVAP-RELATED) — FOLLOW IN ORDER\n",
    "        A) Identify the specific question(s) being asked.\n",
    "        B) Extract the minimum necessary quotes from <CONTEXT> that answer those question(s).\n",
    "        C) Write the answer using ONLY those quotes (paraphrase is allowed, but every statement must be backed by at least one quote).\n",
    "        D) Verify: for EACH sentence in your answer, confirm there is a supporting quote. If any sentence lacks a quote, delete it or refuse.\n",
    "\n",
    "        OUTPUT FORMAT (AVAP-RELATED ONLY)\n",
    "        Answer:\n",
    "        <short, direct answer; no extra speculation; no unrelated tips>\n",
    "\n",
    "        Evidence:\n",
    "        - \"<exact quote 1>\"\n",
    "        - \"<exact quote 2>\"\n",
    "        (Include only quotes you actually used. Prefer the smallest quotes that fully support the statements.)\n",
    "\n",
    "        NON-AVAP QUESTIONS\n",
    "        - If the question is clearly not AVAP-related, answer normally using general knowledge.\n",
    "\n",
    "        <CONTEXT>\n",
    "        {context}\n",
    "        </CONTEXT>\"\"\"\n",
    "    )\n",
    ")\n",
    "\n",
    "retrieve_kwargs = {\n",
    "    \"k\": 3\n",
    "}\n",
    "\n",
    "def format_context(docs: List[Document]) -> str:\n",
    "    chunks: List[str] = []\n",
    "    for i, doc in enumerate(docs, 1):\n",
    "        source = (doc.metadata or {}).get(\"source\", \"Untitled\")\n",
    "        source_id = (doc.metadata or {}).get(\"id\", f\"chunk-{i}\")\n",
    "        text = doc.page_content or \"\"\n",
    "        chunks.append(f\"[{i}] id={source_id} source={source}\\n{text}\")\n",
    "    return \"\\n\\n\".join(chunks)\n",
    "\n",
    "def reformulate(state: AgentState) -> AgentState:\n",
    "    \"\"\"Use the LLM to rewrite the user query for better retrieval.\"\"\"\n",
    "    user_msg = state[\"messages\"][-1]\n",
    "    resp = llm.invoke([REFORMULATE_PROMPT, user_msg])\n",
    "    reformulated = resp.content.strip()\n",
    "    print(f\"[reformulate] '{user_msg.content}' → '{reformulated}'\")\n",
    "    return {\"reformulated_query\": reformulated}\n",
    "\n",
    "\n",
    "def retrieve(state: AgentState) -> AgentState:\n",
    "    \"\"\"Retrieve context using the reformulated query.\"\"\"\n",
    "    query = state[\"reformulated_query\"]\n",
    "    docs = vector_store.as_retriever(\n",
    "        search_type=\"similarity\",\n",
    "        search_kwargs=retrieve_kwargs,\n",
    "    ).invoke(query)\n",
    "\n",
    "    context = format_context(docs)\n",
    "    contexts = [d.page_content or \"\" for d in docs]  # <-- for Dataset\n",
    "\n",
    "    print(f\"[retrieve] {len(docs)} docs fetched\")\n",
    "    return {\"context\": context, \"contexts\": contexts}\n",
    "\n",
    "\n",
    "def generate(state: AgentState) -> AgentState:\n",
    "    \"\"\"Generate the final answer using retrieved context.\"\"\"\n",
    "    prompt = SystemMessage(\n",
    "        content=GENERATE_PROMPT.content.format(context=state[\"context\"])\n",
    "    )\n",
    "    resp = llm.invoke([prompt] + state[\"messages\"])\n",
    "    return {\"messages\": [resp]}\n",
    "\n",
    "\n",
    "graph_builder = StateGraph(AgentState)\n",
    "\n",
    "graph_builder.add_node(\"reformulate\", reformulate)\n",
    "graph_builder.add_node(\"retrieve\", retrieve)\n",
    "graph_builder.add_node(\"generate\", generate)\n",
    "\n",
    "graph_builder.set_entry_point(\"reformulate\")\n",
    "graph_builder.add_edge(\"reformulate\", \"retrieve\")\n",
    "graph_builder.add_edge(\"retrieve\", \"generate\")\n",
    "graph_builder.add_edge(\"generate\", END)\n",
    "\n",
    "graph = graph_builder.compile()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b723a42",
   "metadata": {},
   "source": [
    "### Create synthethic data (question, context, answer with SoTA model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "fe524d14",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 28 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n"
     ]
    }
   ],
   "source": [
    "docs: list[Document] = []\n",
    "for txt_file in sorted((RAW_DIR / \"docs\").glob(\"*.txt\")):\n",
    "    text = txt_file.read_text(encoding=\"utf-8\")\n",
    "    docs.append(Document(page_content=text, metadata={\"source\": txt_file.name}))\n",
    "\n",
    "print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ab1932b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_102010/1545617568.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
      "  synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
      "/tmp/ipykernel_102010/1545617568.py:3: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
      "  generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
      "/tmp/ipykernel_102010/1545617568.py:3: DeprecationWarning: LangchainEmbeddingsWrapper is deprecated and will be removed in a future version. Use the modern embedding providers instead: embedding_factory('openai', model='text-embedding-3-small', client=openai_client) or from ragas.embeddings import OpenAIEmbeddings, GoogleEmbeddings, HuggingFaceEmbeddings\n",
      "  generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "78fe99b4108b4731845407d26779bf25",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying SummaryExtractor:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c94617550c4047d984ecf4e514242856",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying CustomNodeFilter:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Node 4f0571c5-9042-41cf-af53-7fed5673241f does not have a summary. Skipping filtering.\n",
      "Node c0d7490e-84eb-42bb-bfa2-2d54fb7c40c8 does not have a summary. Skipping filtering.\n",
      "Node bb351182-7859-4869-aedc-d4b90aeec552 does not have a summary. Skipping filtering.\n",
      "Node 9fffb9df-a036-4eee-9cee-8b1d0f282991 does not have a summary. Skipping filtering.\n",
      "Node 1f68ffd9-81ae-4f08-80f9-b599d65cc273 does not have a summary. Skipping filtering.\n",
      "Node ad4bcf0b-5266-48dd-8f43-1ceee702f45a does not have a summary. Skipping filtering.\n",
      "Node 6e1adf46-815f-4b40-bb77-a79ff81da087 does not have a summary. Skipping filtering.\n",
      "Node 95fc5a8c-0c51-4f18-b867-b474500ff984 does not have a summary. Skipping filtering.\n",
      "Node edaf8f1f-42aa-4894-bcbd-4f9a3842ea9c does not have a summary. Skipping filtering.\n",
      "Node 8284c3a9-4625-46a3-8e76-8ec9c958cb5f does not have a summary. Skipping filtering.\n",
      "Node 748c02e3-4986-40a1-ae36-7f3430258e2f does not have a summary. Skipping filtering.\n",
      "Node f28f2329-7024-47e5-abb4-3288450e4f0d does not have a summary. Skipping filtering.\n",
      "Node ac278604-0eab-428d-8066-8da0f9376e7f does not have a summary. Skipping filtering.\n",
      "Node 6704bf59-e21d-4e4e-b17c-dc12a855655e does not have a summary. Skipping filtering.\n",
      "Node 628662bf-cf73-4516-8cb1-da5140a0c0e1 does not have a summary. Skipping filtering.\n",
      "Node 0956b6a3-fa32-49ed-8847-1b0e84388105 does not have a summary. Skipping filtering.\n",
      "Node 368a21ac-6d92-42ff-a051-a8f59d6282e7 does not have a summary. Skipping filtering.\n",
      "Node 394532c7-dc1f-4315-a7ca-9892e78149d8 does not have a summary. Skipping filtering.\n",
      "Node 97462a75-214e-4ddd-ba50-d7a192a8c84d does not have a summary. Skipping filtering.\n",
      "Node f67f8ba3-963d-48d0-923c-5f0004672c41 does not have a summary. Skipping filtering.\n",
      "Node e4e794c4-65b2-4cd4-8c8b-460b3a217c4e does not have a summary. Skipping filtering.\n",
      "Node 72b0b65b-e43f-4eac-987b-9ef23b6d3904 does not have a summary. Skipping filtering.\n",
      "Node c0123882-a5a6-4e06-a820-d3983875388f does not have a summary. Skipping filtering.\n",
      "Node 2a2cfe9a-9ee7-4356-aafa-9530ac6f208d does not have a summary. Skipping filtering.\n",
      "Node d1c66d5d-b40c-4262-b775-f5737cccfc66 does not have a summary. Skipping filtering.\n",
      "Node 592bd141-ef0a-4bf5-97dc-5753ea1d9d82 does not have a summary. Skipping filtering.\n",
      "Node 62eedd3d-d2db-4634-b946-eda09c32b11c does not have a summary. Skipping filtering.\n",
      "Node 8e62281d-3e15-4cae-82be-989e0d88641b does not have a summary. Skipping filtering.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "574402162a9440c9be2e1dcc6b1d9467",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying EmbeddingExtractor:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "12c7928b642340ec9deab214b6dbec6e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying ThemesExtractor:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1f972852844d4e71bd85041ce23352eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying NERExtractor:   0%|          | 0/28 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3ca54cef2a81486a867e38426276b726",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e5d0177732d44bb8e2ce093ad406b74",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a6be98aa93474b4a911d112f78981c9f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "de562456a87f4f78931b7fcfff0e025b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0b917029036b442a894dc11b8ad3e73f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
    "\n",
    "generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n",
    "synthetic_dataset = generator.generate_with_chunks(\n",
    "    chunks=docs,\n",
    "    testset_size=100,\n",
    "    query_distribution=[(synth, 1.0)]\n",
    ")\n",
    "synthetic_dataset = synthetic_dataset.to_pandas()\n",
    "synthetic_dataset.to_csv(INTERIM_DIR / \"retrieve_eval_results/synthetic_dataset.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "698e060c",
   "metadata": {},
   "source": [
    "### Answer questions with agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "344a1266",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[reformulate] 'So I been trying to understand how AVAP do the arithmetic conversions when you got different numeric types like complex and float and integer, can someone explain me what happens step by step when the arguments is different types because I want to make sure I not getting bugs in my code from wrong type conversions?' → 'AVAP arithmetic conversions numeric types complex float integer different types step by step type coercion rules'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey, I'm trying to undrestand how arithmatic conversions work in AVAP when you have mixed numberic types like complex and floats — can you explane the full set of rules for how AVAP converts numberic arguments to a comon type when using arithmatic operators?' → 'AVAP arithmetic conversions mixed numeric types complex float rules numeric arguments common type arithmetic operators'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey, I'm trying to undrestand how sliceing works in AVAP and how the __getitem__ methd is used internaly when you do a slice operaton - can you explane the full semantcs of how slice keys are constructd and passed to __getitem__?' → 'AVAP slicing __getitem__ slice operation semantics slice keys constructed passed internally'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the __call__ method relate to callable objects in AVAP?' → '__call__ method callable objects AVAP'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does Decimel work in Python value comparisions?' → 'Decimal Python value comparisons'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the Decimal type from the standrd library work with value comparisions and other numberic types in AVAP, and what are the limitatoins?' → 'AVAP Decimal standard library value comparisons numeric types limitations'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Under what circumstances does a TypeError occur during comparison operations?' → 'TypeError comparison operations circumstances causes'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'According to the AVAP documentation, what does PEP 8 recommend regarding comparisons for singleton objects such as None and NotImplemented?' → 'PEP 8 comparisons singleton objects None NotImplemented AVAP documentation'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'hey so i been reading about expression statements in AVAP and i dont really get how repr() work when you in interactive mode, can someone explain what it do with the value?' → 'AVAP expression statements repr() interactive mode value behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP programming language how do simple statements work and what types of simple statements are available in AVAP syntax' → 'AVAP simple statements types syntax'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what types of simple statements AVAP have?' → 'AVAP simple statements types'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How AVAP future statement is related to Python?' → 'AVAP future statement relation Python'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so in AVAP when I use True in a literal pattern how does it work exactly, like what it match against?' → 'AVAP True literal pattern match behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the literal patern for False work in AVAP match statments, and what are the other literal paterns availble alongside it?' → 'AVAP match statement False literal pattern literal patterns available'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does a class pattern involving MyClass work in AVAP's match statements?' → 'AVAP match statement class pattern MyClass'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the literal pattern for False work in AVAP's match statements, and what is the expected behavior when a value matches it?' → 'AVAP match statement literal pattern False value matching behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how does main.avap import and execute functions from other files in AVAP' → 'main.avap import execute functions other files AVAP'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP import operations.avap example how functions are used after import' → 'AVAP import operations.avap example functions usage after import'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'What happens in AVAP when the file specifed in an import statment is not found - does it raise a FileNotFoundEror?' → 'AVAP import statement file not found FileNotFoundError'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'hey so i been trying to understand like what exactly happen when you use a variable name in AVAP but it not defined nowhere in the scope, like does it throw a NameError or something else and also what about if you call a function before it is defined in the file does same thing happen with NameError too?' → 'AVAP undefined variable NameError scope undeclared function call before defined NameError'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the IF-THEN-ESLE statment work in AVAP™?' → 'IF THEN ELSE statement AVAP'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been trying to understand how IF-THEN-ELSE work in AVAP, like what happen when the condition is true and what happen when it false? Can someone explain me the whole flow?' → 'AVAP IF-THEN-ELSE condition true false flow'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the IF-THEN-ELSE statement work in AVAP™?' → 'IF THEN ELSE statement AVAP™'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP IF-THEN-ELSE statement how does API result work with conditions' → 'AVAP IF-THEN-ELSE statement API result conditions'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what addVar do in the loop example?' → 'addVar loop example'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the loop statement in AVAP interact with the API result, and what function is used to add a variable's value to the API response?' → 'AVAP loop statement API result function add variable value API response'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'endLoop() AVAP what happens after loop ends' → 'AVAP endLoop() behavior after loop ends'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'hey so i been trying to understand how addResult work in AVAP, like after a loop finish and you want to send back the value to the API response, how does addResult do that and can you show me the example from the loop code because i not sure how it connect to the final result that the API give back?' → 'AVAP addResult loop API response final result example'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'As a developer working with the AVAP programming language, I would like to understand how the addParam() function operates internally and what considerations I should keep in mind when using it to construct API calls in my applications.' → 'AVAP addParam() function internal operation API calls considerations'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the addParam() function work internally when constructing an API call in AVAP?' → 'addParam() function internal API call construction AVAP'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'As a developer working with AVAP™, I would like to understand how the addParam() function operates internally and what important considerations I should keep in mind when using it to construct API calls in my applications.' → 'AVAP addParam() function internal operation considerations API calls construction'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been trying to learn AVAP™ and I want to know like what is the addParam() function doing internally when you use it in a API call and also what things I need to be careful about when I using it because I dont want to mess up my application?' → 'AVAP addParam() function internal behavior API call usage precautions best practices'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the syntax for including files in AVAP compare to the syntax used in C, and what are the two main methods available for including files in AVAP?' → 'AVAP include files syntax C comparison two methods'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'In the AVAP programming language, how would I include a project-specific file named utils.avap into my main project, and what are the key advantages of doing so?' → 'AVAP include project file utils.avap main project advantages'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does AVAP™ utilize includes and Function Products to promote modularity and extend the capabilities of the base language?' → 'AVAP™ includes Function Products modularity extend base language capabilities'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how do AVS servers relate to function products and function libraries in AVAP development' → 'AVS servers function products function libraries AVAP development relationship'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how functions work in AVAP™ and what they do?' → 'AVAP™ functions work behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How AVAP functions is similar to Python?' → 'AVAP functions similar Python'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How do functons work in AVAP™ and what are the technicall fetures like paramters, return valeus, and scoping rules that I need to undrestand as a devloper lerning the langauge?' → 'AVAP functions parameters return values scoping rules features'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How do u define functons in AVAP™?' → 'define functions AVAP™'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the getDateTime() comand use pytx for timezones?' → 'getDateTime() command pytx timezones'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how does getDateTime command use pytz library time zones to convert current date and time in different formats' → 'getDateTime command pytz library time zones convert current date time formats'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how to extract value from JSON object by key python variableFromJSON' → 'extract value JSON object key python variableFromJSON'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the getDateTime() command handle UTC as a time zone parameter?' → 'getDateTime() command UTC time zone parameter handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP how to build JSON response with multiple variables using addResult and addVar' → 'AVAP build JSON response multiple variables addResult addVar'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the registerEndpont command handle HTTP methods that dont match the request?' → 'registerEndpont HTTP methods mismatch request handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does AVAP handle arithmatic like Pyhton?' → 'AVAP arithmetic handling Python'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the registerEndpoint command handle a POST method request that does not match the specified HTTP verb?' → 'registerEndpoint command POST method request mismatch specified HTTP verb handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'As a developer learning AVAP, I would like to understand how the addParam command interacts with a JSON body in an incoming HTTP request and what the cascading search priority mechanism looks like when extracting parameters from multiple sources.' → 'AVAP addParam command JSON body HTTP request cascading search priority mechanism extracting parameters multiple sources'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how AVAP handle when same parameter sent multiple times?' → 'AVAP handle same parameter sent multiple times duplicate parameter'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'In the context of the AVAP programming language, how does the addParam command handle encoding when extracting parameters from an incoming HTTP request, particularly with respect to UTF-8, and what happens if the requested parameter is not found in any of the available sources?' → 'AVAP addParam command encoding UTF-8 HTTP request parameter extraction not found behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the addParam comand in AVAP handle ASCI decoding when it recieves paramaters from an HTTP request, and what hapens if the paramater is not found in any sorce?' → 'AVAP addParam ASCII decoding HTTP request parameters not found source'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the loop structure in AVAP reference data captured in Section II, and what is the mechanism for processing lists within a startLoop block?' → 'AVAP loop structure Section II reference data captured startLoop block mechanism processing lists'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP RequestGet usage inside try catch error handling example' → 'AVAP RequestGet try catch error handling usage example'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how AVAP handle HTTP request when it fail?' → 'AVAP handle HTTP request failure error handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what is Section V about in the try catch example?' → 'Section V try catch example'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP™ virtuality attribute explained' → 'AVAP™ virtuality attribute'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been learning AVAP and I want to know how does OpenAI work with the language like what it do exactly and how it help with databases and stuff because I not really understanding the integration part can you explain it to me in detail?' → 'AVAP OpenAI integration databases functionality details'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the virtuality attribute in Advance Virtual API Programming (AVAP) enable dynamic code construction, and what benefits does this provide for API development?' → 'AVAP virtuality attribute dynamic code construction benefits API development'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP programming language Europe availability features syntax virtualization APIs what makes it special' → 'AVAP programming language Europe availability features syntax virtualization APIs special'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I trying to understand, if I want to query like multiple databases in Asia region at same time, how AVAP handle the parallel execution and what happen if one take too long?' → 'AVAP parallel execution query multiple databases Asia region timeout handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey, I'm trying to understand how RequstPost works inside an async thred in AVAP — like, can you explane what happens when you use RequestPost inside a go_async block and what the output looks like if you dont use gather before addResult?' → 'AVAP RequestPost async go_async block gather addResult output behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been trying to understand how go_async work in AVAP and like what happen with the variables when you launch a new thread, does the new thread get access to same variables or it get its own copy? Also what happen to the main flow after go_async is called, does it wait or keep going? I need to know all this stuff because I building an app that need to send emails in background without making user wait' → 'AVAP go_async thread variables copy access main flow execution wait continue asynchronous background'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what hapens with variabels when go_async runs?' → 'variables go_async runs behavior'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP ormDirect how to use for SQL statements that don't return rows' → 'AVAP ormDirect SQL statements no return rows usage'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how does AVAP avapConnector work with Slack API integration and what methods are exposed for third-party connectors' → 'AVAP avapConnector Slack API integration methods third-party connectors'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the ormAccessSelect command work in AVAP, and what type of data structure does it return when performing filtered queries?' → 'ormAccessSelect command AVAP data structure return filtered queries'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so in AVAP how do the ormDirect and the .query() method is different from each other, like when you supposed to use which one?' → 'AVAP ormDirect .query() method difference usage comparison'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP getDateTime timeDelta token expiration' → 'AVAP getDateTime timeDelta token expiration'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How do I use the getDateTyme function in AVAP to get the curent UTC timestamp and also calculte an experation date using UTC timezone?' → 'AVAP getDateTyme UTC timestamp expiration date timezone calculation'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the randonString function work in AVAP and what are its main aplications?' → 'AVAP randonString function applications'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP encodeSHA256 command how does SHA256 hashing work for secure password comparison without storing plaintext' → 'AVAP encodeSHA256 command SHA256 hashing secure password comparison plaintext'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'As a developer learning AVAP, I would like to understand how functions in AVAP manage variable scope and memory isolation to prevent side effects, and how the return command facilitates data transfer and cleanup within the function architecture.' → 'AVAP functions variable scope memory isolation side effects return command data transfer cleanup function architecture'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP function local scope and variable isolation how does it work' → 'AVAP function local scope variable isolation'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'how do AVAP functions work as middleware for API key verification' → 'AVAP functions middleware API key verification'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'hey so i been trying to learn AVAP and i want to know like when you make a function and it finish running what happen to the variables that was inside it and also how does the return command work exactly because i not sure if the local variables stay or get deleted after the function done executing?' → 'AVAP function local variables scope lifetime return command execution deleted'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'What does SECTION I handle in the master example's real flow?' → 'SECTION I master example real flow'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'SECTION II in master example what does it handle registration input response' → 'SECTION II master example registration input response handling'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'SECTION III AVAP validation example' → 'SECTION III AVAP validation example'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'In the Master Example that combines multiple sections, how does SECTION III handle validation and what specific expression syntax is used to check whether a required parameter is missing?' → 'Master Example SECTION III validation required parameter missing expression syntax check'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been trying to understand AVAP™ better and I wanna know like what is the thing with the runtime interpreter in AVAP™ because I heard it does something special with code during execution and how is it different from just running code line by line and also what does it mean that language specifications is isolated from the interpreter can someone explain all of this to me?' → 'AVAP™ runtime interpreter code execution language specifications isolated interpreter'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does AVAP™ handl memmory managment automaticaly without devlopers needing to do it manualy?' → 'AVAP™ automatic memory management'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what make AVAP™ runtime interpreter different from normal ones?' → 'AVAP™ runtime interpreter differences unique features'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP™ runtime interpreter dynamic code construction how does it work' → 'AVAP™ runtime interpreter dynamic code construction'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does the BNF grammar notation used in AVAP™ define the structure of an expression and its supported operators?' → 'AVAP BNF grammar notation expression structure supported operators definition'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'what operators AVAP™ support in expressions?' → 'AVAP™ operators expressions supported'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does AVAP use a modified Backus-Naur fourm (BNF) grammer notation to define its syntax and what operators does it support?' → 'AVAP modified Backus-Naur BNF grammer notation syntax definition operators supported'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'As a software developer learning AVAP, I would like to understand how the modified Backus–Naur form grammar notation is used in AVAP to define the language's lexical analysis and syntax, and specifically, how does it describe the structure of a program, statements, and expressions?' → 'AVAP modified Backus-Naur form grammar notation lexical analysis syntax program structure statements expressions'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been reading about AVAP keywords and I see ormCheckTable listed there, can you tell me what it is and why I cant use it as variable name or something?' → 'AVAP ormCheckTable keyword reserved variable name'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP keywords list' → 'AVAP keywords list'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'In the AVAP programming language, what is the role of ormCheckTable, and how is it classified within the language's lexical components?' → 'AVAP ormCheckTable role classification lexical components'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'RequestPost keyword in AVAP lexical analysis what is it' → 'RequestPost keyword AVAP lexical analysis'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP™ data model data types and data structures complete list with examples' → 'AVAP™ data model data types data structures complete list examples'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Python like data types and data structures in AVAP language' → 'AVAP Python like data types data structures'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'AVAP data model compared to Python data types and structures' → 'AVAP data model Python data types structures comparison'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'What are the diffrent data typs available in AVAP™ and how do they compair to Python's data types?' → 'AVAP data types comparison Python data types'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'In Python and AVAP how do you does conversion between data types work, like if I got a string and want to make it a number or something?' → 'AVAP Python data type conversion string to number type casting'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How does AVAP compare to Python in terms of basic data types and data type conversion?' → 'AVAP Python comparison basic data types data type conversion'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'Hey so I been trying to learn AVAP and I notice it look a lot like Python in many ways, so can you tell me what are all the basic data types that AVAP have and how they is similar to Python when it come to converting between them and doing operations?' → 'AVAP basic data types conversion operations comparison Python'\n",
      "[retrieve] 3 docs fetched\n",
      "[reformulate] 'How AVAP data types is similar to Python?' → 'AVAP data types similar Python'\n",
      "[retrieve] 3 docs fetched\n"
     ]
    }
   ],
   "source": [
    "questions = synthetic_dataset[\"user_input\"]\n",
    "ground_truths = synthetic_dataset[\"reference\"]\n",
    "ground_truth_contexts = synthetic_dataset[\"reference_contexts\"]\n",
    "\n",
    "answers, contexts = [], []\n",
    "\n",
    "for q in questions:\n",
    "    out = graph.invoke({\"messages\": [HumanMessage(content=q)]})\n",
    "\n",
    "    # final assistant message content\n",
    "    answers.append(out[\"messages\"][-1].content)\n",
    "\n",
    "    # contexts captured from retrieval node\n",
    "    contexts.append(out.get(\"contexts\", []))\n",
    "\n",
    "dataset = Dataset.from_dict({\n",
    "    \"question\": questions,\n",
    "    \"answer\": answers,\n",
    "    \"contexts\": contexts,\n",
    "    \"ground_truth\": ground_truths,\n",
    "    \"ground_truth_contexts\": ground_truth_contexts,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "97c1d1af",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.to_pandas().to_csv(INTERIM_DIR / \"retrieve_eval_results/full_synthetic_dataset.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "a9011f94",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1340ee969c6149d5aa1bf2d1f278fed8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/700 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception raised in Job[160]: OutputParserException(Invalid json output: {\"TP\": [{\"statement\": \"The IF-THEN-ELSE statement in AVAP™ allows for decision-making based on specific conditions.\", \"reason\": \"This is supported by the ground truth which states that the IF-THEN-ELSE statement evaluates a given condition and executes different blocks based on the result.\"}, {\"statement\": \"The IF-THEN-ELSE statement in AVAP™ executes different blocks of code depending on the outcome of evaluated conditions.\", \"reason\": \"This is directly supported by the ground truth statements about executing code within IF() or ELSE() blocks based on condition evaluation.\"}, {\"statement\": \"The condition parameter in the IF-THEN-ELSE statement is an expression that evaluates to either true or false.\", \"reason\": \"This is directly supported by the ground truth which states 'condition is an expression that evaluates to true or false'.\"}, {\"statement\": \"The true_value parameter in the IF-THEN-ELSE statement is the value assigned if the condition is true.\", \"reason\": \"This is directly supported by the ground truth which states 'true_value is the value assigned if the condition is true'.\"}, {\"statement\": \"The operator parameter in the IF-THEN-ELSE statement is the operator used to compare the condition with the true value.\", \"reason\": \"This is directly supported by the ground truth which states 'operator is the operator used to compare the condition with the true value'.\"}, {\"statement\": \"The IF-THEN-ELSE statement evaluates the given condition and, if the condition is true, executes the block of code within the IF() block.\", \"reason\": \"This is directly supported by the ground truth statements about evaluating conditions and executing the IF() block when true.\"}, {\"statement\": \"If the condition is false, the IF-THEN-ELSE statement executes the block of code within the ELSE() block.\", \"reason\": \"This is directly supported by the ground 
truth which states the ELSE() block runs when the condition is false.\"}, {\"statement\": \"An IF block in AVAP™ must always terminate with the END() command.\", \"reason\": \"This is directly supported by the ground truth which states 'The IF-THEN-ELSE statement in AVAP™ concludes with END()'.\"}, {\"statement\": \"The statement IF(selector,'yes','=') evaluates whether the value of selector is equal to 'yes'.\", \"reason\": \"This is directly supported by the ground truth which states 'IF(selector,\\'yes\\',\\'=\\') evaluates whether the value of selector is equal to \\'yes\\''.\"}, {\"statement\": \"In the provided example, the condition IF(selector,'yes','=') evaluates to true.\", \"reason\": \"This is supported by the ground truth which describes the condition evaluation and the corresponding execution of the IF() block.\"}, {\"statement\": \"Since the condition of the IF() block is true, the code block inside the ELSE() block is not executed.\", \"reason\": \"This is supported by the ground truth which states the ELSE() block runs only when the condition is false.\"}], \"FP\": [{\"statement\": \"The operator '=' in AVAP™ represents strict equality or numeric equivalence.\", \"reason\": \"This specific description of the '=' operator is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The operator '!=' in AVAP™ represents inequality.\", \"reason\": \"This specific operator description is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The operators '>' and '<' in AVAP™ are used for numeric magnitude comparison.\", \"reason\": \"This specific operator description is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The operator 'in' in AVAP™ checks whether an element belongs to a list or string.\", \"reason\": \"This specific operator description is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"An IF block in 
AVAP™ may include an optional ELSE() block.\", \"reason\": \"The ground truth does not mention that the ELSE() block is optional; it only describes its behavior when present.\"}, {\"statement\": \"In the provided example, the variable selector is initialized with the value 'yes'.\", \"reason\": \"This specific detail about variable initialization is not mentioned in the ground truth.\"}, {\"statement\": \"Inside the IF() block, addVar(result,1) is executed, which assigns the value 1 to the result variable.\", \"reason\": \"This specific detail about the addVar command and result variable is not mentioned in the ground truth.\"}, {\"statement\": \"AVAP™ allows omission of comparison parameters to evaluate a complete logical expression directly in the third parameter.\", \"reason\": \"This specific behavior about omitting comparison parameters is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The result returned by the API for the provided example has a status value of true.\", \"reason\": \"This specific detail about the API result status is not mentioned in the ground truth.\"}, {\"statement\": \"The result returned by the API for the provided example has a result value of 1.\", \"reason\": \"This specific detail about the API result value is not mentioned in the ground truth.\"}, {\"statement\": \"A status value of true in the API result indicates that the execution was successful.\", \"reason\": \"This interpretation of the API status value is not mentioned or supported by any statement in the ground truth.\"}], \"FN\": [{\"statement\": \"The basic syntax of the IF-THEN-ELSE statement in AVAP™ is IF(condition, true_value, operator).\", \"reason\": \"While individual parameters are mentioned in the answer, the complete basic syntax format is not explicitly stated.\"}, {\"statement\": \"If the condition in IF(selector,'yes','=') is false, the code inside the ELSE() block runs.\", \"reason\": \"The answer mentions the general 
ELSE() behavior but does not explicitly state this in the context of the IF(selector,'yes','=') example.\"}]}\n",
      "For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_input</th>\n",
       "      <th>retrieved_contexts</th>\n",
       "      <th>response</th>\n",
       "      <th>reference</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_precision</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>context_entity_recall</th>\n",
       "      <th>answer_similarity</th>\n",
       "      <th>answer_correctness</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>So I been trying to understand how AVAP do the...</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen you'r...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>0.923077</td>\n",
       "      <td>0.915934</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.931432</td>\n",
       "      <td>0.778313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hey, I'm trying to undrestand how arithmatic c...</td>\n",
       "      <td>[6. Expressions in AVAP This chapter explains ...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen using...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>0.857143</td>\n",
       "      <td>0.918821</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.940588</td>\n",
       "      <td>0.770861</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hey, I'm trying to undrestand how sliceing wor...</td>\n",
       "      <td>[6. Expressions in AVAP This chapter explains ...</td>\n",
       "      <td># Slicing Semantics in AVAP\\n\\nHere is a full ...</td>\n",
       "      <td>In AVAP, when you perform a slice operation, t...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.896008</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.071429</td>\n",
       "      <td>0.922016</td>\n",
       "      <td>0.865119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>How does the __call__ method relate to callabl...</td>\n",
       "      <td>[6. Expressions in AVAP This chapter explains ...</td>\n",
       "      <td>Answer:\\n\\nIn AVAP, the `__call__` method is w...</td>\n",
       "      <td>In AVAP, any object that has a __call__() meth...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.896314</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.878617</td>\n",
       "      <td>0.894654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How does Decimel work in Python value comparis...</td>\n",
       "      <td>[Binary Arithmetic Operations Binary arithmeti...</td>\n",
       "      <td>This question is about Python value comparison...</td>\n",
       "      <td>In Python, decimal.Decimal (from the standard ...</td>\n",
       "      <td>0.823529</td>\n",
       "      <td>0.834290</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.825050</td>\n",
       "      <td>0.686262</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>What are the diffrent data typs available in A...</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td># Data Types in AVAP™ and Their Comparison to ...</td>\n",
       "      <td>In AVAP™, just like in Python, data types are ...</td>\n",
       "      <td>0.931034</td>\n",
       "      <td>0.980644</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.523810</td>\n",
       "      <td>0.908651</td>\n",
       "      <td>0.670344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>In Python and AVAP how do you does conversion ...</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td># Data Type Conversion in AVAP™\\n\\nIn AVAP™, y...</td>\n",
       "      <td>In AVAP™, just like in Python, it is possible ...</td>\n",
       "      <td>0.818182</td>\n",
       "      <td>0.887462</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.902631</td>\n",
       "      <td>0.943049</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>How does AVAP compare to Python in terms of ba...</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td># AVAP™ Compared to Python: Basic Data Types a...</td>\n",
       "      <td>AVAP shares several similarities with Python w...</td>\n",
       "      <td>0.826087</td>\n",
       "      <td>0.989468</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.893903</td>\n",
       "      <td>0.877322</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>Hey so I been trying to learn AVAP and I notic...</td>\n",
       "      <td>[Chapter 5: Data Types In this chapter, we wil...</td>\n",
       "      <td># AVAP Basic Data Types, Conversions, and Oper...</td>\n",
       "      <td>In AVAP, just like in Python, there are severa...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.890796</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.411765</td>\n",
       "      <td>0.883287</td>\n",
       "      <td>0.915944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>How AVAP data types is similar to Python?</td>\n",
       "      <td>[Introduction The data model in AVAP™ defines ...</td>\n",
       "      <td># AVAP Data Types: Similarities to Python\\n\\nB...</td>\n",
       "      <td>In AVAP™, like in Python, there are several ba...</td>\n",
       "      <td>0.956522</td>\n",
       "      <td>0.988650</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.857555</td>\n",
       "      <td>0.611448</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100 rows × 11 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           user_input  \\\n",
       "0   So I been trying to understand how AVAP do the...   \n",
       "1   Hey, I'm trying to undrestand how arithmatic c...   \n",
       "2   Hey, I'm trying to undrestand how sliceing wor...   \n",
       "3   How does the __call__ method relate to callabl...   \n",
       "4   How does Decimel work in Python value comparis...   \n",
       "..                                                ...   \n",
       "95  What are the diffrent data typs available in A...   \n",
       "96  In Python and AVAP how do you does conversion ...   \n",
       "97  How does AVAP compare to Python in terms of ba...   \n",
       "98  Hey so I been trying to learn AVAP and I notic...   \n",
       "99          How AVAP data types is similar to Python?   \n",
       "\n",
       "                                   retrieved_contexts  \\\n",
       "0   [Chapter 5: Data Types In this chapter, we wil...   \n",
       "1   [6. Expressions in AVAP This chapter explains ...   \n",
       "2   [6. Expressions in AVAP This chapter explains ...   \n",
       "3   [6. Expressions in AVAP This chapter explains ...   \n",
       "4   [Binary Arithmetic Operations Binary arithmeti...   \n",
       "..                                                ...   \n",
       "95  [Chapter 5: Data Types In this chapter, we wil...   \n",
       "96  [Chapter 5: Data Types In this chapter, we wil...   \n",
       "97  [Chapter 5: Data Types In this chapter, we wil...   \n",
       "98  [Chapter 5: Data Types In this chapter, we wil...   \n",
       "99  [Introduction The data model in AVAP™ defines ...   \n",
       "\n",
       "                                             response  \\\n",
       "0   # Arithmetic Conversions in AVAP\\n\\nWhen you'r...   \n",
       "1   # Arithmetic Conversions in AVAP\\n\\nWhen using...   \n",
       "2   # Slicing Semantics in AVAP\\n\\nHere is a full ...   \n",
       "3   Answer:\\n\\nIn AVAP, the `__call__` method is w...   \n",
       "4   This question is about Python value comparison...   \n",
       "..                                                ...   \n",
       "95  # Data Types in AVAP™ and Their Comparison to ...   \n",
       "96  # Data Type Conversion in AVAP™\\n\\nIn AVAP™, y...   \n",
       "97  # AVAP™ Compared to Python: Basic Data Types a...   \n",
       "98  # AVAP Basic Data Types, Conversions, and Oper...   \n",
       "99  # AVAP Data Types: Similarities to Python\\n\\nB...   \n",
       "\n",
       "                                            reference  faithfulness  \\\n",
       "0   In AVAP, when an arithmetic operator is descri...      0.923077   \n",
       "1   In AVAP, when an arithmetic operator is descri...      0.857143   \n",
       "2   In AVAP, when you perform a slice operation, t...      1.000000   \n",
       "3   In AVAP, any object that has a __call__() meth...      1.000000   \n",
       "4   In Python, decimal.Decimal (from the standard ...      0.823529   \n",
       "..                                                ...           ...   \n",
       "95  In AVAP™, just like in Python, data types are ...      0.931034   \n",
       "96  In AVAP™, just like in Python, it is possible ...      0.818182   \n",
       "97  AVAP shares several similarities with Python w...      0.826087   \n",
       "98  In AVAP, just like in Python, there are severa...      1.000000   \n",
       "99  In AVAP™, like in Python, there are several ba...      0.956522   \n",
       "\n",
       "    answer_relevancy  context_precision  context_recall  \\\n",
       "0           0.915934           0.333333             1.0   \n",
       "1           0.918821           1.000000             1.0   \n",
       "2           0.896008           1.000000             1.0   \n",
       "3           0.896314           1.000000             1.0   \n",
       "4           0.834290           1.000000             1.0   \n",
       "..               ...                ...             ...   \n",
       "95          0.980644           1.000000             1.0   \n",
       "96          0.887462           1.000000             1.0   \n",
       "97          0.989468           1.000000             1.0   \n",
       "98          0.890796           1.000000             1.0   \n",
       "99          0.988650           1.000000             1.0   \n",
       "\n",
       "    context_entity_recall  answer_similarity  answer_correctness  \n",
       "0                0.000000           0.931432            0.778313  \n",
       "1                0.250000           0.940588            0.770861  \n",
       "2                0.071429           0.922016            0.865119  \n",
       "3                0.142857           0.878617            0.894654  \n",
       "4                1.000000           0.825050            0.686262  \n",
       "..                    ...                ...                 ...  \n",
       "95               0.523810           0.908651            0.670344  \n",
       "96               0.400000           0.902631            0.943049  \n",
       "97               0.500000           0.893903            0.877322  \n",
       "98               0.411765           0.883287            0.915944  \n",
       "99               0.555556           0.857555            0.611448  \n",
       "\n",
       "[100 rows x 11 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Metrics to compute for every sample; each one yields a score column in the result table.\n",
    "metrics = [\n",
    "    faithfulness, answer_relevancy,\n",
    "    context_precision, context_recall, context_entity_recall,\n",
    "    answer_similarity, answer_correctness,\n",
    "]\n",
    "\n",
    "# `dataset`, `llm` and `embeddings` are defined outside this cell.\n",
    "result = evaluate(dataset=dataset, metrics=metrics, llm=llm, embeddings=embeddings)\n",
    "\n",
    "# Tabulate the scores; the trailing expression renders the frame as the cell output.\n",
    "result_df = result.to_pandas()\n",
    "result_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "20c3fa64",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "faithfulness             0.958168\n",
       "answer_relevancy         0.821373\n",
       "context_precision        0.819167\n",
       "context_recall           0.896841\n",
       "context_entity_recall    0.389715\n",
       "answer_similarity        0.876510\n",
       "answer_correctness       0.725832\n",
       "dtype: float64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column-wise average of every numeric column; non-numeric columns are skipped.\n",
    "result_df.mean(axis=0, numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "350755fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the per-sample scores as CSV (row index omitted).\n",
    "# to_csv does not create missing folders, so make sure the target directory exists first.\n",
    "# NOTE(review): assumes INTERIM_DIR is a pathlib.Path - confirm against the config cell.\n",
    "ragas_eval_path = INTERIM_DIR / \"retrieve_eval_results/ragas_eval.csv\"\n",
    "ragas_eval_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "result_df.to_csv(ragas_eval_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "71743384",
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently import Dataset, DataDefinition, Report\n",
    "from evidently.descriptors import *\n",
    "from evidently.llm.options import OllamaOptions\n",
    "from evidently.presets import TextEvals\n",
    "from evidently.metrics import *\n",
    "from evidently.tests import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "4a1210cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>contexts</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>ground_truth_contexts</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>So I been trying to understand how AVAP do the...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen you'r...</td>\n",
       "      <td>['Chapter 5: Data Types In this chapter, we wi...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hey, I'm trying to undrestand how arithmatic c...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen using...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hey, I'm trying to undrestand how sliceing wor...</td>\n",
       "      <td># Slicing Semantics in AVAP\\n\\nHere is a full ...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, when you perform a slice operation, t...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>How does the __call__ method relate to callabl...</td>\n",
       "      <td>Answer:\\n\\nIn AVAP, the `__call__` method is w...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, any object that has a __call__() meth...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How does Decimel work in Python value comparis...</td>\n",
       "      <td>This question is about Python value comparison...</td>\n",
       "      <td>['Binary Arithmetic Operations Binary arithmet...</td>\n",
       "      <td>In Python, decimal.Decimal (from the standard ...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>How does the Decimal type from the standrd lib...</td>\n",
       "      <td>I don't have enough information in the provide...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>According to the context, decimal.Decimal is o...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Under what circumstances does a TypeError occu...</td>\n",
       "      <td>Answer:\\nA `TypeError` occurs during compariso...</td>\n",
       "      <td>['Binary Arithmetic Operations Binary arithmet...</td>\n",
       "      <td>A TypeError is generated when attempting to pe...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>According to the AVAP documentation, what does...</td>\n",
       "      <td>I don't have enough information in the provide...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>PEP 8 advises that comparisons for singletons,...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>hey so i been reading about expression stateme...</td>\n",
       "      <td>Answer:\\n\\nIn AVAP, when you use an expression...</td>\n",
       "      <td>[\"Expressions in AVAP™ Introduction Expression...</td>\n",
       "      <td>In interactive mode in AVAP, when an expressio...</td>\n",
       "      <td>['Simple Statements\\nIn AVAP, a simple stateme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>AVAP programming language how do simple statem...</td>\n",
       "      <td># Simple Statements in AVAP\\n\\n## Overview\\n\\n...</td>\n",
       "      <td>['Simple Statements In AVAP, a simple statemen...</td>\n",
       "      <td>In AVAP, a simple statement consists of a sing...</td>\n",
       "      <td>['Simple Statements\\nIn AVAP, a simple stateme...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  So I been trying to understand how AVAP do the...   \n",
       "1  Hey, I'm trying to undrestand how arithmatic c...   \n",
       "2  Hey, I'm trying to undrestand how sliceing wor...   \n",
       "3  How does the __call__ method relate to callabl...   \n",
       "4  How does Decimel work in Python value comparis...   \n",
       "5  How does the Decimal type from the standrd lib...   \n",
       "6  Under what circumstances does a TypeError occu...   \n",
       "7  According to the AVAP documentation, what does...   \n",
       "8  hey so i been reading about expression stateme...   \n",
       "9  AVAP programming language how do simple statem...   \n",
       "\n",
       "                                              answer  \\\n",
       "0  # Arithmetic Conversions in AVAP\\n\\nWhen you'r...   \n",
       "1  # Arithmetic Conversions in AVAP\\n\\nWhen using...   \n",
       "2  # Slicing Semantics in AVAP\\n\\nHere is a full ...   \n",
       "3  Answer:\\n\\nIn AVAP, the `__call__` method is w...   \n",
       "4  This question is about Python value comparison...   \n",
       "5  I don't have enough information in the provide...   \n",
       "6  Answer:\\nA `TypeError` occurs during compariso...   \n",
       "7  I don't have enough information in the provide...   \n",
       "8  Answer:\\n\\nIn AVAP, when you use an expression...   \n",
       "9  # Simple Statements in AVAP\\n\\n## Overview\\n\\n...   \n",
       "\n",
       "                                            contexts  \\\n",
       "0  ['Chapter 5: Data Types In this chapter, we wi...   \n",
       "1  ['6. Expressions in AVAP This chapter explains...   \n",
       "2  ['6. Expressions in AVAP This chapter explains...   \n",
       "3  ['6. Expressions in AVAP This chapter explains...   \n",
       "4  ['Binary Arithmetic Operations Binary arithmet...   \n",
       "5  ['6. Expressions in AVAP This chapter explains...   \n",
       "6  ['Binary Arithmetic Operations Binary arithmet...   \n",
       "7  ['6. Expressions in AVAP This chapter explains...   \n",
       "8  [\"Expressions in AVAP™ Introduction Expression...   \n",
       "9  ['Simple Statements In AVAP, a simple statemen...   \n",
       "\n",
       "                                        ground_truth  \\\n",
       "0  In AVAP, when an arithmetic operator is descri...   \n",
       "1  In AVAP, when an arithmetic operator is descri...   \n",
       "2  In AVAP, when you perform a slice operation, t...   \n",
       "3  In AVAP, any object that has a __call__() meth...   \n",
       "4  In Python, decimal.Decimal (from the standard ...   \n",
       "5  According to the context, decimal.Decimal is o...   \n",
       "6  A TypeError is generated when attempting to pe...   \n",
       "7  PEP 8 advises that comparisons for singletons,...   \n",
       "8  In interactive mode in AVAP, when an expressio...   \n",
       "9  In AVAP, a simple statement consists of a sing...   \n",
       "\n",
       "                               ground_truth_contexts  \n",
       "0  ['6. Expressions in AVAP\\nThis chapter explain...  \n",
       "1  ['6. Expressions in AVAP\\nThis chapter explain...  \n",
       "2  ['6. Expressions in AVAP\\nThis chapter explain...  \n",
       "3  ['6. Expressions in AVAP\\nThis chapter explain...  \n",
       "4  ['Binary Arithmetic Operations\\nBinary arithme...  \n",
       "5  ['Binary Arithmetic Operations\\nBinary arithme...  \n",
       "6  ['Binary Arithmetic Operations\\nBinary arithme...  \n",
       "7  ['Binary Arithmetic Operations\\nBinary arithme...  \n",
       "8  ['Simple Statements\\nIn AVAP, a simple stateme...  \n",
       "9  ['Simple Statements\\nIn AVAP, a simple stateme...  "
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the saved synthetic evaluation set and keep only its first 10 rows.\n",
    "full_synthetic_dataset = pd.read_csv(\n",
    "    INTERIM_DIR / \"retrieve_eval_results/full_synthetic_dataset.csv\"\n",
    ").head(10)\n",
    "full_synthetic_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "e1ac1a41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>contexts</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>ground_truth_contexts</th>\n",
       "      <th>ContextQuality</th>\n",
       "      <th>ContextQuality score</th>\n",
       "      <th>ContextQuality reasoning</th>\n",
       "      <th>ContextRelevance</th>\n",
       "      <th>ContextRelevance scores</th>\n",
       "      <th>Faithfulness</th>\n",
       "      <th>Faithfulness score</th>\n",
       "      <th>Faithfulness reasoning</th>\n",
       "      <th>Completeness</th>\n",
       "      <th>Completeness score</th>\n",
       "      <th>Completeness reasoning</th>\n",
       "      <th>Correctness</th>\n",
       "      <th>Correctness score</th>\n",
       "      <th>Correctness reasoning</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>So I been trying to understand how AVAP do the...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen you'r...</td>\n",
       "      <td>['Chapter 5: Data Types In this chapter, we wi...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.90</td>\n",
       "      <td>The text contains a section (6.1. Arithmetic C...</td>\n",
       "      <td>0.836118</td>\n",
       "      <td>[0.8361175]</td>\n",
       "      <td>UNFAITHFUL</td>\n",
       "      <td>0.30</td>\n",
       "      <td>The response is largely faithful to the source...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The output covers all the key information from...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The OUTPUT accurately conveys all the key info...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Hey, I'm trying to undrestand how arithmatic c...</td>\n",
       "      <td># Arithmetic Conversions in AVAP\\n\\nWhen using...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, when an arithmetic operator is descri...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.95</td>\n",
       "      <td>The text contains the specific section '6.1. A...</td>\n",
       "      <td>0.828277</td>\n",
       "      <td>[0.82827663]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The response accurately summarizes the arithme...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The output covers all the key information from...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The output accurately conveys all the key info...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Hey, I'm trying to undrestand how sliceing wor...</td>\n",
       "      <td># Slicing Semantics in AVAP\\n\\nHere is a full ...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, when you perform a slice operation, t...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.95</td>\n",
       "      <td>The first document contains a dedicated 'Slice...</td>\n",
       "      <td>0.647847</td>\n",
       "      <td>[0.6478467]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The response accurately reproduces the slicing...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The output covers all the key information from...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The OUTPUT accurately conveys all the key fact...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>How does the __call__ method relate to callabl...</td>\n",
       "      <td>Answer:\\n\\nIn AVAP, the `__call__` method is w...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>In AVAP, any object that has a __call__() meth...</td>\n",
       "      <td>['6. Expressions in AVAP\\nThis chapter explain...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.85</td>\n",
       "      <td>The first text chunk explicitly addresses the ...</td>\n",
       "      <td>0.682974</td>\n",
       "      <td>[0.6829743]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The response accurately quotes and summarizes ...</td>\n",
       "      <td>INCOMPLETE</td>\n",
       "      <td>0.85</td>\n",
       "      <td>The output focuses narrowly on only one aspect...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.10</td>\n",
       "      <td>The OUTPUT conveys the same core facts as the ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>How does Decimel work in Python value comparis...</td>\n",
       "      <td>This question is about Python value comparison...</td>\n",
       "      <td>['Binary Arithmetic Operations Binary arithmet...</td>\n",
       "      <td>In Python, decimal.Decimal (from the standard ...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.65</td>\n",
       "      <td>The first text block explicitly mentions decim...</td>\n",
       "      <td>0.716216</td>\n",
       "      <td>[0.7162162]</td>\n",
       "      <td>UNFAITHFUL</td>\n",
       "      <td>0.35</td>\n",
       "      <td>The response is mostly faithful to the source,...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.10</td>\n",
       "      <td>The output accurately captures the key informa...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.20</td>\n",
       "      <td>The OUTPUT correctly conveys the main facts fr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>How does the Decimal type from the standrd lib...</td>\n",
       "      <td>I don't have enough information in the provide...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>According to the context, decimal.Decimal is o...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "      <td>INVALID</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The question specifically asks about how the D...</td>\n",
       "      <td>0.763026</td>\n",
       "      <td>[0.76302594]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The response accurately states that it cannot ...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The output is a response explaining that the s...</td>\n",
       "      <td>INCORRECT</td>\n",
       "      <td>0.85</td>\n",
       "      <td>The OUTPUT claims there is not enough informat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Under what circumstances does a TypeError occu...</td>\n",
       "      <td>Answer:\\nA `TypeError` occurs during compariso...</td>\n",
       "      <td>['Binary Arithmetic Operations Binary arithmet...</td>\n",
       "      <td>A TypeError is generated when attempting to pe...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.90</td>\n",
       "      <td>The first document (Python documentation) expl...</td>\n",
       "      <td>0.632650</td>\n",
       "      <td>[0.6326503]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The response accurately reflects information f...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.10</td>\n",
       "      <td>The output accurately covers all the main circ...</td>\n",
       "      <td>INCORRECT</td>\n",
       "      <td>0.30</td>\n",
       "      <td>The OUTPUT correctly conveys the main facts fr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>According to the AVAP documentation, what does...</td>\n",
       "      <td>I don't have enough information in the provide...</td>\n",
       "      <td>['6. Expressions in AVAP This chapter explains...</td>\n",
       "      <td>PEP 8 advises that comparisons for singletons,...</td>\n",
       "      <td>['Binary Arithmetic Operations\\nBinary arithme...</td>\n",
       "      <td>INVALID</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The question asks specifically about what PEP ...</td>\n",
       "      <td>0.624027</td>\n",
       "      <td>[0.6240271]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The response is a refusal to answer, stating t...</td>\n",
       "      <td>COMPLETE</td>\n",
       "      <td>0.05</td>\n",
       "      <td>The output is a complete response to a questio...</td>\n",
       "      <td>INCORRECT</td>\n",
       "      <td>0.90</td>\n",
       "      <td>The OUTPUT claims there is not enough informat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>hey so i been reading about expression stateme...</td>\n",
       "      <td>Answer:\\n\\nIn AVAP, when you use an expression...</td>\n",
       "      <td>[\"Expressions in AVAP™ Introduction Expression...</td>\n",
       "      <td>In interactive mode in AVAP, when an expressio...</td>\n",
       "      <td>['Simple Statements\\nIn AVAP, a simple stateme...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.95</td>\n",
       "      <td>The text contains a direct and complete answer...</td>\n",
       "      <td>0.776070</td>\n",
       "      <td>[0.7760695]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The response accurately reflects information f...</td>\n",
       "      <td>INCOMPLETE</td>\n",
       "      <td>0.85</td>\n",
       "      <td>The output focuses on a very narrow aspect of ...</td>\n",
       "      <td>CORRECT</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The OUTPUT accurately conveys all the key fact...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>AVAP programming language how do simple statem...</td>\n",
       "      <td># Simple Statements in AVAP\\n\\n## Overview\\n\\n...</td>\n",
       "      <td>['Simple Statements In AVAP, a simple statemen...</td>\n",
       "      <td>In AVAP, a simple statement consists of a sing...</td>\n",
       "      <td>['Simple Statements\\nIn AVAP, a simple stateme...</td>\n",
       "      <td>VALID</td>\n",
       "      <td>0.92</td>\n",
       "      <td>The text provides comprehensive information ab...</td>\n",
       "      <td>0.883411</td>\n",
       "      <td>[0.8834107]</td>\n",
       "      <td>FAITHFUL</td>\n",
       "      <td>0.00</td>\n",
       "      <td>The response accurately reproduces information...</td>\n",
       "      <td>INCOMPLETE</td>\n",
       "      <td>0.85</td>\n",
       "      <td>The OUTPUT only covers the introductory overvi...</td>\n",
       "      <td>INCORRECT</td>\n",
       "      <td>0.30</td>\n",
       "      <td>The OUTPUT contains most of the same informati...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  So I been trying to understand how AVAP do the...   \n",
       "1  Hey, I'm trying to undrestand how arithmatic c...   \n",
       "2  Hey, I'm trying to undrestand how sliceing wor...   \n",
       "3  How does the __call__ method relate to callabl...   \n",
       "4  How does Decimel work in Python value comparis...   \n",
       "5  How does the Decimal type from the standrd lib...   \n",
       "6  Under what circumstances does a TypeError occu...   \n",
       "7  According to the AVAP documentation, what does...   \n",
       "8  hey so i been reading about expression stateme...   \n",
       "9  AVAP programming language how do simple statem...   \n",
       "\n",
       "                                              answer  \\\n",
       "0  # Arithmetic Conversions in AVAP\\n\\nWhen you'r...   \n",
       "1  # Arithmetic Conversions in AVAP\\n\\nWhen using...   \n",
       "2  # Slicing Semantics in AVAP\\n\\nHere is a full ...   \n",
       "3  Answer:\\n\\nIn AVAP, the `__call__` method is w...   \n",
       "4  This question is about Python value comparison...   \n",
       "5  I don't have enough information in the provide...   \n",
       "6  Answer:\\nA `TypeError` occurs during compariso...   \n",
       "7  I don't have enough information in the provide...   \n",
       "8  Answer:\\n\\nIn AVAP, when you use an expression...   \n",
       "9  # Simple Statements in AVAP\\n\\n## Overview\\n\\n...   \n",
       "\n",
       "                                            contexts  \\\n",
       "0  ['Chapter 5: Data Types In this chapter, we wi...   \n",
       "1  ['6. Expressions in AVAP This chapter explains...   \n",
       "2  ['6. Expressions in AVAP This chapter explains...   \n",
       "3  ['6. Expressions in AVAP This chapter explains...   \n",
       "4  ['Binary Arithmetic Operations Binary arithmet...   \n",
       "5  ['6. Expressions in AVAP This chapter explains...   \n",
       "6  ['Binary Arithmetic Operations Binary arithmet...   \n",
       "7  ['6. Expressions in AVAP This chapter explains...   \n",
       "8  [\"Expressions in AVAP™ Introduction Expression...   \n",
       "9  ['Simple Statements In AVAP, a simple statemen...   \n",
       "\n",
       "                                        ground_truth  \\\n",
       "0  In AVAP, when an arithmetic operator is descri...   \n",
       "1  In AVAP, when an arithmetic operator is descri...   \n",
       "2  In AVAP, when you perform a slice operation, t...   \n",
       "3  In AVAP, any object that has a __call__() meth...   \n",
       "4  In Python, decimal.Decimal (from the standard ...   \n",
       "5  According to the context, decimal.Decimal is o...   \n",
       "6  A TypeError is generated when attempting to pe...   \n",
       "7  PEP 8 advises that comparisons for singletons,...   \n",
       "8  In interactive mode in AVAP, when an expressio...   \n",
       "9  In AVAP, a simple statement consists of a sing...   \n",
       "\n",
       "                               ground_truth_contexts ContextQuality  \\\n",
       "0  ['6. Expressions in AVAP\\nThis chapter explain...          VALID   \n",
       "1  ['6. Expressions in AVAP\\nThis chapter explain...          VALID   \n",
       "2  ['6. Expressions in AVAP\\nThis chapter explain...          VALID   \n",
       "3  ['6. Expressions in AVAP\\nThis chapter explain...          VALID   \n",
       "4  ['Binary Arithmetic Operations\\nBinary arithme...          VALID   \n",
       "5  ['Binary Arithmetic Operations\\nBinary arithme...        INVALID   \n",
       "6  ['Binary Arithmetic Operations\\nBinary arithme...          VALID   \n",
       "7  ['Binary Arithmetic Operations\\nBinary arithme...        INVALID   \n",
       "8  ['Simple Statements\\nIn AVAP, a simple stateme...          VALID   \n",
       "9  ['Simple Statements\\nIn AVAP, a simple stateme...          VALID   \n",
       "\n",
       "   ContextQuality score                           ContextQuality reasoning  \\\n",
       "0                  0.90  The text contains a section (6.1. Arithmetic C...   \n",
       "1                  0.95  The text contains the specific section '6.1. A...   \n",
       "2                  0.95  The first document contains a dedicated 'Slice...   \n",
       "3                  0.85  The first text chunk explicitly addresses the ...   \n",
       "4                  0.65  The first text block explicitly mentions decim...   \n",
       "5                  0.05  The question specifically asks about how the D...   \n",
       "6                  0.90  The first document (Python documentation) expl...   \n",
       "7                  0.05  The question asks specifically about what PEP ...   \n",
       "8                  0.95  The text contains a direct and complete answer...   \n",
       "9                  0.92  The text provides comprehensive information ab...   \n",
       "\n",
       "   ContextRelevance ContextRelevance scores Faithfulness  Faithfulness score  \\\n",
       "0          0.836118             [0.8361175]   UNFAITHFUL                0.30   \n",
       "1          0.828277            [0.82827663]     FAITHFUL                0.05   \n",
       "2          0.647847             [0.6478467]     FAITHFUL                0.05   \n",
       "3          0.682974             [0.6829743]     FAITHFUL                0.00   \n",
       "4          0.716216             [0.7162162]   UNFAITHFUL                0.35   \n",
       "5          0.763026            [0.76302594]     FAITHFUL                0.05   \n",
       "6          0.632650             [0.6326503]     FAITHFUL                0.05   \n",
       "7          0.624027             [0.6240271]     FAITHFUL                0.00   \n",
       "8          0.776070             [0.7760695]     FAITHFUL                0.00   \n",
       "9          0.883411             [0.8834107]     FAITHFUL                0.00   \n",
       "\n",
       "                              Faithfulness reasoning Completeness  \\\n",
       "0  The response is largely faithful to the source...     COMPLETE   \n",
       "1  The response accurately summarizes the arithme...     COMPLETE   \n",
       "2  The response accurately reproduces the slicing...     COMPLETE   \n",
       "3  The response accurately quotes and summarizes ...   INCOMPLETE   \n",
       "4  The response is mostly faithful to the source,...     COMPLETE   \n",
       "5  The response accurately states that it cannot ...     COMPLETE   \n",
       "6  The response accurately reflects information f...     COMPLETE   \n",
       "7  The response is a refusal to answer, stating t...     COMPLETE   \n",
       "8  The response accurately reflects information f...   INCOMPLETE   \n",
       "9  The response accurately reproduces information...   INCOMPLETE   \n",
       "\n",
       "   Completeness score                             Completeness reasoning  \\\n",
       "0                0.05  The output covers all the key information from...   \n",
       "1                0.05  The output covers all the key information from...   \n",
       "2                0.05  The output covers all the key information from...   \n",
       "3                0.85  The output focuses narrowly on only one aspect...   \n",
       "4                0.10  The output accurately captures the key informa...   \n",
       "5                0.05  The output is a response explaining that the s...   \n",
       "6                0.10  The output accurately covers all the main circ...   \n",
       "7                0.05  The output is a complete response to a questio...   \n",
       "8                0.85  The output focuses on a very narrow aspect of ...   \n",
       "9                0.85  The OUTPUT only covers the introductory overvi...   \n",
       "\n",
       "  Correctness  Correctness score  \\\n",
       "0     CORRECT               0.00   \n",
       "1     CORRECT               0.00   \n",
       "2     CORRECT               0.05   \n",
       "3     CORRECT               0.10   \n",
       "4     CORRECT               0.20   \n",
       "5   INCORRECT               0.85   \n",
       "6   INCORRECT               0.30   \n",
       "7   INCORRECT               0.90   \n",
       "8     CORRECT               0.00   \n",
       "9   INCORRECT               0.30   \n",
       "\n",
       "                               Correctness reasoning  \n",
       "0  The OUTPUT accurately conveys all the key info...  \n",
       "1  The output accurately conveys all the key info...  \n",
       "2  The OUTPUT accurately conveys all the key fact...  \n",
       "3  The OUTPUT conveys the same core facts as the ...  \n",
       "4  The OUTPUT correctly conveys the main facts fr...  \n",
       "5  The OUTPUT claims there is not enough informat...  \n",
       "6  The OUTPUT correctly conveys the main facts fr...  \n",
       "7  The OUTPUT claims there is not enough informat...  \n",
       "8  The OUTPUT accurately conveys all the key fact...  \n",
       "9  The OUTPUT contains most of the same informati...  "
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Annotate every row of full_synthetic_dataset with five descriptors.\n",
    "# In the saved output each *LLMEval descriptor yields '<alias>', '<alias> score' and\n",
    "# '<alias> reasoning' columns; ContextRelevance yields '<alias>' and '<alias> scores'.\n",
    "# NOTE(review): in that output FAITHFUL / COMPLETE / CORRECT rows carry low scores\n",
    "# while VALID rows carry high ones, so the score direction differs between\n",
    "# descriptors -- confirm what each score means before averaging or thresholding.\n",
    "context_based_evals = Dataset.from_pandas(\n",
    "    full_synthetic_dataset,\n",
    "    descriptors=[\n",
    "        # Reads `contexts`, paired with the `question` column.\n",
    "        ContextQualityLLMEval(\n",
    "            question=\"question\",\n",
    "            column_name=\"contexts\",\n",
    "            provider=\"bedrock\",\n",
    "            model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "            include_score=True,\n",
    "            alias=\"ContextQuality\",\n",
    "        ),\n",
    "        # Takes `question` as input and `contexts` as contexts; per-context scores\n",
    "        # are kept and aggregated with the mean.\n",
    "        ContextRelevance(\n",
    "            input=\"question\",\n",
    "            contexts=\"contexts\",\n",
    "            output_scores=True,\n",
    "            aggregation_method=\"mean\",\n",
    "            alias=\"ContextRelevance\",\n",
    "        ),\n",
    "        # Reads `answer`, with `contexts` as its context argument.\n",
    "        FaithfulnessLLMEval(\n",
    "            column_name=\"answer\",\n",
    "            context=\"contexts\",\n",
    "            provider=\"bedrock\",\n",
    "            model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "            include_score=True,\n",
    "            alias=\"Faithfulness\",\n",
    "        ),\n",
    "        # Reads `answer`, with `contexts` as its context argument.\n",
    "        CompletenessLLMEval(\n",
    "            column_name=\"answer\",\n",
    "            context=\"contexts\",\n",
    "            provider=\"bedrock\",\n",
    "            model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "            include_score=True,\n",
    "            alias=\"Completeness\",\n",
    "        ),\n",
    "        # Reads `answer`, with `ground_truth` as the target output.\n",
    "        CorrectnessLLMEval(\n",
    "            column_name=\"answer\",\n",
    "            target_output=\"ground_truth\",\n",
    "            provider=\"bedrock\",\n",
    "            model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "            include_score=True,\n",
    "            alias=\"Correctness\",\n",
    "        ),\n",
    "    ],\n",
    "    # Left disabled: options=OllamaOptions(api_url=OLLAMA_LOCAL_URL)\n",
    ")\n",
    "context_based_evals.as_dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ff2705b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}