{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "8fed4518", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing faithfulness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import faithfulness\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_relevancy from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_relevancy\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_recall\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_precision from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_precision\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing context_entity_recall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import context_entity_recall\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_similarity from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. 
Example: from ragas.metrics.collections import answer_similarity\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing answer_correctness from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import answer_correctness\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing NonLLMContextRecall from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextRecall\n", " from ragas.metrics import (\n", "/tmp/ipykernel_102010/913720497.py:21: DeprecationWarning: Importing NonLLMContextPrecisionWithReference from 'ragas.metrics' is deprecated and will be removed in v1.0. Please use 'ragas.metrics.collections' instead. Example: from ragas.metrics.collections import NonLLMContextPrecisionWithReference\n", " from ragas.metrics import (\n" ] } ], "source": [ "import sys\n", "from pathlib import Path\n", "\n", "# Ensure the project root is on the path so `src` is importable\n", "_project_root = str(Path(__file__).resolve().parents[2]) if \"__file__\" in dir() else str(Path.cwd().parents[1])\n", "if _project_root not in sys.path:\n", " sys.path.insert(0, _project_root)\n", "\n", "import pandas as pd\n", "from datasets import Dataset\n", "from langchain_core.documents import Document\n", "from langchain_classic.chains.retrieval_qa.base import RetrievalQA\n", "from langchain_elasticsearch import ElasticsearchStore\n", "from langchain_core.messages import HumanMessage\n", "from ragas import evaluate, SingleTurnSample\n", "from ragas.llms import LangchainLLMWrapper\n", "from ragas.embeddings import LangchainEmbeddingsWrapper\n", "from ragas.testset import TestsetGenerator\n", "from ragas.testset.persona import Persona\n", "from ragas.testset.synthesizers.single_hop.specific 
import SingleHopSpecificQuerySynthesizer\n",
"from ragas.metrics import (\n",
" faithfulness,\n",
" answer_relevancy,\n",
" context_recall,\n",
" context_precision,\n",
" context_entity_recall,\n",
" answer_similarity,\n",
" answer_correctness,\n",
" NonLLMContextRecall,\n",
" NonLLMContextPrecisionWithReference\n",
")\n",
"\n",
"from src.utils.llm_factory import create_chat_model\n",
"from src.utils.emb_factory import create_embedding_model\n",
"from src.config import (\n",
" ELASTICSEARCH_LOCAL_URL,\n",
" ELASTICSEARCH_INDEX,\n",
" OLLAMA_MODEL_NAME,\n",
" OLLAMA_EMB_MODEL_NAME,\n",
" OLLAMA_LOCAL_URL,\n",
" RAW_DIR,\n",
" INTERIM_DIR\n",
")" ] }, { "cell_type": "code", "execution_count": 42, "id": "4426d6c0", "metadata": {}, "outputs": [], "source": [
"# Chat model called by the graph nodes and passed to ragas as the evaluator LLM.\n",
"llm = create_chat_model(\n",
" provider=\"bedrock\",\n",
" model=\"global.anthropic.claude-sonnet-4-6\",\n",
" temperature=0,\n",
")\n",
"# Embedding model shared by the Elasticsearch store and the ragas metrics.\n",
"embeddings = create_embedding_model(\n",
" provider=\"ollama\",\n",
" model=OLLAMA_EMB_MODEL_NAME,\n",
")\n",
"# Local Ollama chat model, validated when it is constructed.\n",
"agent_llm = create_chat_model(\n",
" provider=\"ollama\",\n",
" model=OLLAMA_MODEL_NAME,\n",
" temperature=0,\n",
" validate_model_on_init=True,\n",
")\n",
"# Elasticsearch-backed store over ELASTICSEARCH_INDEX; queries are embedded with `embeddings`.\n",
"vector_store = ElasticsearchStore(\n",
" es_url=ELASTICSEARCH_LOCAL_URL,\n",
" index_name=ELASTICSEARCH_INDEX,\n",
" embedding=embeddings,\n",
" query_field=\"text\",\n",
" vector_query_field=\"vector\",\n",
")" ] }, { "cell_type": "markdown", "id": "d2a6ab91", "metadata": {}, "source": [ "### Build langgraph" ] }, { "cell_type": "code", "execution_count": 35, "id": "7f9fc4de", "metadata": {}, "outputs": [], "source": [
"from typing import TypedDict, List, Annotated\n",
"\n",
"from langchain_core.messages import SystemMessage\n",
"from langgraph.graph.message import add_messages\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langgraph.graph import StateGraph, END\n",
"\n",
"# Same module paths as the first import cell, so a fresh Restart & Run All resolves both cells.\n",
"from src.utils.llm_factory import create_chat_model\n",
"from src.utils.emb_factory import create_embedding_model\n",
"from 
src.config import (\n", " ELASTICSEARCH_LOCAL_URL,\n", " ELASTICSEARCH_INDEX,\n", " OLLAMA_MODEL_NAME,\n", " OLLAMA_EMB_MODEL_NAME\n", ")\n", "\n", "class AgentState(TypedDict, total=False):\n", " messages: Annotated[list, add_messages]\n", " reformulated_query: str\n", " context: str \n", " contexts: List[str]\n", "\n", "REFORMULATE_PROMPT = SystemMessage(\n", " content=(\n", " \"You are a deterministic lexical query rewriter used for vector retrieval.\\n\"\n", " \"Your task is to rewrite user questions into optimized keyword search queries.\\n\\n\"\n", "\n", " \"CRITICAL RULES (ABSOLUTE):\\n\"\n", " \"1. NEVER answer the question.\\n\"\n", " \"2. NEVER expand acronyms.\\n\"\n", " \"3. NEVER introduce new terms not present in the original query.\\n\"\n", " \"4. NEVER infer missing information.\\n\"\n", " \"5. NEVER add explanations, definitions, or interpretations.\\n\"\n", " \"6. Preserve all technical tokens exactly as written.\\n\"\n", " \"7. Only remove filler words (e.g., what, does, is, explain, tell me, please).\\n\"\n", " \"8. You may reorder terms for better retrieval.\\n\"\n", " \"9. Output must be a single-line plain keyword query.\\n\"\n", " \"10. If the query is already optimal, return it unchanged.\\n\\n\"\n", " \"11. If you receive something that looks like code, do NOT attempt to rewrite it. 
Return it verbatim.\\n\\n\"\n", "\n", " \"ALLOWED OPERATIONS:\\n\"\n", " \"- Remove interrogative phrasing.\\n\"\n", " \"- Remove stopwords.\\n\"\n", " \"- Reorder words.\\n\"\n", " \"- Convert to noun phrase form.\\n\\n\"\n", "\n", " \"FORBIDDEN OPERATIONS:\\n\"\n", " \"- Expanding abbreviations.\\n\"\n", " \"- Paraphrasing into unseen vocabulary.\\n\"\n", " \"- Adding definitions.\\n\"\n", " \"- Answering implicitly.\\n\\n\"\n", "\n", " \"Examples:\\n\"\n", " \"Input: What does AVAP stand for?\\n\"\n", " \"Output: AVAP stand for\\n\"\n", "\n", " \"Input: Hey, I'm trying to understand how AVAP handels a ZeroDivisionError when doing divison or modulus operatoins. Can you explane what situatoins cause a ZeroDivisionError to be raised and how I can catch it in my AVAP scripts?\\n\"\n", " \"Output: AVAP ZeroDivisionError division / modulus % catch try except\\n\"\n", " \n", " \"Input: What does AVAP stand for?\\n\"\n", " \"Output: AVAP stand for\\n\"\n", "\n", " \"Input: Please explain how the import statement works in AVAP scripts.\\n\"\n", " \"Output: AVAP import statement syntax behavior\\n\\n\"\n", "\n", " \"Return only the rewritten query.\"\n", " )\n", ")\n", "\n", "GENERATE_PROMPT = SystemMessage(\n", " content=(\n", " \"\"\"You are a retrieval-grounded assistant.\n", "\n", " CRITICAL SCOPE RULES\n", " - AVAP (Advanced Virtual API Programming) is a new/unknown programming language.\n", " - If the user message is AVAP-related (explicitly mentions \"AVAP\" case-insensitive OR the provided context is AVAP code/docs), you MUST answer using ONLY .\n", " - You MUST NOT use general programming knowledge to interpret AVAP.\n", " - You MUST NOT assume AVAP behaves like any other language even if syntax looks similar.\n", " - You MUST NOT infer missing details. 
Only state what is explicitly present in .\n", "\n", " GROUNDING REQUIREMENTS (AVAP-RELATED)\n", " 1) Every non-trivial factual claim MUST be directly supported by an EXACT QUOTE from .\n", " 2) If a claim is not supported by a quote, DO NOT include it.\n", " 3) If does not contain enough information to answer, reply with EXACTLY:\n", " \"I don't have enough information in the provided context to answer that.\"\n", "\n", " WORKFLOW (AVAP-RELATED) — FOLLOW IN ORDER\n", " A) Identify the specific question(s) being asked.\n", " B) Extract the minimum necessary quotes from that answer those question(s).\n", " C) Write the answer using ONLY those quotes (paraphrase is allowed, but every statement must be backed by at least one quote).\n", " D) Verify: for EACH sentence in your answer, confirm there is a supporting quote. If any sentence lacks a quote, delete it or refuse.\n", "\n", " OUTPUT FORMAT (AVAP-RELATED ONLY)\n", " Answer:\n", " \n", "\n", " Evidence:\n", " - \"\"\n", " - \"\"\n", " (Include only quotes you actually used. 
Prefer the smallest quotes that fully support the statements.)\n", "\n", " NON-AVAP QUESTIONS\n", " - If the question is clearly not AVAP-related, answer normally using general knowledge.\n", "\n", " \n", " {context}\n", " \"\"\"\n", " )\n", ")\n", "\n",
"# Search options handed to the retriever: number of chunks fetched per query.\n",
"retrieve_kwargs = {\n",
" \"k\": 3\n",
"}\n",
"\n",
"\n",
"def format_context(docs: List[Document]) -> str:\n",
"    \"\"\"Render retrieved documents as one numbered, source-tagged context string.\n",
"\n",
"    Each document is rendered as ``[i] id=ID source=SOURCE`` followed by its text,\n",
"    and the rendered documents are joined with a blank line. Missing metadata\n",
"    falls back to ``chunk-i`` for the id and ``Untitled`` for the source.\n",
"    \"\"\"\n",
"    chunks: List[str] = []\n",
"    for i, doc in enumerate(docs, 1):\n",
"        metadata = doc.metadata or {}\n",
"        source = metadata.get(\"source\", \"Untitled\")\n",
"        source_id = metadata.get(\"id\", f\"chunk-{i}\")\n",
"        text = doc.page_content or \"\"\n",
"        chunks.append(f\"[{i}] id={source_id} source={source}\\n{text}\")\n",
"    return \"\\n\\n\".join(chunks)\n",
"\n",
"\n",
"def _message_text(message) -> str:\n",
"    \"\"\"Return the plain text carried by a chat message.\n",
"\n",
"    LangChain allows ``message.content`` to be a plain string or a list of\n",
"    content blocks (strings, or dicts whose type is text); text blocks are\n",
"    concatenated and any other block is ignored.\n",
"    \"\"\"\n",
"    content = message.content\n",
"    if isinstance(content, str):\n",
"        return content\n",
"    parts: List[str] = []\n",
"    for block in content:\n",
"        if isinstance(block, str):\n",
"            parts.append(block)\n",
"        elif isinstance(block, dict) and block.get(\"type\") == \"text\":\n",
"            parts.append(block.get(\"text\") or \"\")\n",
"    return \"\".join(parts)\n",
"\n",
"\n",
"def reformulate(state: AgentState) -> AgentState:\n",
"    \"\"\"Use the LLM to rewrite the user query for better retrieval.\n",
"\n",
"    Takes the latest message in the state and returns a partial state update\n",
"    holding ``reformulated_query``. When the model reply contains no text, the\n",
"    original user text is used instead so retrieval never gets an empty query.\n",
"    \"\"\"\n",
"    user_msg = state[\"messages\"][-1]\n",
"    resp = llm.invoke([REFORMULATE_PROMPT, user_msg])\n",
"    # resp.content is not guaranteed to be a str, so go through _message_text.\n",
"    reformulated = _message_text(resp).strip() or _message_text(user_msg).strip()\n",
"    print(f\"[reformulate] '{user_msg.content}' → '{reformulated}'\")\n",
"    return {\"reformulated_query\": reformulated}\n",
"\n",
"\n",
"def retrieve(state: AgentState) -> AgentState:\n",
"    \"\"\"Retrieve context using the reformulated query.\n",
"\n",
"    Returns a partial state update with ``context`` (the formatted string that\n",
"    is injected into the generation prompt) and ``contexts`` (the raw text of\n",
"    each retrieved document, in retrieval order).\n",
"    \"\"\"\n",
"    query = state[\"reformulated_query\"]\n",
"    retriever = vector_store.as_retriever(\n",
"        search_type=\"similarity\",\n",
"        search_kwargs=retrieve_kwargs,\n",
"    )\n",
"    docs = retriever.invoke(query)\n",
"\n",
"    context = format_context(docs)\n",
"    contexts = [d.page_content or \"\" for d in docs]  # <-- for Dataset\n",
"\n",
"    print(f\"[retrieve] {len(docs)} docs fetched\")\n",
"    return {\"context\": context, \"contexts\": contexts}\n",
"\n",
"\n",
"def generate(state: AgentState) -> AgentState:\n",
"    \"\"\"Generate the final answer using retrieved context.\n",
"\n",
"    Fills the ``{context}`` placeholder of GENERATE_PROMPT with the retrieved\n",
"    context, sends it ahead of the conversation messages, and returns the\n",
"    model reply as a partial state update under ``messages``.\n",
"    \"\"\"\n",
"    # NOTE(review): answers are produced with the module-level `llm`; `agent_llm` is\n",
"    # also created during setup - confirm which model is meant to generate here.\n",
"    prompt = SystemMessage(\n",
"        content=GENERATE_PROMPT.content.format(context=state[\"context\"])\n",
"    )\n",
"    resp = llm.invoke([prompt] + state[\"messages\"])\n",
"    return {\"messages\": [resp]}\n",
"\n",
"\n",
"# Linear pipeline: reformulate -> retrieve -> generate -> END.\n",
"graph_builder = StateGraph(AgentState)\n",
"\n",
"graph_builder.add_node(\"reformulate\", reformulate)\n",
"graph_builder.add_node(\"retrieve\", retrieve)\n",
"graph_builder.add_node(\"generate\", generate)\n",
"\n",
"graph_builder.set_entry_point(\"reformulate\")\n",
"graph_builder.add_edge(\"reformulate\", \"retrieve\")\n",
"graph_builder.add_edge(\"retrieve\", \"generate\")\n",
"graph_builder.add_edge(\"generate\", END)\n",
"\n",
"graph = graph_builder.compile()" ] }, { "cell_type": "markdown", "id": "9b723a42", "metadata": {}, "source": [ "### Create synthetic data (question, context, answer with SoTA model)" ] }, { "cell_type": "code", "execution_count": 38, "id": "fe524d14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 28 documents from /home/acano/PycharmProjects/assistance-engine/data/raw\n" ] } ], "source": [
"# Load every .txt file under RAW_DIR/docs as one Document tagged with its file name.\n",
"docs: list[Document] = []\n",
"for txt_file in sorted((RAW_DIR / \"docs\").glob(\"*.txt\")):\n",
"    text = txt_file.read_text(encoding=\"utf-8\")\n",
"    docs.append(Document(page_content=text, metadata={\"source\": txt_file.name}))\n",
"\n",
"print(f\"Loaded {len(docs)} documents from {RAW_DIR}\")" ] }, { "cell_type": "code", "execution_count": 39, "id": "ab1932b7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
"/tmp/ipykernel_102010/1545617568.py:1: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n",
" synth = SingleHopSpecificQuerySynthesizer(llm=LangchainLLMWrapper(llm))\n",
"/tmp/ipykernel_102010/1545617568.py:3: DeprecationWarning: LangchainLLMWrapper is deprecated and will be removed in a future version. 
Use llm_factory instead: from openai import OpenAI; from ragas.llms import llm_factory; llm = llm_factory('gpt-4o-mini', client=OpenAI(api_key='...'))\n", " generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n", "/tmp/ipykernel_102010/1545617568.py:3: DeprecationWarning: LangchainEmbeddingsWrapper is deprecated and will be removed in a future version. Use the modern embedding providers instead: embedding_factory('openai', model='text-embedding-3-small', client=openai_client) or from ragas.embeddings import OpenAIEmbeddings, GoogleEmbeddings, HuggingFaceEmbeddings\n", " generator = TestsetGenerator(llm=LangchainLLMWrapper(llm), embedding_model=LangchainEmbeddingsWrapper(embeddings))\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "78fe99b4108b4731845407d26779bf25", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Applying SummaryExtractor: 0%| | 0/28 [00:00' and '<' in AVAP™ are used for numeric magnitude comparison.\", \"reason\": \"This specific operator description is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The operator 'in' in AVAP™ checks whether an element belongs to a list or string.\", \"reason\": \"This specific operator description is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"An IF block in AVAP™ may include an optional ELSE() block.\", \"reason\": \"The ground truth does not mention that the ELSE() block is optional; it only describes its behavior when present.\"}, {\"statement\": \"In the provided example, the variable selector is initialized with the value 'yes'.\", \"reason\": \"This specific detail about variable initialization is not mentioned in the ground truth.\"}, {\"statement\": \"Inside the IF() block, addVar(result,1) is executed, which assigns the value 1 to the result variable.\", \"reason\": \"This specific detail about the addVar command and 
result variable is not mentioned in the ground truth.\"}, {\"statement\": \"AVAP™ allows omission of comparison parameters to evaluate a complete logical expression directly in the third parameter.\", \"reason\": \"This specific behavior about omitting comparison parameters is not mentioned or supported by any statement in the ground truth.\"}, {\"statement\": \"The result returned by the API for the provided example has a status value of true.\", \"reason\": \"This specific detail about the API result status is not mentioned in the ground truth.\"}, {\"statement\": \"The result returned by the API for the provided example has a result value of 1.\", \"reason\": \"This specific detail about the API result value is not mentioned in the ground truth.\"}, {\"statement\": \"A status value of true in the API result indicates that the execution was successful.\", \"reason\": \"This interpretation of the API status value is not mentioned or supported by any statement in the ground truth.\"}], \"FN\": [{\"statement\": \"The basic syntax of the IF-THEN-ELSE statement in AVAP™ is IF(condition, true_value, operator).\", \"reason\": \"While individual parameters are mentioned in the answer, the complete basic syntax format is not explicitly stated.\"}, {\"statement\": \"If the condition in IF(selector,'yes','=') is false, the code inside the ELSE() block runs.\", \"reason\": \"The answer mentions the general ELSE() behavior but does not explicitly state this in the context of the IF(selector,'yes','=') example.\"}]}\n", "For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE )\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_inputretrieved_contextsresponsereferencefaithfulnessanswer_relevancycontext_precisioncontext_recallcontext_entity_recallanswer_similarityanswer_correctness
0So I been trying to understand how AVAP do the...[Chapter 5: Data Types In this chapter, we wil...# Arithmetic Conversions in AVAP\\n\\nWhen you'r...In AVAP, when an arithmetic operator is descri...0.9230770.9159340.3333331.00.0000000.9314320.778313
1Hey, I'm trying to undrestand how arithmatic c...[6. Expressions in AVAP This chapter explains ...# Arithmetic Conversions in AVAP\\n\\nWhen using...In AVAP, when an arithmetic operator is descri...0.8571430.9188211.0000001.00.2500000.9405880.770861
2Hey, I'm trying to undrestand how sliceing wor...[6. Expressions in AVAP This chapter explains ...# Slicing Semantics in AVAP\\n\\nHere is a full ...In AVAP, when you perform a slice operation, t...1.0000000.8960081.0000001.00.0714290.9220160.865119
3How does the __call__ method relate to callabl...[6. Expressions in AVAP This chapter explains ...Answer:\\n\\nIn AVAP, the `__call__` method is w...In AVAP, any object that has a __call__() meth...1.0000000.8963141.0000001.00.1428570.8786170.894654
4How does Decimel work in Python value comparis...[Binary Arithmetic Operations Binary arithmeti...This question is about Python value comparison...In Python, decimal.Decimal (from the standard ...0.8235290.8342901.0000001.01.0000000.8250500.686262
....................................
95What are the diffrent data typs available in A...[Chapter 5: Data Types In this chapter, we wil...# Data Types in AVAP™ and Their Comparison to ...In AVAP™, just like in Python, data types are ...0.9310340.9806441.0000001.00.5238100.9086510.670344
96In Python and AVAP how do you does conversion ...[Chapter 5: Data Types In this chapter, we wil...# Data Type Conversion in AVAP™\\n\\nIn AVAP™, y...In AVAP™, just like in Python, it is possible ...0.8181820.8874621.0000001.00.4000000.9026310.943049
97How does AVAP compare to Python in terms of ba...[Chapter 5: Data Types In this chapter, we wil...# AVAP™ Compared to Python: Basic Data Types a...AVAP shares several similarities with Python w...0.8260870.9894681.0000001.00.5000000.8939030.877322
98Hey so I been trying to learn AVAP and I notic...[Chapter 5: Data Types In this chapter, we wil...# AVAP Basic Data Types, Conversions, and Oper...In AVAP, just like in Python, there are severa...1.0000000.8907961.0000001.00.4117650.8832870.915944
99How AVAP data types is similar to Python?[Introduction The data model in AVAP™ defines ...# AVAP Data Types: Similarities to Python\\n\\nB...In AVAP™, like in Python, there are several ba...0.9565220.9886501.0000001.00.5555560.8575550.611448
\n", "

100 rows × 11 columns

\n", "
" ], "text/plain": [ " user_input \\\n", "0 So I been trying to understand how AVAP do the... \n", "1 Hey, I'm trying to undrestand how arithmatic c... \n", "2 Hey, I'm trying to undrestand how sliceing wor... \n", "3 How does the __call__ method relate to callabl... \n", "4 How does Decimel work in Python value comparis... \n", ".. ... \n", "95 What are the diffrent data typs available in A... \n", "96 In Python and AVAP how do you does conversion ... \n", "97 How does AVAP compare to Python in terms of ba... \n", "98 Hey so I been trying to learn AVAP and I notic... \n", "99 How AVAP data types is similar to Python? \n", "\n", " retrieved_contexts \\\n", "0 [Chapter 5: Data Types In this chapter, we wil... \n", "1 [6. Expressions in AVAP This chapter explains ... \n", "2 [6. Expressions in AVAP This chapter explains ... \n", "3 [6. Expressions in AVAP This chapter explains ... \n", "4 [Binary Arithmetic Operations Binary arithmeti... \n", ".. ... \n", "95 [Chapter 5: Data Types In this chapter, we wil... \n", "96 [Chapter 5: Data Types In this chapter, we wil... \n", "97 [Chapter 5: Data Types In this chapter, we wil... \n", "98 [Chapter 5: Data Types In this chapter, we wil... \n", "99 [Introduction The data model in AVAP™ defines ... \n", "\n", " response \\\n", "0 # Arithmetic Conversions in AVAP\\n\\nWhen you'r... \n", "1 # Arithmetic Conversions in AVAP\\n\\nWhen using... \n", "2 # Slicing Semantics in AVAP\\n\\nHere is a full ... \n", "3 Answer:\\n\\nIn AVAP, the `__call__` method is w... \n", "4 This question is about Python value comparison... \n", ".. ... \n", "95 # Data Types in AVAP™ and Their Comparison to ... \n", "96 # Data Type Conversion in AVAP™\\n\\nIn AVAP™, y... \n", "97 # AVAP™ Compared to Python: Basic Data Types a... \n", "98 # AVAP Basic Data Types, Conversions, and Oper... \n", "99 # AVAP Data Types: Similarities to Python\\n\\nB... \n", "\n", " reference faithfulness \\\n", "0 In AVAP, when an arithmetic operator is descri... 
0.923077 \n", "1 In AVAP, when an arithmetic operator is descri... 0.857143 \n", "2 In AVAP, when you perform a slice operation, t... 1.000000 \n", "3 In AVAP, any object that has a __call__() meth... 1.000000 \n", "4 In Python, decimal.Decimal (from the standard ... 0.823529 \n", ".. ... ... \n", "95 In AVAP™, just like in Python, data types are ... 0.931034 \n", "96 In AVAP™, just like in Python, it is possible ... 0.818182 \n", "97 AVAP shares several similarities with Python w... 0.826087 \n", "98 In AVAP, just like in Python, there are severa... 1.000000 \n", "99 In AVAP™, like in Python, there are several ba... 0.956522 \n", "\n", " answer_relevancy context_precision context_recall \\\n", "0 0.915934 0.333333 1.0 \n", "1 0.918821 1.000000 1.0 \n", "2 0.896008 1.000000 1.0 \n", "3 0.896314 1.000000 1.0 \n", "4 0.834290 1.000000 1.0 \n", ".. ... ... ... \n", "95 0.980644 1.000000 1.0 \n", "96 0.887462 1.000000 1.0 \n", "97 0.989468 1.000000 1.0 \n", "98 0.890796 1.000000 1.0 \n", "99 0.988650 1.000000 1.0 \n", "\n", " context_entity_recall answer_similarity answer_correctness \n", "0 0.000000 0.931432 0.778313 \n", "1 0.250000 0.940588 0.770861 \n", "2 0.071429 0.922016 0.865119 \n", "3 0.142857 0.878617 0.894654 \n", "4 1.000000 0.825050 0.686262 \n", ".. ... ... ... 
\n", "95 0.523810 0.908651 0.670344 \n", "96 0.400000 0.902631 0.943049 \n", "97 0.500000 0.893903 0.877322 \n", "98 0.411765 0.883287 0.915944 \n", "99 0.555556 0.857555 0.611448 \n", "\n", "[100 rows x 11 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrics = [\n", " faithfulness,\n", " answer_relevancy,\n", " context_precision,\n", " context_recall,\n", " context_entity_recall,\n", " answer_similarity,\n", " answer_correctness\n", "]\n", "\n", "result = evaluate(\n", " dataset=dataset, \n", " metrics=metrics,\n", " llm=llm,\n", " embeddings=embeddings,\n", ")\n", "\n", "result_df = result.to_pandas()\n", "result_df" ] }, { "cell_type": "code", "execution_count": 45, "id": "20c3fa64", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "faithfulness 0.958168\n", "answer_relevancy 0.821373\n", "context_precision 0.819167\n", "context_recall 0.896841\n", "context_entity_recall 0.389715\n", "answer_similarity 0.876510\n", "answer_correctness 0.725832\n", "dtype: float64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_df.mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 47, "id": "350755fd", "metadata": {}, "outputs": [], "source": [ "result_df.to_csv(INTERIM_DIR / \"retrieve_eval_results/ragas_eval.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 48, "id": "71743384", "metadata": {}, "outputs": [], "source": [ "from evidently import Dataset, DataDefinition, Report\n", "from evidently.descriptors import *\n", "from evidently.llm.options import OllamaOptions\n", "from evidently.presets import TextEvals\n", "from evidently.metrics import *\n", "from evidently.tests import *" ] }, { "cell_type": "code", "execution_count": 49, "id": "4a1210cb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionanswercontextsground_truthground_truth_contexts
0So I been trying to understand how AVAP do the...# Arithmetic Conversions in AVAP\\n\\nWhen you'r...['Chapter 5: Data Types In this chapter, we wi...In AVAP, when an arithmetic operator is descri...['6. Expressions in AVAP\\nThis chapter explain...
1Hey, I'm trying to undrestand how arithmatic c...# Arithmetic Conversions in AVAP\\n\\nWhen using...['6. Expressions in AVAP This chapter explains...In AVAP, when an arithmetic operator is descri...['6. Expressions in AVAP\\nThis chapter explain...
2Hey, I'm trying to undrestand how sliceing wor...# Slicing Semantics in AVAP\\n\\nHere is a full ...['6. Expressions in AVAP This chapter explains...In AVAP, when you perform a slice operation, t...['6. Expressions in AVAP\\nThis chapter explain...
3How does the __call__ method relate to callabl...Answer:\\n\\nIn AVAP, the `__call__` method is w...['6. Expressions in AVAP This chapter explains...In AVAP, any object that has a __call__() meth...['6. Expressions in AVAP\\nThis chapter explain...
4How does Decimel work in Python value comparis...This question is about Python value comparison...['Binary Arithmetic Operations Binary arithmet...In Python, decimal.Decimal (from the standard ...['Binary Arithmetic Operations\\nBinary arithme...
5How does the Decimal type from the standrd lib...I don't have enough information in the provide...['6. Expressions in AVAP This chapter explains...According to the context, decimal.Decimal is o...['Binary Arithmetic Operations\\nBinary arithme...
6Under what circumstances does a TypeError occu...Answer:\\nA `TypeError` occurs during compariso...['Binary Arithmetic Operations Binary arithmet...A TypeError is generated when attempting to pe...['Binary Arithmetic Operations\\nBinary arithme...
7According to the AVAP documentation, what does...I don't have enough information in the provide...['6. Expressions in AVAP This chapter explains...PEP 8 advises that comparisons for singletons,...['Binary Arithmetic Operations\\nBinary arithme...
8hey so i been reading about expression stateme...Answer:\\n\\nIn AVAP, when you use an expression...[\"Expressions in AVAP™ Introduction Expression...In interactive mode in AVAP, when an expressio...['Simple Statements\\nIn AVAP, a simple stateme...
9AVAP programming language how do simple statem...# Simple Statements in AVAP\\n\\n## Overview\\n\\n...['Simple Statements In AVAP, a simple statemen...In AVAP, a simple statement consists of a sing...['Simple Statements\\nIn AVAP, a simple stateme...
\n", "
" ], "text/plain": [ " question \\\n", "0 So I been trying to understand how AVAP do the... \n", "1 Hey, I'm trying to undrestand how arithmatic c... \n", "2 Hey, I'm trying to undrestand how sliceing wor... \n", "3 How does the __call__ method relate to callabl... \n", "4 How does Decimel work in Python value comparis... \n", "5 How does the Decimal type from the standrd lib... \n", "6 Under what circumstances does a TypeError occu... \n", "7 According to the AVAP documentation, what does... \n", "8 hey so i been reading about expression stateme... \n", "9 AVAP programming language how do simple statem... \n", "\n", " answer \\\n", "0 # Arithmetic Conversions in AVAP\\n\\nWhen you'r... \n", "1 # Arithmetic Conversions in AVAP\\n\\nWhen using... \n", "2 # Slicing Semantics in AVAP\\n\\nHere is a full ... \n", "3 Answer:\\n\\nIn AVAP, the `__call__` method is w... \n", "4 This question is about Python value comparison... \n", "5 I don't have enough information in the provide... \n", "6 Answer:\\nA `TypeError` occurs during compariso... \n", "7 I don't have enough information in the provide... \n", "8 Answer:\\n\\nIn AVAP, when you use an expression... \n", "9 # Simple Statements in AVAP\\n\\n## Overview\\n\\n... \n", "\n", " contexts \\\n", "0 ['Chapter 5: Data Types In this chapter, we wi... \n", "1 ['6. Expressions in AVAP This chapter explains... \n", "2 ['6. Expressions in AVAP This chapter explains... \n", "3 ['6. Expressions in AVAP This chapter explains... \n", "4 ['Binary Arithmetic Operations Binary arithmet... \n", "5 ['6. Expressions in AVAP This chapter explains... \n", "6 ['Binary Arithmetic Operations Binary arithmet... \n", "7 ['6. Expressions in AVAP This chapter explains... \n", "8 [\"Expressions in AVAP™ Introduction Expression... \n", "9 ['Simple Statements In AVAP, a simple statemen... \n", "\n", " ground_truth \\\n", "0 In AVAP, when an arithmetic operator is descri... \n", "1 In AVAP, when an arithmetic operator is descri... 
\n", "2 In AVAP, when you perform a slice operation, t... \n", "3 In AVAP, any object that has a __call__() meth... \n", "4 In Python, decimal.Decimal (from the standard ... \n", "5 According to the context, decimal.Decimal is o... \n", "6 A TypeError is generated when attempting to pe... \n", "7 PEP 8 advises that comparisons for singletons,... \n", "8 In interactive mode in AVAP, when an expressio... \n", "9 In AVAP, a simple statement consists of a sing... \n", "\n", " ground_truth_contexts \n", "0 ['6. Expressions in AVAP\\nThis chapter explain... \n", "1 ['6. Expressions in AVAP\\nThis chapter explain... \n", "2 ['6. Expressions in AVAP\\nThis chapter explain... \n", "3 ['6. Expressions in AVAP\\nThis chapter explain... \n", "4 ['Binary Arithmetic Operations\\nBinary arithme... \n", "5 ['Binary Arithmetic Operations\\nBinary arithme... \n", "6 ['Binary Arithmetic Operations\\nBinary arithme... \n", "7 ['Binary Arithmetic Operations\\nBinary arithme... \n", "8 ['Simple Statements\\nIn AVAP, a simple stateme... \n", "9 ['Simple Statements\\nIn AVAP, a simple stateme... " ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_synthetic_dataset = pd.read_csv(INTERIM_DIR / \"retrieve_eval_results/full_synthetic_dataset.csv\")\n", "full_synthetic_dataset = full_synthetic_dataset[0:10]\n", "full_synthetic_dataset" ] }, { "cell_type": "code", "execution_count": 50, "id": "e1ac1a41", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionanswercontextsground_truthground_truth_contextsContextQualityContextQuality scoreContextQuality reasoningContextRelevanceContextRelevance scoresFaithfulnessFaithfulness scoreFaithfulness reasoningCompletenessCompleteness scoreCompleteness reasoningCorrectnessCorrectness scoreCorrectness reasoning
0So I been trying to understand how AVAP do the...# Arithmetic Conversions in AVAP\\n\\nWhen you'r...['Chapter 5: Data Types In this chapter, we wi...In AVAP, when an arithmetic operator is descri...['6. Expressions in AVAP\\nThis chapter explain...VALID0.90The text contains a section (6.1. Arithmetic C...0.836118[0.8361175]UNFAITHFUL0.30The response is largely faithful to the source...COMPLETE0.05The output covers all the key information from...CORRECT0.00The OUTPUT accurately conveys all the key info...
1Hey, I'm trying to undrestand how arithmatic c...# Arithmetic Conversions in AVAP\\n\\nWhen using...['6. Expressions in AVAP This chapter explains...In AVAP, when an arithmetic operator is descri...['6. Expressions in AVAP\\nThis chapter explain...VALID0.95The text contains the specific section '6.1. A...0.828277[0.82827663]FAITHFUL0.05The response accurately summarizes the arithme...COMPLETE0.05The output covers all the key information from...CORRECT0.00The output accurately conveys all the key info...
2Hey, I'm trying to undrestand how sliceing wor...# Slicing Semantics in AVAP\\n\\nHere is a full ...['6. Expressions in AVAP This chapter explains...In AVAP, when you perform a slice operation, t...['6. Expressions in AVAP\\nThis chapter explain...VALID0.95The first document contains a dedicated 'Slice...0.647847[0.6478467]FAITHFUL0.05The response accurately reproduces the slicing...COMPLETE0.05The output covers all the key information from...CORRECT0.05The OUTPUT accurately conveys all the key fact...
3How does the __call__ method relate to callabl...Answer:\\n\\nIn AVAP, the `__call__` method is w...['6. Expressions in AVAP This chapter explains...In AVAP, any object that has a __call__() meth...['6. Expressions in AVAP\\nThis chapter explain...VALID0.85The first text chunk explicitly addresses the ...0.682974[0.6829743]FAITHFUL0.00The response accurately quotes and summarizes ...INCOMPLETE0.85The output focuses narrowly on only one aspect...CORRECT0.10The OUTPUT conveys the same core facts as the ...
4How does Decimel work in Python value comparis...This question is about Python value comparison...['Binary Arithmetic Operations Binary arithmet...In Python, decimal.Decimal (from the standard ...['Binary Arithmetic Operations\\nBinary arithme...VALID0.65The first text block explicitly mentions decim...0.716216[0.7162162]UNFAITHFUL0.35The response is mostly faithful to the source,...COMPLETE0.10The output accurately captures the key informa...CORRECT0.20The OUTPUT correctly conveys the main facts fr...
5How does the Decimal type from the standrd lib...I don't have enough information in the provide...['6. Expressions in AVAP This chapter explains...According to the context, decimal.Decimal is o...['Binary Arithmetic Operations\\nBinary arithme...INVALID0.05The question specifically asks about how the D...0.763026[0.76302594]FAITHFUL0.05The response accurately states that it cannot ...COMPLETE0.05The output is a response explaining that the s...INCORRECT0.85The OUTPUT claims there is not enough informat...
6Under what circumstances does a TypeError occu...Answer:\\nA `TypeError` occurs during compariso...['Binary Arithmetic Operations Binary arithmet...A TypeError is generated when attempting to pe...['Binary Arithmetic Operations\\nBinary arithme...VALID0.90The first document (Python documentation) expl...0.632650[0.6326503]FAITHFUL0.05The response accurately reflects information f...COMPLETE0.10The output accurately covers all the main circ...INCORRECT0.30The OUTPUT correctly conveys the main facts fr...
7According to the AVAP documentation, what does...I don't have enough information in the provide...['6. Expressions in AVAP This chapter explains...PEP 8 advises that comparisons for singletons,...['Binary Arithmetic Operations\\nBinary arithme...INVALID0.05The question asks specifically about what PEP ...0.624027[0.6240271]FAITHFUL0.00The response is a refusal to answer, stating t...COMPLETE0.05The output is a complete response to a questio...INCORRECT0.90The OUTPUT claims there is not enough informat...
8hey so i been reading about expression stateme...Answer:\\n\\nIn AVAP, when you use an expression...[\"Expressions in AVAP™ Introduction Expression...In interactive mode in AVAP, when an expressio...['Simple Statements\\nIn AVAP, a simple stateme...VALID0.95The text contains a direct and complete answer...0.776070[0.7760695]FAITHFUL0.00The response accurately reflects information f...INCOMPLETE0.85The output focuses on a very narrow aspect of ...CORRECT0.00The OUTPUT accurately conveys all the key fact...
9AVAP programming language how do simple statem...# Simple Statements in AVAP\\n\\n## Overview\\n\\n...['Simple Statements In AVAP, a simple statemen...In AVAP, a simple statement consists of a sing...['Simple Statements\\nIn AVAP, a simple stateme...VALID0.92The text provides comprehensive information ab...0.883411[0.8834107]FAITHFUL0.00The response accurately reproduces information...INCOMPLETE0.85The OUTPUT only covers the introductory overvi...INCORRECT0.30The OUTPUT contains most of the same informati...
\n", "
" ], "text/plain": [ " question \\\n", "0 So I been trying to understand how AVAP do the... \n", "1 Hey, I'm trying to undrestand how arithmatic c... \n", "2 Hey, I'm trying to undrestand how sliceing wor... \n", "3 How does the __call__ method relate to callabl... \n", "4 How does Decimel work in Python value comparis... \n", "5 How does the Decimal type from the standrd lib... \n", "6 Under what circumstances does a TypeError occu... \n", "7 According to the AVAP documentation, what does... \n", "8 hey so i been reading about expression stateme... \n", "9 AVAP programming language how do simple statem... \n", "\n", " answer \\\n", "0 # Arithmetic Conversions in AVAP\\n\\nWhen you'r... \n", "1 # Arithmetic Conversions in AVAP\\n\\nWhen using... \n", "2 # Slicing Semantics in AVAP\\n\\nHere is a full ... \n", "3 Answer:\\n\\nIn AVAP, the `__call__` method is w... \n", "4 This question is about Python value comparison... \n", "5 I don't have enough information in the provide... \n", "6 Answer:\\nA `TypeError` occurs during compariso... \n", "7 I don't have enough information in the provide... \n", "8 Answer:\\n\\nIn AVAP, when you use an expression... \n", "9 # Simple Statements in AVAP\\n\\n## Overview\\n\\n... \n", "\n", " contexts \\\n", "0 ['Chapter 5: Data Types In this chapter, we wi... \n", "1 ['6. Expressions in AVAP This chapter explains... \n", "2 ['6. Expressions in AVAP This chapter explains... \n", "3 ['6. Expressions in AVAP This chapter explains... \n", "4 ['Binary Arithmetic Operations Binary arithmet... \n", "5 ['6. Expressions in AVAP This chapter explains... \n", "6 ['Binary Arithmetic Operations Binary arithmet... \n", "7 ['6. Expressions in AVAP This chapter explains... \n", "8 [\"Expressions in AVAP™ Introduction Expression... \n", "9 ['Simple Statements In AVAP, a simple statemen... \n", "\n", " ground_truth \\\n", "0 In AVAP, when an arithmetic operator is descri... \n", "1 In AVAP, when an arithmetic operator is descri... 
\n", "2 In AVAP, when you perform a slice operation, t... \n", "3 In AVAP, any object that has a __call__() meth... \n", "4 In Python, decimal.Decimal (from the standard ... \n", "5 According to the context, decimal.Decimal is o... \n", "6 A TypeError is generated when attempting to pe... \n", "7 PEP 8 advises that comparisons for singletons,... \n", "8 In interactive mode in AVAP, when an expressio... \n", "9 In AVAP, a simple statement consists of a sing... \n", "\n", " ground_truth_contexts ContextQuality \\\n", "0 ['6. Expressions in AVAP\\nThis chapter explain... VALID \n", "1 ['6. Expressions in AVAP\\nThis chapter explain... VALID \n", "2 ['6. Expressions in AVAP\\nThis chapter explain... VALID \n", "3 ['6. Expressions in AVAP\\nThis chapter explain... VALID \n", "4 ['Binary Arithmetic Operations\\nBinary arithme... VALID \n", "5 ['Binary Arithmetic Operations\\nBinary arithme... INVALID \n", "6 ['Binary Arithmetic Operations\\nBinary arithme... VALID \n", "7 ['Binary Arithmetic Operations\\nBinary arithme... INVALID \n", "8 ['Simple Statements\\nIn AVAP, a simple stateme... VALID \n", "9 ['Simple Statements\\nIn AVAP, a simple stateme... VALID \n", "\n", " ContextQuality score ContextQuality reasoning \\\n", "0 0.90 The text contains a section (6.1. Arithmetic C... \n", "1 0.95 The text contains the specific section '6.1. A... \n", "2 0.95 The first document contains a dedicated 'Slice... \n", "3 0.85 The first text chunk explicitly addresses the ... \n", "4 0.65 The first text block explicitly mentions decim... \n", "5 0.05 The question specifically asks about how the D... \n", "6 0.90 The first document (Python documentation) expl... \n", "7 0.05 The question asks specifically about what PEP ... \n", "8 0.95 The text contains a direct and complete answer... \n", "9 0.92 The text provides comprehensive information ab... 
\n", "\n", " ContextRelevance ContextRelevance scores Faithfulness Faithfulness score \\\n", "0 0.836118 [0.8361175] UNFAITHFUL 0.30 \n", "1 0.828277 [0.82827663] FAITHFUL 0.05 \n", "2 0.647847 [0.6478467] FAITHFUL 0.05 \n", "3 0.682974 [0.6829743] FAITHFUL 0.00 \n", "4 0.716216 [0.7162162] UNFAITHFUL 0.35 \n", "5 0.763026 [0.76302594] FAITHFUL 0.05 \n", "6 0.632650 [0.6326503] FAITHFUL 0.05 \n", "7 0.624027 [0.6240271] FAITHFUL 0.00 \n", "8 0.776070 [0.7760695] FAITHFUL 0.00 \n", "9 0.883411 [0.8834107] FAITHFUL 0.00 \n", "\n", " Faithfulness reasoning Completeness \\\n", "0 The response is largely faithful to the source... COMPLETE \n", "1 The response accurately summarizes the arithme... COMPLETE \n", "2 The response accurately reproduces the slicing... COMPLETE \n", "3 The response accurately quotes and summarizes ... INCOMPLETE \n", "4 The response is mostly faithful to the source,... COMPLETE \n", "5 The response accurately states that it cannot ... COMPLETE \n", "6 The response accurately reflects information f... COMPLETE \n", "7 The response is a refusal to answer, stating t... COMPLETE \n", "8 The response accurately reflects information f... INCOMPLETE \n", "9 The response accurately reproduces information... INCOMPLETE \n", "\n", " Completeness score Completeness reasoning \\\n", "0 0.05 The output covers all the key information from... \n", "1 0.05 The output covers all the key information from... \n", "2 0.05 The output covers all the key information from... \n", "3 0.85 The output focuses narrowly on only one aspect... \n", "4 0.10 The output accurately captures the key informa... \n", "5 0.05 The output is a response explaining that the s... \n", "6 0.10 The output accurately covers all the main circ... \n", "7 0.05 The output is a complete response to a questio... \n", "8 0.85 The output focuses on a very narrow aspect of ... \n", "9 0.85 The OUTPUT only covers the introductory overvi... 
# Keyword arguments shared by the four LLM-judged descriptors below.
# `include_score=True` corresponds to the "<alias> score" columns in the output.
_llm_judge_kwargs = {
    "provider": "bedrock",
    "model": "global.anthropic.claude-sonnet-4-6",
    "include_score": True,
}

# Attach the context- and answer-level descriptors to the ten-row sample.
# NOTE(review): in the recorded output every "ContextRelevance scores" list holds
# exactly one value, equal to the aggregated "ContextRelevance" column, so the
# `contexts` cell appears to be scored as a single text and the "mean" aggregation
# has nothing to average. If per-chunk relevance is intended, the column probably
# needs to contain real lists — confirm against the descriptor's documentation.
context_based_evals = Dataset.from_pandas(
    full_synthetic_dataset,
    descriptors=[
        ContextQualityLLMEval(
            question="question",
            column_name="contexts",
            alias="ContextQuality",
            **_llm_judge_kwargs,
        ),
        ContextRelevance(
            input="question",
            contexts="contexts",
            output_scores=True,
            aggregation_method="mean",
            alias="ContextRelevance",
        ),
        FaithfulnessLLMEval(
            column_name="answer",
            context="contexts",
            alias="Faithfulness",
            **_llm_judge_kwargs,
        ),
        CompletenessLLMEval(
            column_name="answer",
            context="contexts",
            alias="Completeness",
            **_llm_judge_kwargs,
        ),
        CorrectnessLLMEval(
            column_name="answer",
            target_output="ground_truth",
            alias="Correctness",
            **_llm_judge_kwargs,
        ),
    ],
    # Disabled alternative kept from the original cell:
    # options=OllamaOptions(api_url=OLLAMA_LOCAL_URL)
)
context_based_evals.as_dataframe()