303 lines
12 KiB
Plaintext
303 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "efbabfec",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Libraries"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "e06aaf8c",
"metadata": {},
"outputs": [],
"source": [
"# Project configuration.\n",
"# FIX: ELASTICSEARCH_URL is used by fetch_documents_from_elasticsearch below\n",
"# but was never imported, raising NameError on a fresh kernel.\n",
"# NOTE(review): assumes src.config defines ELASTICSEARCH_URL — confirm.\n",
"from src.config import ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, OPENAI_API_KEY, OPENAI_MODEL, BASE_URL, MODEL_NAME"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "8488db9f",
"metadata": {},
"outputs": [],
"source": [
"# Third-party imports, consolidated in one cell.\n",
"# FIX: cleared the stale ModuleNotFoundError output left over from the old\n",
"# `ragas.testset.generator` import path; current ragas exposes\n",
"# TestsetGenerator from `ragas.testset`.\n",
"# FIX: added imports that later cells use but nothing previously provided:\n",
"# List, Elasticsearch, ChatOllama, OllamaEmbeddings, LangchainLLMWrapper.\n",
"from typing import List\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_community.document_loaders import DirectoryLoader\n",
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from openai import OpenAI\n",
"from ragas.llms import llm_factory, LangchainLLMWrapper\n",
"from ragas.testset import TestsetGenerator"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "816afb58",
"metadata": {},
"outputs": [],
"source": [
"class ElasticsearchEmbeddings(Embeddings):\n",
"    \"\"\"Fetch precomputed embeddings stored on Elasticsearch documents.\n",
"\n",
"    NOTE(review): this class is not used by the working pipeline below\n",
"    (which uses OllamaEmbeddings); kept as a reference implementation.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, es_client, index_name, id_field=\"_id\", embedding_field=\"embedding_vector\"):\n",
"        self.es = es_client                    # elasticsearch.Elasticsearch client\n",
"        self.index = index_name                # index holding the vectors\n",
"        self.id_field = id_field               # currently unused by lookups\n",
"        self.embedding_field = embedding_field # _source field with the vector\n",
"\n",
"    def _text_to_doc_id(self, text: str) -> str:\n",
"        # FIX: this helper was called by embed_query but never defined, so any\n",
"        # lookup raised AttributeError. Fail fast with an actionable message\n",
"        # until a real text -> ES document-id mapping is implemented.\n",
"        raise NotImplementedError(\n",
"            \"Implement a mapping from query text to an Elasticsearch doc id\"\n",
"        )\n",
"\n",
"    def embed_query(self, text: str):\n",
"        \"\"\"Return the precomputed vector for `text` via its mapped doc id.\"\"\"\n",
"        doc_id = self._text_to_doc_id(text)\n",
"        resp = self.es.get(index=self.index, id=doc_id)\n",
"        return resp[\"_source\"][self.embedding_field]\n",
"\n",
"    def embed_documents(self, texts):\n",
"        \"\"\"Vectorize a batch by looking each text up individually.\"\"\"\n",
"        return [self.embed_query(t) for t in texts]\n",
"\n",
"    # Optional: implement async variants if Ragas calls them"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "65a7c504",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this cell appears to be an earlier, superseded draft of the\n",
"# pipeline implemented below (ES fetch + TestsetGenerator). It rebinds\n",
"# `generator` and `df`, which later cells redefine, and it depends on\n",
"# ElasticsearchEmbeddings whose _text_to_doc_id helper is not implemented —\n",
"# consider deleting it to keep Restart-&-Run-All clean.\n",
"# 1) Load source files from a local repo (path must exist — TODO confirm)\n",
"loader = DirectoryLoader(\"./my_code_repo\", glob=\"**/*.py\", recursive=True) \n",
"docs = loader.load() \n",
" \n",
"# 2) Ollama LLM via OpenAI-compatible client \n",
"ollama_client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\") \n",
"generator_llm = llm_factory(\"codellama\", provider=\"openai\", client=ollama_client) \n",
" \n",
"# 3) Custom ES embeddings wrapper \n",
"from elasticsearch import Elasticsearch \n",
"es_client = Elasticsearch(\"http://localhost:9200\") \n",
"es_embeddings = ElasticsearchEmbeddings(es_client, index_name=\"my_docs_embeddings\") \n",
" \n",
"# 4) Ragas TestsetGenerator using the ES embeddings \n",
"generator = TestsetGenerator.from_langchain(generator_llm, es_embeddings) \n",
" \n",
"# 5) Generate testset \n",
"dataset = generator.generate_with_langchain_docs(docs, testset_size=20) \n",
" \n",
"# 6) Export \n",
"df = dataset.to_pandas() \n",
"print(df[[\"user_input\", \"reference\"]].head())"
]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "06f41852",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Functions"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "d24262b0",
"metadata": {},
"outputs": [],
"source": [
"# Cell-local imports. FIX: `List` and `Elasticsearch` were referenced below\n",
"# but never imported anywhere in the notebook, so defining the function\n",
"# raised NameError when the return annotation was evaluated.\n",
"from typing import List\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"\n",
"\n",
"def fetch_documents_from_elasticsearch(\n",
"    es_url: str,\n",
"    index_name: str,\n",
"    size: int = 100\n",
") -> List[Document]:\n",
"    \"\"\"\n",
"    Retrieve documents from Elasticsearch and convert to LangChain Document objects.\n",
"\n",
"    Args:\n",
"        es_url: Elasticsearch URL\n",
"        index_name: Index name to query\n",
"        size: Number of documents to retrieve\n",
"\n",
"    Returns:\n",
"        List of LangChain Document objects with metadata\n",
"    \"\"\"\n",
"    es = Elasticsearch(es_url)\n",
"\n",
"    # Query all documents from the index.\n",
"    # FIX: `body=` is deprecated in elasticsearch-py 8.x; pass query/size\n",
"    # as keyword arguments instead.\n",
"    response = es.search(\n",
"        index=index_name,\n",
"        query={\"match_all\": {}},\n",
"        size=size\n",
"    )\n",
"\n",
"    documents = []\n",
"    for hit in response[\"hits\"][\"hits\"]:\n",
"        source = hit.get(\"_source\", {})\n",
"\n",
"        # Extract text content (field name is 'text' based on the ES setup)\n",
"        page_content = source.get(\"text\", \"\")\n",
"\n",
"        # Skip documents with missing/empty text\n",
"        if not page_content:\n",
"            continue\n",
"\n",
"        # Extract metadata from the document\n",
"        metadata = {\n",
"            \"source\": source.get(\"source\", \"unknown\"),\n",
"            \"doc_id\": source.get(\"doc_id\", hit.get(\"_id\", \"\")),\n",
"            \"chunk_id\": source.get(\"chunk_id\", \"\"),\n",
"            \"title\": source.get(\"title\", \"Untitled\"),\n",
"            \"es_id\": hit.get(\"_id\")  # Store the ES document ID for reference\n",
"        }\n",
"\n",
"        # Create LangChain Document object\n",
"        doc = Document(page_content=page_content, metadata=metadata)\n",
"        documents.append(doc)\n",
"\n",
"    print(f\"✓ Retrieved {len(documents)} documents from Elasticsearch\")\n",
"    return documents\n",
"\n",
"\n",
"# Fetch documents from Elasticsearch.\n",
"# NOTE(review): ELASTICSEARCH_URL is not imported from src.config in the\n",
"# notebook's config cell — confirm it exists there and add it to that import.\n",
"documents = fetch_documents_from_elasticsearch(ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, size=100)\n",
"\n",
"if documents:\n",
"    print(f\"\\nFirst document sample:\")\n",
"    print(f\"  Content length: {len(documents[0].page_content)} chars\")\n",
"    print(f\"  Metadata: {documents[0].metadata}\")\n",
"else:\n",
"    print(\"⚠️ No documents found in Elasticsearch. Check index name and ES connection.\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "bf8797ac",
"metadata": {},
"outputs": [],
"source": [
"# Cell-local imports. FIX: LangchainLLMWrapper, ChatOllama and\n",
"# OllamaEmbeddings were used below but never imported in the current\n",
"# notebook source (the old import cell that provided them was replaced),\n",
"# so this cell raised NameError on a fresh kernel.\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from ragas.llms import LangchainLLMWrapper\n",
"\n",
"if not documents:\n",
"    print(\"❌ Cannot proceed: No documents to generate testset from.\")\n",
"    print(\"Ensure Elasticsearch is running and the index contains documents.\")\n",
"else:\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Initializing Ragas TestSetGenerator\")\n",
"    print(\"=\" * 60)\n",
"\n",
"    # LLM used by Ragas to synthesize questions/answers\n",
"    generator_llm = LangchainLLMWrapper(\n",
"        ChatOllama(base_url=BASE_URL, model=MODEL_NAME)\n",
"    )\n",
"\n",
"    # Embeddings for semantic similarity during generation.\n",
"    # NOTE(review): this reuses the chat model name as the embedding model —\n",
"    # confirm MODEL_NAME can serve embeddings, or use a dedicated model.\n",
"    embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n",
"\n",
"    # Create the TestsetGenerator\n",
"    generator = TestsetGenerator.from_langchain(\n",
"        llm=generator_llm,\n",
"        embedding_model=embeddings\n",
"    )\n",
"\n",
"    print(\"✓ TestsetGenerator initialized successfully\")\n",
"    print(f\"  - LLM Model: {MODEL_NAME}\")\n",
"    print(f\"  - Using {len(documents)} documents for generation\")\n",
"    print(\"\\n⏳ Generating synthetic test set (this may take a few minutes)...\")\n",
"\n",
"    # Generate up to 50 test cases or 2x the number of documents\n",
"    testset_size = min(50, len(documents) * 2)\n",
"\n",
"    testset = generator.generate_with_langchain_docs(\n",
"        documents=documents,\n",
"        testset_size=testset_size,\n",
"        raise_exceptions=False\n",
"    )\n",
"\n",
"    print(f\"✓ Test set generated with {len(testset)} examples\")\n",
"\n",
"    # Convert to DataFrame for easy viewing and exporting\n",
"    df = testset.to_pandas()\n",
"\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Test Set Summary\")\n",
"    print(\"=\" * 60)\n",
"    print(f\"Total test cases: {len(df)}\")\n",
"    print(f\"\\nColumns: {list(df.columns)}\")\n",
"    print(f\"\\nFirst few rows:\")\n",
"    print(df.head())"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "8dfd3472",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"# FIX: the generation cell above only defines `testset`/`df` when documents\n",
"# were found, so an unconditional reference here raised NameError on the\n",
"# empty-index path. Guard before exporting.\n",
"if \"testset\" not in globals() or \"df\" not in globals():\n",
"    print(\"❌ No testset to export — run the generation cell successfully first.\")\n",
"else:\n",
"    # Define output directory\n",
"    output_dir = Path(\"data/interim/synthetic_testsets\")\n",
"    output_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"    # Generate filename with timestamp\n",
"    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"    output_file = output_dir / f\"testset_{timestamp}.csv\"\n",
"\n",
"    # Save to CSV\n",
"    df.to_csv(output_file, index=False)\n",
"    print(f\"✓ Test set saved to: {output_file}\")\n",
"\n",
"    # Optional JSON export for better structure preservation.\n",
"    # NOTE(review): `save_as_json` exists only on older ragas versions;\n",
"    # current releases expose `to_jsonl` instead — support both.\n",
"    json_file = output_dir / f\"testset_{timestamp}.json\"\n",
"    if hasattr(testset, \"save_as_json\"):\n",
"        testset.save_as_json(str(json_file))\n",
"    else:\n",
"        testset.to_jsonl(str(json_file))\n",
"    print(f\"✓ Test set saved (JSON) to: {json_file}\")\n",
"\n",
"    # Display distribution statistics\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Test Set Distribution\")\n",
"    print(\"=\" * 60)\n",
"    if \"question_type\" in df.columns:\n",
"        print(\"\\nQuestion types:\")\n",
"        print(df[\"question_type\"].value_counts())\n",
"\n",
"    if \"retrieval_context\" in df.columns:\n",
"        avg_context_len = df[\"retrieval_context\"].apply(lambda x: len(x) if x else 0).mean()\n",
"        print(f\"\\nAverage context length: {avg_context_len:.0f} characters\")"
]
},
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "assistance-engine",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|