{ "cells": [ { "cell_type": "markdown", "id": "efbabfec", "metadata": {}, "source": [ "# Libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "e06aaf8c", "metadata": {}, "outputs": [], "source": [ "from src.config import ELASTICSEARCH_INDEX, OPENAI_API_KEY, OPENAI_MODEL, BASE_URL, MODEL_NAME" ] }, { "cell_type": "code", "execution_count": null, "id": "8488db9f", "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'ragas.testset.generator'", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_core\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdocuments\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Document\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_ollama\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m ChatOllama, OllamaEmbeddings\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mragas\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtestset\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mgenerator\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m TestsetGenerator\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mragas\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mllms\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m LangchainLLMWrapper\n", "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'ragas.testset.generator'" ] } ], "source": [ "from langchain_core.embeddings import Embeddings \n", "from ragas.testset import TestsetGenerator \n", "from langchain_community.document_loaders import DirectoryLoader \n", "from openai import OpenAI \n", "from ragas.llms import llm_factory \n", "from langchain_core.documents import Document" ] }, { "cell_type": "code", "execution_count": null, "id": "816afb58", "metadata": {}, "outputs": [], "source": [ "class ElasticsearchEmbeddings(Embeddings): \n", " \"\"\"Fetch precomputed embeddings from Elasticsearch.\"\"\" \n", " def __init__(self, es_client, index_name, id_field=\"_id\", embedding_field=\"embedding_vector\"): \n", " self.es = es_client \n", " self.index = index_name \n", " self.id_field = id_field \n", " self.embedding_field = embedding_field \n", " \n", " def embed_query(self, text: str): \n", " # Example: hash text to a doc ID or perform a text lookup in ES \n", " doc_id = self._text_to_doc_id(text) # implement your mapping \n", " resp = self.es.get(index=self.index, id=doc_id) \n", " return resp[\"_source\"][self.embedding_field] \n", " \n", " def embed_documents(self, texts): \n", " return [self.embed_query(t) for t in texts] \n", " \n", " # Optional: implement async variants if Ragas calls them" ] }, { "cell_type": "code", "execution_count": null, "id": "65a7c504", "metadata": {}, "outputs": [], "source": [ "loader = DirectoryLoader(\"./my_code_repo\", glob=\"**/*.py\", recursive=True) \n", "docs = loader.load() \n", " \n", "# 2) Ollama LLM via OpenAI-compatible client \n", "ollama_client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\") \n", "generator_llm = llm_factory(\"codellama\", provider=\"openai\", client=ollama_client) \n", " \n", "# 3) Custom ES embeddings wrapper \n", "from elasticsearch import Elasticsearch \n", "es_client = Elasticsearch(\"http://localhost:9200\") \n", "es_embeddings = ElasticsearchEmbeddings(es_client, index_name=\"my_docs_embeddings\") \n", " \n", "# 4) Ragas TestsetGenerator using the ES embeddings \n", "generator = TestsetGenerator.from_langchain(generator_llm, es_embeddings) \n", " \n", "# 5) Generate testset \n", "dataset = generator.generate_with_langchain_docs(docs, testset_size=20) \n", " \n", "# 6) Export \n", "df = dataset.to_pandas() \n", "print(df[[\"user_input\", \"reference\"]].head())" ] }, { "cell_type": "markdown", "id": "06f41852", "metadata": {}, "source": [ "# Functions" ] }, { "cell_type": "code", "execution_count": null, "id": "d24262b0", "metadata": {}, "outputs": [], "source": [ "def fetch_documents_from_elasticsearch(\n", " es_url: str, \n", " index_name: str, \n", " size: int = 100\n", ") -> List[Document]:\n", " \"\"\"\n", " Retrieve documents from Elasticsearch and convert to LangChain Document objects.\n", " \n", " Args:\n", " es_url: Elasticsearch URL\n", " index_name: Index name to query\n", " size: Number of documents to retrieve\n", " \n", " Returns:\n", " List of LangChain Document objects with metadata\n", " \"\"\"\n", " es = Elasticsearch(es_url)\n", " \n", " # Query all documents from the index\n", " response = es.search(\n", " index=index_name,\n", " body={\"query\": {\"match_all\": {}}, \"size\": size}\n", " )\n", " \n", " documents = []\n", " for hit in response[\"hits\"][\"hits\"]:\n", " source = hit.get(\"_source\", {})\n", " \n", " # Extract text content (field name is 'text' based on your ES setup)\n", " page_content = source.get(\"text\", \"\")\n", " \n", " if not page_content:\n", " continue\n", " \n", " # Extract metadata from the document\n", " metadata = {\n", " \"source\": source.get(\"source\", \"unknown\"),\n", " \"doc_id\": source.get(\"doc_id\", hit.get(\"_id\", \"\")),\n", " \"chunk_id\": source.get(\"chunk_id\", \"\"),\n", " \"title\": source.get(\"title\", \"Untitled\"),\n", " \"es_id\": hit.get(\"_id\") # Store the ES document ID for reference\n", " }\n", " \n", " # Create LangChain Document object\n", " doc = Document(page_content=page_content, metadata=metadata)\n", " documents.append(doc)\n", " \n", " print(f\"✓ Retrieved {len(documents)} documents from Elasticsearch\")\n", " return documents\n", "\n", "\n", "# Fetch documents from Elasticsearch\n", "documents = fetch_documents_from_elasticsearch(ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, size=100)\n", "\n", "if documents:\n", " print(f\"\\nFirst document sample:\")\n", " print(f\" Content length: {len(documents[0].page_content)} chars\")\n", " print(f\" Metadata: {documents[0].metadata}\")\n", "else:\n", " print(\"⚠️ No documents found in Elasticsearch. Check index name and ES connection.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "bf8797ac", "metadata": {}, "outputs": [], "source": [ "if not documents:\n", " print(\"❌ Cannot proceed: No documents to generate testset from.\")\n", " print(\"Ensure Elasticsearch is running and the index contains documents.\")\n", "else:\n", " print(\"\\n\" + \"=\" * 60)\n", " print(\"Initializing Ragas TestSetGenerator\")\n", " print(\"=\" * 60)\n", " \n", " # Initialize LLMs for test generation\n", " generator_llm = LangchainLLMWrapper(\n", " ChatOllama(base_url=BASE_URL, model=MODEL_NAME)\n", " )\n", " \n", " # Initialize embeddings for semantic similarity evaluation\n", " embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n", " \n", " # Create the TestsetGenerator\n", " generator = TestsetGenerator.from_langchain(\n", " llm=generator_llm,\n", " embedding_model=embeddings\n", " )\n", " \n", " print(\"✓ TestsetGenerator initialized successfully\")\n", " print(f\" - LLM Model: {MODEL_NAME}\")\n", " print(f\" - Using {len(documents)} documents for generation\")\n", " print(\"\\n⏳ Generating synthetic test set (this may take a few minutes)...\")\n", " \n", " # Generate synthetic test set\n", " # Adjust testset_size based on your needs and token budget\n", " testset_size = min(50, len(documents) * 2) # Generate up to 50 test cases or 2x the docs\n", " \n", " testset = generator.generate_with_langchain_docs(\n", " documents=documents,\n", " testset_size=testset_size,\n", " raise_exceptions=False\n", " )\n", " \n", " print(f\"✓ Test set generated with {len(testset)} examples\")\n", " \n", " # Convert to DataFrame for easy viewing and exporting\n", " df = testset.to_pandas()\n", " \n", " print(\"\\n\" + \"=\" * 60)\n", " print(\"Test Set Summary\")\n", " print(\"=\" * 60)\n", " print(f\"Total test cases: {len(df)}\")\n", " print(f\"\\nColumns: {list(df.columns)}\")\n", " print(f\"\\nFirst few rows:\")\n", " print(df.head())" ] }, { "cell_type": "code", "execution_count": null, "id": "8dfd3472", "metadata": {}, "outputs": [], "source": [ "from datetime import datetime\n", "from pathlib import Path\n", "\n", "# Define output directory\n", "output_dir = Path(\"data/interim/synthetic_testsets\")\n", "output_dir.mkdir(parents=True, exist_ok=True)\n", "\n", "# Generate filename with timestamp\n", "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", "output_file = output_dir / f\"testset_{timestamp}.csv\"\n", "\n", "# Save to CSV\n", "df.to_csv(output_file, index=False)\n", "print(f\"✓ Test set saved to: {output_file}\")\n", "\n", "# Optional: Save as JSON for better structure preservation\n", "json_file = output_dir / f\"testset_{timestamp}.json\"\n", "testset.save_as_json(str(json_file))\n", "print(f\"✓ Test set saved (JSON) to: {json_file}\")\n", "\n", "# Display statistics\n", "print(\"\\n\" + \"=\" * 60)\n", "print(\"Test Set Distribution\")\n", "print(\"=\" * 60)\n", "if \"question_type\" in df.columns:\n", " print(\"\\nQuestion types:\")\n", " print(df[\"question_type\"].value_counts())\n", "\n", "if \"retrieval_context\" in df.columns:\n", " avg_context_len = df[\"retrieval_context\"].apply(lambda x: len(x) if x else 0).mean()\n", " print(f\"\\nAverage context length: {avg_context_len:.0f} characters\")" ] } ], "metadata": { "kernelspec": { "display_name": "assistance-engine", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }