assistance-engine/scratches/pseco/synthetic_dataset/first_approach/n00 Ragas TestSet Generatio...

303 lines
12 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "efbabfec",
"metadata": {},
"source": [
"# Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e06aaf8c",
"metadata": {},
"outputs": [],
"source": [
"# Project configuration: Elasticsearch connection/index plus LLM endpoint settings.\n",
"# ELASTICSEARCH_URL added: it is required by fetch_documents_from_elasticsearch below\n",
"# (it was missing, causing a NameError on a fresh Restart & Run All).\n",
"from src.config import ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, OPENAI_API_KEY, OPENAI_MODEL, BASE_URL, MODEL_NAME"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8488db9f",
"metadata": {},
"outputs": [],
"source": [
"# All third-party imports in one place so the notebook survives Restart & Run All.\n",
"# A stale ModuleNotFoundError output (from the old 'ragas.testset.generator' import\n",
"# path, which no longer appears in this cell) was cleared; TestsetGenerator now\n",
"# lives in 'ragas.testset'.\n",
"from typing import List\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_core.documents import Document\n",
"from langchain_community.document_loaders import DirectoryLoader\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from openai import OpenAI\n",
"from ragas.testset import TestsetGenerator\n",
"from ragas.llms import llm_factory, LangchainLLMWrapper"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "816afb58",
"metadata": {},
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"\n",
"class ElasticsearchEmbeddings(Embeddings):\n",
"    \"\"\"Fetch precomputed embeddings from Elasticsearch instead of recomputing them.\n",
"\n",
"    Assumes each ES document stores its vector under ``embedding_field`` and is\n",
"    addressable by a doc id derived from the text (see ``_text_to_doc_id``).\n",
"    Raises elasticsearch.NotFoundError if no document exists for a given text.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, es_client, index_name, id_field=\"_id\", embedding_field=\"embedding_vector\"):\n",
"        self.es = es_client\n",
"        self.index = index_name\n",
"        self.id_field = id_field  # NOTE(review): currently unused; kept for future lookup-by-field support\n",
"        self.embedding_field = embedding_field\n",
"\n",
"    def _text_to_doc_id(self, text: str) -> str:\n",
"        # Default mapping: deterministic SHA-256 of the text. This was previously\n",
"        # referenced but never defined (AttributeError on every embed_query call).\n",
"        # It MUST match the id scheme used when the embeddings were indexed --\n",
"        # override in a subclass if your index uses a different scheme.\n",
"        return hashlib.sha256(text.encode(\"utf-8\")).hexdigest()\n",
"\n",
"    def embed_query(self, text: str):\n",
"        \"\"\"Look up the precomputed embedding for ``text`` by its derived doc id.\"\"\"\n",
"        doc_id = self._text_to_doc_id(text)\n",
"        resp = self.es.get(index=self.index, id=doc_id)\n",
"        return resp[\"_source\"][self.embedding_field]\n",
"\n",
"    def embed_documents(self, texts):\n",
"        \"\"\"Embed a batch of texts via repeated single-document lookups.\"\"\"\n",
"        return [self.embed_query(t) for t in texts]\n",
"\n",
"    # Optional: implement async variants (aembed_query/aembed_documents) if Ragas calls them"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65a7c504",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): scratch/demo pipeline using a local code repo + Ollama's\n",
"# OpenAI-compatible endpoint. It redefines `generator_llm`, `generator` and `df`,\n",
"# which the Elasticsearch-based cells below also define -- run EITHER this cell\n",
"# OR the \"Functions\" section, not both, or later cells may use the wrong objects.\n",
"# TODO: hardcoded paths/URLs (./my_code_repo, localhost:11434, localhost:9200)\n",
"# should come from src.config like the rest of the notebook.\n",
"loader = DirectoryLoader(\"./my_code_repo\", glob=\"**/*.py\", recursive=True) \n",
"docs = loader.load() \n",
" \n",
"# 2) Ollama LLM via OpenAI-compatible client \n",
"ollama_client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\") \n",
"# NOTE(review): llm_factory's signature varies across ragas releases -- confirm\n",
"# that provider=\"openai\" plus client= is accepted by the installed version.\n",
"generator_llm = llm_factory(\"codellama\", provider=\"openai\", client=ollama_client) \n",
" \n",
"# 3) Custom ES embeddings wrapper \n",
"from elasticsearch import Elasticsearch \n",
"es_client = Elasticsearch(\"http://localhost:9200\") \n",
"es_embeddings = ElasticsearchEmbeddings(es_client, index_name=\"my_docs_embeddings\") \n",
" \n",
"# 4) Ragas TestsetGenerator using the ES embeddings \n",
"generator = TestsetGenerator.from_langchain(generator_llm, es_embeddings) \n",
" \n",
"# 5) Generate testset \n",
"dataset = generator.generate_with_langchain_docs(docs, testset_size=20) \n",
" \n",
"# 6) Export \n",
"df = dataset.to_pandas() \n",
"print(df[[\"user_input\", \"reference\"]].head())"
]
},
{
"cell_type": "markdown",
"id": "06f41852",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d24262b0",
"metadata": {},
"outputs": [],
"source": [
"from elasticsearch import Elasticsearch  # explicit here so this cell runs without the scratch cell above\n",
"\n",
"\n",
"def fetch_documents_from_elasticsearch(\n",
"    es_url: str,\n",
"    index_name: str,\n",
"    size: int = 100\n",
") -> list[Document]:\n",
"    \"\"\"\n",
"    Retrieve documents from Elasticsearch and convert to LangChain Document objects.\n",
"\n",
"    Args:\n",
"        es_url: Elasticsearch URL\n",
"        index_name: Index name to query\n",
"        size: Maximum number of documents to retrieve\n",
"\n",
"    Returns:\n",
"        List of LangChain Document objects with metadata\n",
"    \"\"\"\n",
"    # Builtin list[...] generics (py3.9+) avoid the typing.List import that was\n",
"    # missing here and made the def itself raise NameError.\n",
"    es = Elasticsearch(es_url)\n",
"\n",
"    # Pull up to `size` documents. Top-level query/size kwargs replace the\n",
"    # deprecated body= parameter of elasticsearch-py 8.x.\n",
"    response = es.search(\n",
"        index=index_name,\n",
"        query={\"match_all\": {}},\n",
"        size=size\n",
"    )\n",
"\n",
"    documents = []\n",
"    for hit in response[\"hits\"][\"hits\"]:\n",
"        source = hit.get(\"_source\", {})\n",
"\n",
"        # Text content lives in the 'text' field of this index's mapping.\n",
"        page_content = source.get(\"text\", \"\")\n",
"\n",
"        # Skip empty chunks: they carry no signal for testset generation.\n",
"        if not page_content:\n",
"            continue\n",
"\n",
"        metadata = {\n",
"            \"source\": source.get(\"source\", \"unknown\"),\n",
"            \"doc_id\": source.get(\"doc_id\", hit.get(\"_id\", \"\")),\n",
"            \"chunk_id\": source.get(\"chunk_id\", \"\"),\n",
"            \"title\": source.get(\"title\", \"Untitled\"),\n",
"            \"es_id\": hit.get(\"_id\")  # keep the raw ES document ID for traceability\n",
"        }\n",
"\n",
"        documents.append(Document(page_content=page_content, metadata=metadata))\n",
"\n",
"    print(f\"✓ Retrieved {len(documents)} documents from Elasticsearch\")\n",
"    return documents\n",
"\n",
"\n",
"# Fetch documents from Elasticsearch\n",
"documents = fetch_documents_from_elasticsearch(ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, size=100)\n",
"\n",
"if documents:\n",
"    print(f\"\\nFirst document sample:\")\n",
"    print(f\"  Content length: {len(documents[0].page_content)} chars\")\n",
"    print(f\"  Metadata: {documents[0].metadata}\")\n",
"else:\n",
"    print(\"⚠️ No documents found in Elasticsearch. Check index name and ES connection.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf8797ac",
"metadata": {},
"outputs": [],
"source": [
"# These names were referenced but imported nowhere in the current notebook\n",
"# (they existed in a since-deleted imports cell); without them this cell\n",
"# raises NameError on a fresh kernel.\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from ragas.llms import LangchainLLMWrapper\n",
"\n",
"if not documents:\n",
"    print(\"❌ Cannot proceed: No documents to generate testset from.\")\n",
"    print(\"Ensure Elasticsearch is running and the index contains documents.\")\n",
"else:\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Initializing Ragas TestSetGenerator\")\n",
"    print(\"=\" * 60)\n",
"\n",
"    # LLM used by Ragas to synthesize questions/answers\n",
"    generator_llm = LangchainLLMWrapper(\n",
"        ChatOllama(base_url=BASE_URL, model=MODEL_NAME)\n",
"    )\n",
"\n",
"    # Embeddings for semantic similarity during generation.\n",
"    # NOTE(review): this reuses the chat MODEL_NAME as the embedding model --\n",
"    # confirm that model actually serves embeddings, or use a dedicated one.\n",
"    embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n",
"\n",
"    # Create the TestsetGenerator\n",
"    generator = TestsetGenerator.from_langchain(\n",
"        llm=generator_llm,\n",
"        embedding_model=embeddings\n",
"    )\n",
"\n",
"    print(\"✓ TestsetGenerator initialized successfully\")\n",
"    print(f\"  - LLM Model: {MODEL_NAME}\")\n",
"    print(f\"  - Using {len(documents)} documents for generation\")\n",
"    print(\"\\n⏳ Generating synthetic test set (this may take a few minutes)...\")\n",
"\n",
"    # Cap the testset by both a hard budget (50) and corpus size (2 per doc)\n",
"    testset_size = min(50, len(documents) * 2)\n",
"\n",
"    # raise_exceptions=False: keep partial results if a few generations fail\n",
"    testset = generator.generate_with_langchain_docs(\n",
"        documents=documents,\n",
"        testset_size=testset_size,\n",
"        raise_exceptions=False\n",
"    )\n",
"\n",
"    print(f\"✓ Test set generated with {len(testset)} examples\")\n",
"\n",
"    # DataFrame view for inspection and the export cell below\n",
"    df = testset.to_pandas()\n",
"\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Test Set Summary\")\n",
"    print(\"=\" * 60)\n",
"    print(f\"Total test cases: {len(df)}\")\n",
"    print(f\"\\nColumns: {list(df.columns)}\")\n",
"    print(f\"\\nFirst few rows:\")\n",
"    print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8dfd3472",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"# Persist the test set under a timestamped name so repeated runs never clobber each other.\n",
"output_dir = Path(\"data/interim/synthetic_testsets\")\n",
"output_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"output_file = output_dir / f\"testset_{timestamp}.csv\"\n",
"\n",
"# Save to CSV\n",
"df.to_csv(output_file, index=False)\n",
"print(f\"✓ Test set saved to: {output_file}\")\n",
"\n",
"# JSON export preserves nested fields (e.g. contexts) that CSV flattens.\n",
"# Ragas renamed Testset.save_as_json -> to_jsonl in newer releases; support both.\n",
"json_file = output_dir / f\"testset_{timestamp}.json\"\n",
"if hasattr(testset, \"to_jsonl\"):\n",
"    testset.to_jsonl(str(json_file))\n",
"else:\n",
"    testset.save_as_json(str(json_file))\n",
"print(f\"✓ Test set saved (JSON) to: {json_file}\")\n",
"\n",
"# Display statistics\n",
"print(\"\\n\" + \"=\" * 60)\n",
"print(\"Test Set Distribution\")\n",
"print(\"=\" * 60)\n",
"# Column names differ across ragas versions: old 'question_type', new 'synthesizer_name'.\n",
"type_col = next((c for c in (\"question_type\", \"synthesizer_name\") if c in df.columns), None)\n",
"if type_col:\n",
"    print(\"\\nQuestion types:\")\n",
"    print(df[type_col].value_counts())\n",
"\n",
"if \"retrieval_context\" in df.columns:\n",
"    avg_context_len = df[\"retrieval_context\"].apply(lambda x: len(x) if x else 0).mean()\n",
"    print(f\"\\nAverage context length: {avg_context_len:.0f} characters\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}