303 lines
12 KiB
Plaintext
303 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "efbabfec",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Libraries"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "e06aaf8c",
"metadata": {},
"outputs": [],
"source": [
"# Project configuration.\n",
"# FIX: ELASTICSEARCH_URL is used by fetch_documents_from_elasticsearch below\n",
"# but was never imported, raising NameError on a fresh kernel.\n",
"# NOTE(review): assumes src.config defines ELASTICSEARCH_URL — confirm.\n",
"from src.config import ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, OPENAI_API_KEY, OPENAI_MODEL, BASE_URL, MODEL_NAME"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "8488db9f",
"metadata": {},
"outputs": [],
"source": [
"# Third-party imports, consolidated in one cell.\n",
"# FIX: cleared the stale ModuleNotFoundError output left over from the old\n",
"# `ragas.testset.generator` import path; current ragas exposes\n",
"# TestsetGenerator from `ragas.testset`.\n",
"# FIX: added imports that later cells use but nothing previously provided:\n",
"# List, Elasticsearch, ChatOllama, OllamaEmbeddings, LangchainLLMWrapper.\n",
"from typing import List\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_community.document_loaders import DirectoryLoader\n",
"from langchain_core.documents import Document\n",
"from langchain_core.embeddings import Embeddings\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from openai import OpenAI\n",
"from ragas.llms import llm_factory, LangchainLLMWrapper\n",
"from ragas.testset import TestsetGenerator"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "816afb58",
"metadata": {},
"outputs": [],
"source": [
"class ElasticsearchEmbeddings(Embeddings):\n",
"    \"\"\"Fetch precomputed embeddings stored on Elasticsearch documents.\n",
"\n",
"    NOTE(review): this class is not used by the working pipeline below\n",
"    (which uses OllamaEmbeddings); kept as a reference implementation.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, es_client, index_name, id_field=\"_id\", embedding_field=\"embedding_vector\"):\n",
"        self.es = es_client                    # elasticsearch.Elasticsearch client\n",
"        self.index = index_name                # index holding the vectors\n",
"        self.id_field = id_field               # currently unused by lookups\n",
"        self.embedding_field = embedding_field # _source field with the vector\n",
"\n",
"    def _text_to_doc_id(self, text: str) -> str:\n",
"        # FIX: this helper was called by embed_query but never defined, so any\n",
"        # lookup raised AttributeError. Fail fast with an actionable message\n",
"        # until a real text -> ES document-id mapping is implemented.\n",
"        raise NotImplementedError(\n",
"            \"Implement a mapping from query text to an Elasticsearch doc id\"\n",
"        )\n",
"\n",
"    def embed_query(self, text: str):\n",
"        \"\"\"Return the precomputed vector for `text` via its mapped doc id.\"\"\"\n",
"        doc_id = self._text_to_doc_id(text)\n",
"        resp = self.es.get(index=self.index, id=doc_id)\n",
"        return resp[\"_source\"][self.embedding_field]\n",
"\n",
"    def embed_documents(self, texts):\n",
"        \"\"\"Vectorize a batch by looking each text up individually.\"\"\"\n",
"        return [self.embed_query(t) for t in texts]\n",
"\n",
"    # Optional: implement async variants if Ragas calls them"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "65a7c504",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this cell appears to be an earlier, superseded draft of the\n",
"# pipeline implemented below (ES fetch + TestsetGenerator). It rebinds\n",
"# `generator` and `df`, which later cells redefine, and it depends on\n",
"# ElasticsearchEmbeddings whose _text_to_doc_id helper is not implemented —\n",
"# consider deleting it to keep Restart-&-Run-All clean.\n",
"# 1) Load source files from a local repo (path must exist — TODO confirm)\n",
"loader = DirectoryLoader(\"./my_code_repo\", glob=\"**/*.py\", recursive=True) \n",
"docs = loader.load() \n",
" \n",
"# 2) Ollama LLM via OpenAI-compatible client \n",
"ollama_client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"ollama\") \n",
"generator_llm = llm_factory(\"codellama\", provider=\"openai\", client=ollama_client) \n",
" \n",
"# 3) Custom ES embeddings wrapper \n",
"from elasticsearch import Elasticsearch \n",
"es_client = Elasticsearch(\"http://localhost:9200\") \n",
"es_embeddings = ElasticsearchEmbeddings(es_client, index_name=\"my_docs_embeddings\") \n",
" \n",
"# 4) Ragas TestsetGenerator using the ES embeddings \n",
"generator = TestsetGenerator.from_langchain(generator_llm, es_embeddings) \n",
" \n",
"# 5) Generate testset \n",
"dataset = generator.generate_with_langchain_docs(docs, testset_size=20) \n",
" \n",
"# 6) Export \n",
"df = dataset.to_pandas() \n",
"print(df[[\"user_input\", \"reference\"]].head())"
]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "06f41852",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Functions"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "d24262b0",
"metadata": {},
"outputs": [],
"source": [
"# Cell-local imports. FIX: `List` and `Elasticsearch` were referenced below\n",
"# but never imported anywhere in the notebook, so defining the function\n",
"# raised NameError when the return annotation was evaluated.\n",
"from typing import List\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"\n",
"\n",
"def fetch_documents_from_elasticsearch(\n",
"    es_url: str,\n",
"    index_name: str,\n",
"    size: int = 100\n",
") -> List[Document]:\n",
"    \"\"\"\n",
"    Retrieve documents from Elasticsearch and convert to LangChain Document objects.\n",
"\n",
"    Args:\n",
"        es_url: Elasticsearch URL\n",
"        index_name: Index name to query\n",
"        size: Number of documents to retrieve\n",
"\n",
"    Returns:\n",
"        List of LangChain Document objects with metadata\n",
"    \"\"\"\n",
"    es = Elasticsearch(es_url)\n",
"\n",
"    # Query all documents from the index.\n",
"    # FIX: `body=` is deprecated in elasticsearch-py 8.x; pass query/size\n",
"    # as keyword arguments instead.\n",
"    response = es.search(\n",
"        index=index_name,\n",
"        query={\"match_all\": {}},\n",
"        size=size\n",
"    )\n",
"\n",
"    documents = []\n",
"    for hit in response[\"hits\"][\"hits\"]:\n",
"        source = hit.get(\"_source\", {})\n",
"\n",
"        # Extract text content (field name is 'text' based on the ES setup)\n",
"        page_content = source.get(\"text\", \"\")\n",
"\n",
"        # Skip documents with missing/empty text\n",
"        if not page_content:\n",
"            continue\n",
"\n",
"        # Extract metadata from the document\n",
"        metadata = {\n",
"            \"source\": source.get(\"source\", \"unknown\"),\n",
"            \"doc_id\": source.get(\"doc_id\", hit.get(\"_id\", \"\")),\n",
"            \"chunk_id\": source.get(\"chunk_id\", \"\"),\n",
"            \"title\": source.get(\"title\", \"Untitled\"),\n",
"            \"es_id\": hit.get(\"_id\")  # Store the ES document ID for reference\n",
"        }\n",
"\n",
"        # Create LangChain Document object\n",
"        doc = Document(page_content=page_content, metadata=metadata)\n",
"        documents.append(doc)\n",
"\n",
"    print(f\"✓ Retrieved {len(documents)} documents from Elasticsearch\")\n",
"    return documents\n",
"\n",
"\n",
"# Fetch documents from Elasticsearch.\n",
"# NOTE(review): ELASTICSEARCH_URL is not imported from src.config in the\n",
"# notebook's config cell — confirm it exists there and add it to that import.\n",
"documents = fetch_documents_from_elasticsearch(ELASTICSEARCH_URL, ELASTICSEARCH_INDEX, size=100)\n",
"\n",
"if documents:\n",
"    print(f\"\\nFirst document sample:\")\n",
"    print(f\"  Content length: {len(documents[0].page_content)} chars\")\n",
"    print(f\"  Metadata: {documents[0].metadata}\")\n",
"else:\n",
"    print(\"⚠️ No documents found in Elasticsearch. Check index name and ES connection.\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "bf8797ac",
"metadata": {},
"outputs": [],
"source": [
"# Cell-local imports. FIX: LangchainLLMWrapper, ChatOllama and\n",
"# OllamaEmbeddings were used below but never imported in the current\n",
"# notebook source (the old import cell that provided them was replaced),\n",
"# so this cell raised NameError on a fresh kernel.\n",
"from langchain_ollama import ChatOllama, OllamaEmbeddings\n",
"from ragas.llms import LangchainLLMWrapper\n",
"\n",
"if not documents:\n",
"    print(\"❌ Cannot proceed: No documents to generate testset from.\")\n",
"    print(\"Ensure Elasticsearch is running and the index contains documents.\")\n",
"else:\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Initializing Ragas TestSetGenerator\")\n",
"    print(\"=\" * 60)\n",
"\n",
"    # LLM used by Ragas to synthesize questions/answers\n",
"    generator_llm = LangchainLLMWrapper(\n",
"        ChatOllama(base_url=BASE_URL, model=MODEL_NAME)\n",
"    )\n",
"\n",
"    # Embeddings for semantic similarity during generation.\n",
"    # NOTE(review): this reuses the chat model name as the embedding model —\n",
"    # confirm MODEL_NAME can serve embeddings, or use a dedicated model.\n",
"    embeddings = OllamaEmbeddings(base_url=BASE_URL, model=MODEL_NAME)\n",
"\n",
"    # Create the TestsetGenerator\n",
"    generator = TestsetGenerator.from_langchain(\n",
"        llm=generator_llm,\n",
"        embedding_model=embeddings\n",
"    )\n",
"\n",
"    print(\"✓ TestsetGenerator initialized successfully\")\n",
"    print(f\"  - LLM Model: {MODEL_NAME}\")\n",
"    print(f\"  - Using {len(documents)} documents for generation\")\n",
"    print(\"\\n⏳ Generating synthetic test set (this may take a few minutes)...\")\n",
"\n",
"    # Generate up to 50 test cases or 2x the number of documents\n",
"    testset_size = min(50, len(documents) * 2)\n",
"\n",
"    testset = generator.generate_with_langchain_docs(\n",
"        documents=documents,\n",
"        testset_size=testset_size,\n",
"        raise_exceptions=False\n",
"    )\n",
"\n",
"    print(f\"✓ Test set generated with {len(testset)} examples\")\n",
"\n",
"    # Convert to DataFrame for easy viewing and exporting\n",
"    df = testset.to_pandas()\n",
"\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Test Set Summary\")\n",
"    print(\"=\" * 60)\n",
"    print(f\"Total test cases: {len(df)}\")\n",
"    print(f\"\\nColumns: {list(df.columns)}\")\n",
"    print(f\"\\nFirst few rows:\")\n",
"    print(df.head())"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "8dfd3472",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"# FIX: the generation cell above only defines `testset`/`df` when documents\n",
"# were found, so an unconditional reference here raised NameError on the\n",
"# empty-index path. Guard before exporting.\n",
"if \"testset\" not in globals() or \"df\" not in globals():\n",
"    print(\"❌ No testset to export — run the generation cell successfully first.\")\n",
"else:\n",
"    # Define output directory\n",
"    output_dir = Path(\"data/interim/synthetic_testsets\")\n",
"    output_dir.mkdir(parents=True, exist_ok=True)\n",
"\n",
"    # Generate filename with timestamp\n",
"    timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"    output_file = output_dir / f\"testset_{timestamp}.csv\"\n",
"\n",
"    # Save to CSV\n",
"    df.to_csv(output_file, index=False)\n",
"    print(f\"✓ Test set saved to: {output_file}\")\n",
"\n",
"    # Optional JSON export for better structure preservation.\n",
"    # NOTE(review): `save_as_json` exists only on older ragas versions;\n",
"    # current releases expose `to_jsonl` instead — support both.\n",
"    json_file = output_dir / f\"testset_{timestamp}.json\"\n",
"    if hasattr(testset, \"save_as_json\"):\n",
"        testset.save_as_json(str(json_file))\n",
"    else:\n",
"        testset.to_jsonl(str(json_file))\n",
"    print(f\"✓ Test set saved (JSON) to: {json_file}\")\n",
"\n",
"    # Display distribution statistics\n",
"    print(\"\\n\" + \"=\" * 60)\n",
"    print(\"Test Set Distribution\")\n",
"    print(\"=\" * 60)\n",
"    if \"question_type\" in df.columns:\n",
"        print(\"\\nQuestion types:\")\n",
"        print(df[\"question_type\"].value_counts())\n",
"\n",
"    if \"retrieval_context\" in df.columns:\n",
"        avg_context_len = df[\"retrieval_context\"].apply(lambda x: len(x) if x else 0).mean()\n",
"        print(f\"\\nAverage context length: {avg_context_len:.0f} characters\")"
]
},
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "assistance-engine",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|