199 lines
5.1 KiB
Plaintext
199 lines
5.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d520f6c3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"from datasets import load_dataset\n",
|
|
"\n",
|
|
"import boto3\n",
|
|
"from botocore.config import Config\n",
|
|
"from langchain_core.messages import SystemMessage, HumanMessage\n",
|
|
"\n",
|
|
"from src.utils.llm_factory import create_chat_model\n",
|
|
"from src.config import settings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e08b9060",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Create LLM instance"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "81111a86",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"config = Config(\n",
|
|
" region_name=\"us-east-1\",\n",
|
|
" connect_timeout=10, \n",
|
|
" read_timeout=600, \n",
|
|
")\n",
|
|
"\n",
|
|
"client = boto3.client(\"bedrock-runtime\", config=config)\n",
|
|
"\n",
|
|
"llm = create_chat_model(\n",
|
|
" provider=\"bedrock\",\n",
|
|
" client=client,\n",
|
|
" model=\"global.anthropic.claude-sonnet-4-6\",\n",
|
|
" temperature=0,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "045f8e81",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Load AVAP data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "07dea3fe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(settings.proj_root / \"docs/LRM/avap.md\", \"r\") as f:\n",
|
|
" avap_docs = f.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "adbbe8b6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Loaded 33 AVAP samples\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"samples_dir = settings.proj_root / \"docs/samples\"\n",
|
|
"avap_samples = []\n",
|
|
"\n",
|
|
"for avap_file in sorted(samples_dir.glob(\"*.avap\")):\n",
|
|
" with open(avap_file, \"r\") as f:\n",
|
|
" code = f.read()\n",
|
|
" \n",
|
|
" avap_samples.append({\n",
|
|
" \"file\": avap_file.name,\n",
|
|
" \"code\": code\n",
|
|
" })\n",
|
|
"\n",
|
|
"# Serialize the samples to a JSON string\n",
|
|
"avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)\n",
|
|
"print(f\"Loaded {len(avap_samples)} AVAP samples\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7a15e09a",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Prompt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "895a170f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"GOLDEN_DATASET_PROMPT = SystemMessage(\n",
|
|
" content=f\"\"\"\n",
|
|
" You are an AI agent responsible for generating a golden dataset of queries for AVAP code retrieval and understanding.\n",
|
|
"\n",
|
|
" You will receive a JSON array of AVAP code samples, each with a 'file' name and 'code' content.\n",
|
|
"\n",
|
|
" Your task is to:\n",
|
|
" 1. Analyze each AVAP code sample.\n",
|
|
" 2. Generate 2-3 natural language queries that can be answered by examining that specific code.\n",
|
|
" 3. Output a JSON array where each element has:\n",
|
|
" - \"query\": A natural language question about AVAP code implementation, best practices, or specific constructs.\n",
|
|
" - \"context\": The filename of the code sample that provides the context/answer for this query.\n",
|
|
"\n",
|
|
" Requirements:\n",
|
|
" - Queries should be diverse: ask about functions, control flow, API operations, error handling, etc.\n",
|
|
" - Queries must be answerable using ONLY the provided code samples.\n",
|
|
" - Queries should be framed as natural developer questions (e.g., \"How do you handle errors in AVAP?\" or \"Show me an example of looping over a list\").\n",
|
|
" - Use natural English (or Spanish if context is Spanish-language code).\n",
|
|
" - Do not reference exact variable names unless necessary; focus on the patterns and constructs used.\n",
|
|
" - Output MUST be valid JSON array format.\n",
|
|
"\n",
|
|
" AVAP Code Samples:\n",
|
|
" {avap_samples_json}\n",
|
|
"\n",
|
|
" Output format (JSON array):\n",
|
|
" [\n",
|
|
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
|
|
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
|
|
" ...\n",
|
|
" ]\n",
|
|
" \"\"\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a3123199",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "98c4f93c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "723352ee",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "assistance-engine",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.13"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|