assistance-engine/scratches/acano/generate_synthethic_data.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "b657efd2",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from datasets import load_dataset\n",
"\n",
"import boto3\n",
"from botocore.config import Config\n",
"from langchain_core.messages import SystemMessage, HumanMessage\n",
"\n",
"from src.utils.llm_factory import create_chat_model\n",
"from src.config import RAW_DIR, INTERIM_DIR, EXTERNAL_DIR"
]
},
{
"cell_type": "markdown",
"id": "e6e90339",
"metadata": {},
"source": [
"### Create LLM instance"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20eecc53",
"metadata": {},
"outputs": [],
"source": [
"config = Config(\n",
" region_name=\"us-east-1\",\n",
" connect_timeout=10, \n",
" read_timeout=600, \n",
")\n",
"\n",
"client = boto3.client(\"bedrock-runtime\", config=config)\n",
"\n",
"llm = create_chat_model(\n",
" provider=\"bedrock\",\n",
" client=client,\n",
" model=\"global.anthropic.claude-sonnet-4-6\",\n",
" temperature=0,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "96f12a22",
"metadata": {},
"source": [
"### Load MBPP data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78e29dc2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],\n",
" num_rows: 374\n",
" })\n",
" test: Dataset({\n",
" features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],\n",
" num_rows: 500\n",
" })\n",
" validation: Dataset({\n",
" features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],\n",
" num_rows: 90\n",
" })\n",
" prompt: Dataset({\n",
" features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],\n",
" num_rows: 10\n",
" })\n",
"})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_full = load_dataset(\"mbpp\")\n",
"dataset_full"
]
},
{
"cell_type": "markdown",
"id": "3e7544bb",
"metadata": {},
"source": [
"### Load AVAP data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e88b2d3",
"metadata": {},
"outputs": [],
"source": [
"with open(RAW_DIR / \"combined.txt\", \"r\") as f:\n",
" avap_docs = f.read()"
]
},
{
"cell_type": "markdown",
"id": "c039d79f",
"metadata": {},
"source": [
"### Randomly sample MBPP"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b186af30",
"metadata": {},
"outputs": [],
"source": [
"# Sample 50 random examples from the test split\n",
"random_test_samples = dataset_full[\"test\"].shuffle(seed=42).select(range(50))\n",
"\n",
"# Save text and code in a dictionary\n",
"test_samples_dict = {\n",
" str(i): {\n",
" \"text\": sample[\"text\"],\n",
" \"code\": sample[\"code\"],\n",
" }\n",
" for i, sample in enumerate(random_test_samples)\n",
"}"
]
},
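{
"cell_type": "markdown",
"id": "f3a9d1c2",
"metadata": {},
"source": [
"The `shuffle(seed=42).select(range(50))` pattern is deterministic: the same seed always produces the same 50 examples, so the sample is reproducible across runs. A minimal stdlib analogue of seeded sampling (an illustration only, not how `datasets` shuffles internally):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7b8c9d0",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"# A fixed seed yields the same 50 distinct indices into the\n",
"# 500-row test split on every run.\n",
"rng = random.Random(42)\n",
"sample_indices = rng.sample(range(500), k=50)\n",
"assert len(set(sample_indices)) == 50"
]
},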
{
"cell_type": "markdown",
"id": "21a0422d",
"metadata": {},
"source": [
"### Prompts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c075e29",
"metadata": {},
"outputs": [],
"source": [
"PROMPT_MBPP = SystemMessage(\n",
" content=f\"\"\"\n",
" You are an AI agent responsible for generating a synthetic dataset for a programming language called AVAP (Advanced Virtual API Programming).\n",
"\n",
" You will receive a JSON object containing programming problems and their solutions written in Python.\n",
"\n",
" Your task is to:\n",
" 1. Convert each Python programming problem into an equivalent problem framed for the AVAP language.\n",
" 2. Replace the Python solution with a correct implementation written in AVAP.\n",
"\n",
" Requirements:\n",
" - The AVAP solution must follow AVAP syntax.\n",
" - Use AVAP libraries and constructs when appropriate.\n",
" - Functions in AVAP are declared using the `function` keyword.\n",
" - Preserve the original problem intent and difficulty.\n",
" - Do not produce Python code in the final solution.\n",
"\n",
" Documentation usage rules:\n",
" - You MUST rely exclusively on the AVAP documentation provided below.\n",
" - Do not invent syntax, functions, or libraries that are not described in the documentation.\n",
" - If the documentation does not contain enough information to solve the problem, return exactly:\n",
" \"I don't know how to answer this question based on the provided documentation.\"\n",
"\n",
" AVAP Documentation:\n",
" {avap_docs}\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbb459f8",
"metadata": {},
"outputs": [],
"source": [
"PROMPT_NO_MBPP = HumanMessage(\n",
" content=\"\"\"\n",
" I gave you the full context for a new programming language called AVAP (Advanced Virtual API Programming). \n",
" Using that context, generate a \"golden dataset\" of 100 question-and-answer pairs in which the user query asks for a code snippet and the response is the full code. \n",
" The questions and answers should cover the entire context provided and span all difficulty levels, from simple code snippets to more complex programs. \n",
" \n",
" Return a JSON array with a user_query and a response field for each question-and-answer pair.\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "ea1e824e",
"metadata": {},
"source": [
"### Generate dataset using the MBPP dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b7dcf2f",
"metadata": {},
"outputs": [],
"source": [
"llm_response = llm.invoke([PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))])\n",
"print(llm_response.content)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c6f5872e",
"metadata": {},
"outputs": [],
"source": [
"json_str = llm_response.content.strip().removeprefix(\"```json\").removesuffix(\"```\").strip()\n",
"synthetic_data = json.loads(json_str)"
]
},
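{
"cell_type": "markdown",
"id": "1a2b3c4d",
"metadata": {},
"source": [
"Model replies are not guaranteed to wrap the JSON in a ```json fence; some come back bare or with an unlabeled fence. A more defensive extraction helper (a sketch, not part of the original pipeline) handles both cases:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e6f7a8b",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def extract_json_block(text: str):\n",
"    \"\"\"Parse a JSON payload from an LLM reply, with or without a code fence.\"\"\"\n",
"    match = re.search(r\"```(?:json)?\\s*(.*?)\\s*```\", text, re.DOTALL)\n",
"    payload = match.group(1) if match else text.strip()\n",
"    return json.loads(payload)"
]
},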
{
"cell_type": "code",
"execution_count": null,
"id": "d26cbba7",
"metadata": {},
"outputs": [],
"source": [
"with open(INTERIM_DIR / 'synthethic_datasets/synthethic_data.json', 'w') as f:\n",
" json.dump(synthetic_data, f)"
]
},
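{
"cell_type": "markdown",
"id": "9c0d1e2f",
"metadata": {},
"source": [
"If the `synthethic_datasets/` directory does not exist yet, the write above raises `FileNotFoundError`. Creating it once before dumping makes the notebook runnable from a clean checkout (assumes `INTERIM_DIR` is a `pathlib.Path`, as imported from `src.config`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f4a5b6c",
"metadata": {},
"outputs": [],
"source": [
"# Ensure the output directory exists before writing (idempotent).\n",
"out_dir = INTERIM_DIR / \"synthethic_datasets\"\n",
"out_dir.mkdir(parents=True, exist_ok=True)"
]
},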
{
"cell_type": "markdown",
"id": "fc52b327",
"metadata": {},
"source": [
"### Generate dataset without the MBPP dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b16137cb",
"metadata": {},
"outputs": [],
"source": [
"llm_response = llm.invoke([SystemMessage(content=avap_docs), PROMPT_NO_MBPP])\n",
"print(llm_response.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80d207fa",
"metadata": {},
"outputs": [],
"source": [
"json_str = llm_response.content.strip().removeprefix(\"```json\").removesuffix(\"```\").strip()\n",
"synthetic_data = json.loads(json_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13e53200",
"metadata": {},
"outputs": [],
"source": [
"with open(INTERIM_DIR / 'synthethic_datasets/synthethic_data_no_mbpp.json', 'w') as f:\n",
" json.dump(synthetic_data, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}