"""Generate a synthetic dataset from MBPP samples using an LLM.

Flow: sample MBPP test rows, prompt the model with AVAP documentation as
context, parse the model's JSON reply, and persist it to disk.
"""

import json
from enum import Enum
from pathlib import Path

import boto3
import typer
from botocore.config import Config
from datasets import load_dataset
from langchain_core.messages import HumanMessage, SystemMessage
from loguru import logger

from scripts.pipelines.input.prompts import get_prompt_mbpp
from src.config import INTERIM_DIR, RAW_DIR
from src.utils.llm_factory import create_chat_model

app = typer.Typer()


class Provider(str, Enum):
    """LLM backends this pipeline can target."""

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"


def _extract_json_payload(raw: str) -> str:
    """Return ``raw`` with an optional Markdown ```json fence removed.

    Strips surrounding whitespace *before* removing the fence: a reply that
    starts with a newline ahead of the fence would otherwise keep the fence
    (``removeprefix`` would not match) and break ``json.loads`` downstream.
    """
    return raw.strip().removeprefix("```json").removesuffix("```").strip()


@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "docs/avap.txt",
    synthetic_output_path: str = "synthetic_datasets",
) -> dict:
    """Generate synthetic dataset using the specified LLM.

    Args:
        provider: LLM backend to use (bedrock / openai / ollama).
        model: Model identifier passed to the chat-model factory.
        temperature: Sampling temperature for generation.
        num_samples: Number of MBPP test rows to sample.
        seed: Shuffle seed, for reproducible sampling.
        context_docs_path: Path to the AVAP documentation used as prompt
            context.
        synthetic_output_path: Directory where the generated JSON is written.

    Returns:
        The parsed synthetic dataset, also written to
        ``<synthetic_output_path>/synthetic_data_<provider>.json``.

    Raises:
        FileNotFoundError: If ``context_docs_path`` does not exist.
        json.JSONDecodeError: If the LLM reply is not valid JSON after
            fence stripping.
    """
    logger.info("🚀 Starting synthetic dataset generation pipeline")
    logger.info(
        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
    )
    try:
        # Long read timeout: large generations can take several minutes.
        # NOTE(review): a Bedrock client is created even for openai/ollama
        # runs — presumably create_chat_model ignores it then; confirm.
        config = Config(
            connect_timeout=10,
            read_timeout=600,
        )
        client = boto3.client("bedrock-runtime", config=config)
        logger.info("✓ Bedrock client initialized successfully")

        # Create LLM instance with specified parameters
        logger.debug(f"Creating LLM instance with provider: {provider}")
        llm = create_chat_model(
            provider=provider,
            client=client,
            model=model,
            temperature=temperature,
        )
        logger.info(f"✓ LLM initialized: {model}")

        # Load MBPP dataset
        logger.debug("Loading MBPP dataset")
        dataset_full = load_dataset("mbpp")
        logger.info("✓ MBPP dataset loaded successfully")

        # Deterministic sampling: shuffle with a fixed seed, take a prefix.
        logger.debug(f"Selecting {num_samples} random test samples from MBPP dataset")
        random_test_samples = (
            dataset_full["test"].shuffle(seed=seed).select(range(num_samples))
        )
        logger.info(f"✓ Selected {len(random_test_samples)} test samples")

        # Index samples by string position so the LLM can reference them.
        logger.debug("Preparing test samples dictionary")
        test_samples_dict = {
            str(i): {
                "text": sample["text"],
                "code": sample["code"],
            }
            for i, sample in enumerate(random_test_samples)
        }
        logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

        # Load AVAP documentation; explicit encoding so non-ASCII content
        # does not depend on the platform default.
        logger.debug(f"Loading AVAP documentation from {context_docs_path}")
        with open(context_docs_path, "r", encoding="utf-8") as f:
            avap_docs = f.read()
        logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

        # Generate prompt with AVAP context
        logger.debug("Generating prompt with AVAP context")
        get_prompt_mbpp_func = get_prompt_mbpp(avap_docs=avap_docs)
        logger.debug("✓ Prompt generated successfully")

        # Invoke LLM to generate synthetic data
        logger.info("Invoking LLM to generate synthetic dataset...")
        llm_response = llm.invoke(
            [get_prompt_mbpp_func, HumanMessage(content=str(test_samples_dict))]
        )
        logger.info("✓ LLM response received")

        # Parse JSON response (fence stripping handles leading whitespace).
        logger.debug("Parsing LLM response as JSON")
        synthetic_data = json.loads(_extract_json_payload(llm_response.content))
        logger.info(
            f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
        )

        # Persist BEFORE announcing completion so a write failure is not
        # masked by a premature success message.
        output_dir = Path(synthetic_output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"synthetic_data_{provider.value}.json"
        with output_file.open("w", encoding="utf-8") as f:
            json.dump(synthetic_data, f, ensure_ascii=False, indent=2)

        logger.info(
            f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
        )
        return synthetic_data

    except Exception as exc:
        # logger.exception records the message AND the full traceback in
        # one call; the original double-logged (error + exception).
        logger.exception(f"Error during synthetic dataset generation: {exc}")
        raise


if __name__ == "__main__":
    logger.info("=" * 50)
    logger.info("Dataset Synthetic Generation Pipeline")
    logger.info("=" * 50)
    try:
        app()
    except Exception as exc:
        logger.error(f"Pipeline execution failed: {exc}")
        raise