"""Generate a synthetic dataset from MBPP samples using an LLM.

Flow: sample MBPP test rows, prompt the model with AVAP documentation as
context, parse the model's JSON reply, and persist it to disk.
"""

import json
from enum import Enum
from pathlib import Path

import boto3
import typer
from botocore.config import Config
from datasets import load_dataset
from langchain_core.messages import HumanMessage, SystemMessage
from loguru import logger

from scripts.pipelines.input.prompts import get_prompt_mbpp
from src.config import INTERIM_DIR, RAW_DIR
from src.utils.llm_factory import create_chat_model

app = typer.Typer()


class Provider(str, Enum):
    """LLM backends this pipeline can target."""

    bedrock = "bedrock"
    openai = "openai"
    ollama = "ollama"


def _extract_json_payload(raw: str) -> str:
    """Return ``raw`` with an optional Markdown ```json fence removed.

    Strips surrounding whitespace *before* removing the fence: a reply that
    starts with a newline ahead of the fence would otherwise keep the fence
    (``removeprefix`` would not match) and break ``json.loads`` downstream.
    """
    return raw.strip().removeprefix("```json").removesuffix("```").strip()


@app.command()
def generate_synthetic_dataset(
    provider: Provider = Provider.bedrock,
    model: str = "global.anthropic.claude-sonnet-4-6",
    temperature: float = 0.0,
    num_samples: int = 10,
    seed: int = 42,
    context_docs_path: str = "docs/avap.txt",
    synthetic_output_path: str = "synthetic_datasets",
) -> dict:
    """Generate synthetic dataset using the specified LLM.

    Args:
        provider: LLM backend to use (bedrock / openai / ollama).
        model: Model identifier passed to the chat-model factory.
        temperature: Sampling temperature for generation.
        num_samples: Number of MBPP test rows to sample.
        seed: Shuffle seed, for reproducible sampling.
        context_docs_path: Path to the AVAP documentation used as prompt
            context.
        synthetic_output_path: Directory where the generated JSON is written.

    Returns:
        The parsed synthetic dataset, also written to
        ``<synthetic_output_path>/synthetic_data_<provider>.json``.

    Raises:
        FileNotFoundError: If ``context_docs_path`` does not exist.
        json.JSONDecodeError: If the LLM reply is not valid JSON after
            fence stripping.
    """
    logger.info("🚀 Starting synthetic dataset generation pipeline")
    logger.info(
        f"Configuration - Provider: {provider}, Model: {model}, Temperature: {temperature}, Samples: {num_samples}, Seed: {seed}"
    )
    try:
        # Long read timeout: large generations can take several minutes.
        # NOTE(review): a Bedrock client is created even for openai/ollama
        # runs — presumably create_chat_model ignores it then; confirm.
        config = Config(
            connect_timeout=10,
            read_timeout=600,
        )
        client = boto3.client("bedrock-runtime", config=config)
        logger.info("✓ Bedrock client initialized successfully")

        # Create LLM instance with specified parameters
        logger.debug(f"Creating LLM instance with provider: {provider}")
        llm = create_chat_model(
            provider=provider,
            client=client,
            model=model,
            temperature=temperature,
        )
        logger.info(f"✓ LLM initialized: {model}")

        # Load MBPP dataset
        logger.debug("Loading MBPP dataset")
        dataset_full = load_dataset("mbpp")
        logger.info("✓ MBPP dataset loaded successfully")

        # Deterministic sampling: shuffle with a fixed seed, take a prefix.
        logger.debug(f"Selecting {num_samples} random test samples from MBPP dataset")
        random_test_samples = (
            dataset_full["test"].shuffle(seed=seed).select(range(num_samples))
        )
        logger.info(f"✓ Selected {len(random_test_samples)} test samples")

        # Index samples by string position so the LLM can reference them.
        logger.debug("Preparing test samples dictionary")
        test_samples_dict = {
            str(i): {
                "text": sample["text"],
                "code": sample["code"],
            }
            for i, sample in enumerate(random_test_samples)
        }
        logger.info(f"✓ Prepared {len(test_samples_dict)} samples for processing")

        # Load AVAP documentation; explicit encoding so non-ASCII content
        # does not depend on the platform default.
        logger.debug(f"Loading AVAP documentation from {context_docs_path}")
        with open(context_docs_path, "r", encoding="utf-8") as f:
            avap_docs = f.read()
        logger.info(f"✓ AVAP documentation loaded ({len(avap_docs)} characters)")

        # Generate prompt with AVAP context
        logger.debug("Generating prompt with AVAP context")
        get_prompt_mbpp_func = get_prompt_mbpp(avap_docs=avap_docs)
        logger.debug("✓ Prompt generated successfully")

        # Invoke LLM to generate synthetic data
        logger.info("Invoking LLM to generate synthetic dataset...")
        llm_response = llm.invoke(
            [get_prompt_mbpp_func, HumanMessage(content=str(test_samples_dict))]
        )
        logger.info("✓ LLM response received")

        # Parse JSON response (fence stripping handles leading whitespace).
        logger.debug("Parsing LLM response as JSON")
        synthetic_data = json.loads(_extract_json_payload(llm_response.content))
        logger.info(
            f"✓ Successfully parsed synthetic data with {len(synthetic_data)} samples"
        )

        # Persist BEFORE announcing completion so a write failure is not
        # masked by a premature success message.
        output_dir = Path(synthetic_output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"synthetic_data_{provider.value}.json"
        with output_file.open("w", encoding="utf-8") as f:
            json.dump(synthetic_data, f, ensure_ascii=False, indent=2)

        logger.info(
            f"Pipeline completed successfully! Generated {len(synthetic_data)} synthetic samples"
        )
        return synthetic_data

    except Exception as exc:
        # logger.exception records the message AND the full traceback in
        # one call; the original double-logged (error + exception).
        logger.exception(f"Error during synthetic dataset generation: {exc}")
        raise


if __name__ == "__main__":
    logger.info("=" * 50)
    logger.info("Dataset Synthetic Generation Pipeline")
    logger.info("=" * 50)
    try:
        app()
    except Exception as exc:
        logger.error(f"Pipeline execution failed: {exc}")
        raise