import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

DOCS_URL = "https://developer.avapframework.com/docs"
OUTPUT_FILE = "avap_docs.json"
# Text-bearing tags extracted from each docs block, in document order.
CONTENT_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"]


def fetch_html(url: str, timeout: float = 15.0) -> str:
    """Return the fully rendered page source for *url* using headless Chrome.

    Waits for the JS-rendered docs container (#contentDoc) to appear instead
    of a fixed sleep, so the scrape is deterministic on slow connections and
    faster on quick ones. Raises selenium's TimeoutException if the container
    never appears within *timeout* seconds.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Bounded explicit wait replaces the original fixed time.sleep(5).
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "contentDoc"))
        )
        return driver.page_source
    finally:
        # Always tear down the browser, even if navigation or the wait fails.
        driver.quit()


def parse_blocks(html: str) -> list[dict]:
    """Parse the docs HTML into a list of structured content blocks.

    Each returned dict has:
      - block_index: 1-based position of the block on the page
      - section_id: id of the nearest ancestor <div> carrying an id, or None
      - content: ordered list of {"tag", "text"} items
      - full_text: all item texts joined with newlines
    """
    soup = BeautifulSoup(html, "html.parser")
    # All content blocks inside the col-md-12 columns of #contentDoc.
    blocks = soup.select("#contentDoc .col-md-12 div.body-md")

    data = []
    for i, block in enumerate(blocks, start=1):
        parent_section = block.find_parent("div", id=True)

        content = []
        full_text_parts = []
        # Walk every text-bearing element in document order; skip empty text.
        for elem in block.find_all(CONTENT_TAGS, recursive=True):
            text = elem.get_text(" ", strip=True)
            if not text:
                continue
            content.append({"tag": elem.name, "text": text})
            full_text_parts.append(text)

        data.append(
            {
                "block_index": i,
                "section_id": parent_section.get("id") if parent_section else None,
                "content": content,
                "full_text": "\n".join(full_text_parts),
            }
        )
    return data


def main() -> None:
    """Scrape the AVAP docs page and dump the parsed blocks to JSON."""
    html = fetch_html(DOCS_URL)
    data = parse_blocks(html)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("Bloques encontrados:", len(data))


if __name__ == "__main__":
    main()
load_dataset
import boto3
import typer
import logging
from botocore.config import Config
from langchain_core.messages import SystemMessage, HumanMessage
from src.utils.llm_factory import create_chat_model
from src.config import RAW_DIR, INTERIM_DIR


logger = logging.getLogger(__name__)
app = typer.Typer()

# Instruction prompt for the model.
# NOTE(review): the original script referenced PROMPT_MBPP without ever
# defining it (NameError at import time). This is a reconstructed
# placeholder — replace with the project's intended prompt text.
PROMPT_MBPP = SystemMessage(
    content=(
        "You will receive a JSON mapping of MBPP samples, each entry having a "
        "'text' problem statement and a 'code' reference solution. Return ONLY "
        "a JSON object with the same keys, containing synthetic variations of "
        "each sample that preserve the {'text', 'code'} structure."
    )
)


@app.command()
def generate(
    n_samples: int = 50,
    seed: int = 42,
    output_name: str = "mbpp_synthetic.json",
) -> None:
    """Generate a synthetic dataset from MBPP via a Bedrock-hosted LLM.

    Samples *n_samples* problems from the MBPP test split (deterministic via
    *seed*), asks the model for synthetic variants, parses the JSON reply,
    and writes it under INTERIM_DIR. Registered as a typer command so the
    pipeline no longer runs as a module-import side effect (the original
    typer app had no commands, so `app()` could never dispatch to this code).
    """
    config = Config(
        region_name="us-east-1",
        connect_timeout=10,
        read_timeout=600,  # long-generation Bedrock calls can take minutes
    )
    client = boto3.client("bedrock-runtime", config=config)

    llm = create_chat_model(
        provider="bedrock",
        client=client,
        model="global.anthropic.claude-sonnet-4-6",
        temperature=0,  # deterministic output for reproducibility
    )

    dataset_full = load_dataset("mbpp")
    # Deterministic random subset of the test split.
    random_test_samples = (
        dataset_full["test"].shuffle(seed=seed).select(range(n_samples))
    )

    test_samples_dict = {
        str(i): {"text": sample["text"], "code": sample["code"]}
        for i, sample in enumerate(random_test_samples)
    }

    llm_response = llm.invoke(
        [PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))]
    )

    # The model may wrap its reply in a ```json fence; strip it before parsing.
    json_str = llm_response.content.removeprefix("```json").removesuffix("```").strip()
    synthetic_data = json.loads(json_str)

    # Persist the result — the original computed it but never saved anything.
    # Assumes INTERIM_DIR is a pathlib.Path — confirm against src.config.
    output_path = INTERIM_DIR / output_name
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(synthetic_data, f, indent=2, ensure_ascii=False)
    logger.info("Wrote %d synthetic samples to %s", len(synthetic_data), output_path)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        app()
    except Exception as exc:
        logger.exception(exc)
        raise