working on scrappy
This commit is contained in:
parent
c8da317dd8
commit
4afba7d89d
|
|
@ -0,0 +1,59 @@
|
|||
"""Scrape the AVAP developer documentation and dump its text blocks to JSON.

Renders https://developer.avapframework.com/docs with headless Chrome (the
page builds its content with JavaScript), then extracts every heading,
paragraph, list item, and code element from the documentation blocks and
writes them to ``avap_docs.json``.
"""

import json

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")  # needed when running as root/in containers
chrome_options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm exhaustion in Docker

driver = webdriver.Chrome(options=chrome_options)

try:
    driver.get(url)
    # Wait for the JS-rendered content container instead of a fixed sleep:
    # deterministic, and fails fast (TimeoutException) if the page layout changes.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc"))
    )
    html = driver.page_source
finally:
    # Always release the browser process, even if the fetch fails.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# Every documentation block inside the main content column (col-md-12).
blocks = soup.select("#contentDoc .col-md-12 div.body-md")

data = []

for i, block in enumerate(blocks, start=1):
    # Nearest ancestor div carrying an id — used as the section identifier.
    parent_section = block.find_parent("div", id=True)

    content = []
    full_text_parts = []

    # Walk the text-bearing elements in document order.
    for elem in block.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"],
        recursive=True,
    ):
        text = elem.get_text(" ", strip=True)
        if not text:
            continue

        content.append({"tag": elem.name, "text": text})
        full_text_parts.append(text)

    data.append({
        "block_index": i,
        "section_id": parent_section.get("id") if parent_section else None,
        "content": content,
        "full_text": "\n".join(full_text_parts),
    })

with open("avap_docs.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Bloques encontrados:", len(data))
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
"""Generate synthetic MBPP-style data with a Bedrock-hosted Claude model.

Samples 50 random problems from the MBPP test split, sends them to the model
in a single prompt, and parses the JSON object the model returns.
"""

import json
import logging

import boto3
import typer
from botocore.config import Config
from datasets import load_dataset
from langchain_core.messages import HumanMessage, SystemMessage

from src.config import RAW_DIR, INTERIM_DIR
from src.utils.llm_factory import create_chat_model

logger = logging.getLogger(__name__)
app = typer.Typer()

# NOTE(review): PROMPT_MBPP was referenced but never defined in the original
# script (guaranteed NameError). The unused SystemMessage import suggests it
# was meant to be one; this is a placeholder instruction — confirm the
# intended prompt text with the author.
PROMPT_MBPP = SystemMessage(
    content=(
        "You will receive a JSON object mapping ids to MBPP problems, each "
        "with a 'text' description and a reference 'code' solution. For each "
        "entry, generate a new synthetic problem and solution in the same "
        "style. Respond with a JSON object keyed by the same ids, wrapped in "
        "a ```json fenced block."
    )
)

config = Config(
    region_name="us-east-1",
    connect_timeout=10,
    read_timeout=600,  # generations over 50 samples can take several minutes
)

client = boto3.client("bedrock-runtime", config=config)

llm = create_chat_model(
    provider="bedrock",
    client=client,
    model="global.anthropic.claude-sonnet-4-6",
    temperature=0,
)


@app.command()
def generate() -> None:
    """Sample 50 MBPP test problems, query the model, and parse its JSON reply.

    Previously this pipeline ran as module-level side effects while the Typer
    app had no commands (so ``app()`` could not run it); it is now the app's
    single default command.
    """
    dataset_full = load_dataset("mbpp")

    # Fixed seed keeps the 50-sample subset reproducible across runs.
    random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))

    test_samples_dict = {
        str(i): {
            "text": sample["text"],
            "code": sample["code"],
        }
        for i, sample in enumerate(random_test_samples)
    }

    llm_response = llm.invoke(
        [PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))]
    )

    # The model wraps its JSON answer in a ```json fenced block; strip it
    # before parsing.
    json_str = (
        llm_response.content.removeprefix("```json").removesuffix("```").strip()
    )
    synthetic_data = json.loads(json_str)
    # The original computed this and silently dropped it; surface the result.
    logger.info("Parsed %d synthetic samples", len(synthetic_data))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        app()
    except Exception as exc:
        logger.exception(exc)
        raise
|
||||
Loading…
Reference in New Issue