working on scrappy

This commit is contained in:
pseco 2026-03-09 15:00:07 +01:00
parent c8da317dd8
commit 4afba7d89d
2 changed files with 117 additions and 0 deletions

View File

@ -0,0 +1,59 @@
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Scrape the AVAP documentation page with headless Chrome and dump every
# content block (its tagged elements plus the concatenated text) to
# "avap_docs.json".
url = "https://developer.avapframework.com/docs"

# Tags whose text is collected from each content block, in document order.
CONTENT_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"]

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Fixed pause before reading the DOM; presumably gives client-side
    # rendering time to finish -- TODO confirm 5 s is enough / needed.
    time.sleep(5)
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page failed.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# All content blocks inside col-md-12
blocks = soup.select("#contentDoc .col-md-12 div.body-md")

data = []
for i, block in enumerate(blocks, start=1):
    # Nearest enclosing <div> that carries an id attribute, if any.
    parent_section = block.find_parent("div", id=True)
    content = []
    full_text_parts = []

    # Walk every matching element in document order
    for elem in block.find_all(CONTENT_TAGS, recursive=True):
        # An element nested inside another collected element (e.g. <code>
        # inside <pre>, <p> inside <li>) already contributes its text through
        # that ancestor; emitting it again would duplicate the text in both
        # "content" and "full_text". Only ancestors within this block count.
        is_nested = False
        for ancestor in elem.parents:
            if ancestor is block:
                break
            if ancestor.name in CONTENT_TAGS:
                is_nested = True
                break
        if is_nested:
            continue

        text = elem.get_text(" ", strip=True)
        if not text:
            continue

        content.append({
            "tag": elem.name,
            "text": text,
        })
        full_text_parts.append(text)

    data.append({
        "block_index": i,
        "section_id": parent_section.get("id") if parent_section else None,
        "content": content,
        "full_text": "\n".join(full_text_parts),
    })

with open("avap_docs.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Bloques encontrados:", len(data))

View File

@ -0,0 +1,58 @@
import json
from datasets import load_dataset
import boto3
import typer
import logging
from botocore.config import Config
from langchain_core.messages import SystemMessage, HumanMessage
from src.utils.llm_factory import create_chat_model
from src.config import RAW_DIR, INTERIM_DIR
# Build a synthetic dataset by sending a fixed sample of MBPP test problems to
# a Bedrock-hosted chat model and parsing the JSON it returns.
logger = logging.getLogger(__name__)
app = typer.Typer()

# Bedrock runtime client pinned to us-east-1 with explicit connect/read
# timeouts.
config = Config(
    region_name="us-east-1",
    connect_timeout=10,
    read_timeout=600,
)
client = boto3.client("bedrock-runtime", config=config)

llm = create_chat_model(
    provider="bedrock",
    client=client,
    model="global.anthropic.claude-sonnet-4-6",
    temperature=0,
)

dataset_full = load_dataset("mbpp")
# Fixed seed so the same 50 test samples are selected on every run.
random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))
# Map "0".."49" -> the problem statement and its reference solution.
test_samples_dict = {
    str(i): {
        "text": sample["text"],
        "code": sample["code"],
    }
    for i, sample in enumerate(random_test_samples)
}

# NOTE(review): PROMPT_MBPP is neither defined nor imported in the visible
# file, so this line raises NameError as written -- confirm where the prompt
# is meant to come from (SystemMessage is imported above but never used).
llm_response = llm.invoke([PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))])

# Strip surrounding whitespace BEFORE removing the Markdown fence:
# removeprefix/removesuffix only match at the very ends of the string, so a
# trailing newline after the closing fence would otherwise leave the fence in
# place and make json.loads fail. A bare "```" opening fence is accepted too.
# NOTE(review): assumes llm_response.content is a str -- confirm for this
# provider.
json_str = (
    llm_response.content.strip()
    .removeprefix("```json")
    .removeprefix("```")
    .removesuffix("```")
    .strip()
)
synthetic_data = json.loads(json_str)

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        # NOTE(review): no commands are registered on `app` in the visible
        # code -- confirm this call does what is intended.
        app()
    except Exception as exc:
        # Log the full traceback, then re-raise so the failure still
        # propagates to the caller.
        logger.exception(exc)
        raise