working on scrappy
This commit is contained in:
parent
c8da317dd8
commit
4afba7d89d
|
|
@ -0,0 +1,59 @@
|
|||
"""Scrape the AVAP developer documentation and dump its text blocks to JSON.

Renders https://developer.avapframework.com/docs with headless Chrome (the
page builds its content with JavaScript), then extracts every heading,
paragraph, list item, and code element from the documentation blocks and
writes them to ``avap_docs.json``.
"""

import json

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")  # needed when running as root/in containers
chrome_options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm exhaustion in Docker

driver = webdriver.Chrome(options=chrome_options)

try:
    driver.get(url)
    # Wait for the JS-rendered content container instead of a fixed sleep:
    # deterministic, and fails fast (TimeoutException) if the page layout changes.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc"))
    )
    html = driver.page_source
finally:
    # Always release the browser process, even if the fetch fails.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# Every documentation block inside the main content column (col-md-12).
blocks = soup.select("#contentDoc .col-md-12 div.body-md")

data = []

for i, block in enumerate(blocks, start=1):
    # Nearest ancestor div carrying an id — used as the section identifier.
    parent_section = block.find_parent("div", id=True)

    content = []
    full_text_parts = []

    # Walk the text-bearing elements in document order.
    for elem in block.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"],
        recursive=True,
    ):
        text = elem.get_text(" ", strip=True)
        if not text:
            continue

        content.append({"tag": elem.name, "text": text})
        full_text_parts.append(text)

    data.append({
        "block_index": i,
        "section_id": parent_section.get("id") if parent_section else None,
        "content": content,
        "full_text": "\n".join(full_text_parts),
    })

with open("avap_docs.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Bloques encontrados:", len(data))
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
"""Generate synthetic MBPP-style data with a Bedrock-hosted Claude model.

Samples 50 random problems from the MBPP test split, sends them to the model
in a single prompt, and parses the JSON object the model returns.
"""

import json
import logging

import boto3
import typer
from botocore.config import Config
from datasets import load_dataset
from langchain_core.messages import HumanMessage, SystemMessage

from src.config import RAW_DIR, INTERIM_DIR
from src.utils.llm_factory import create_chat_model

logger = logging.getLogger(__name__)
app = typer.Typer()

# NOTE(review): PROMPT_MBPP was referenced but never defined in the original
# script (guaranteed NameError). The unused SystemMessage import suggests it
# was meant to be one; this is a placeholder instruction — confirm the
# intended prompt text with the author.
PROMPT_MBPP = SystemMessage(
    content=(
        "You will receive a JSON object mapping ids to MBPP problems, each "
        "with a 'text' description and a reference 'code' solution. For each "
        "entry, generate a new synthetic problem and solution in the same "
        "style. Respond with a JSON object keyed by the same ids, wrapped in "
        "a ```json fenced block."
    )
)

config = Config(
    region_name="us-east-1",
    connect_timeout=10,
    read_timeout=600,  # generations over 50 samples can take several minutes
)

client = boto3.client("bedrock-runtime", config=config)

llm = create_chat_model(
    provider="bedrock",
    client=client,
    model="global.anthropic.claude-sonnet-4-6",
    temperature=0,
)


@app.command()
def generate() -> None:
    """Sample 50 MBPP test problems, query the model, and parse its JSON reply.

    Previously this pipeline ran as module-level side effects while the Typer
    app had no commands (so ``app()`` could not run it); it is now the app's
    single default command.
    """
    dataset_full = load_dataset("mbpp")

    # Fixed seed keeps the 50-sample subset reproducible across runs.
    random_test_samples = dataset_full["test"].shuffle(seed=42).select(range(50))

    test_samples_dict = {
        str(i): {
            "text": sample["text"],
            "code": sample["code"],
        }
        for i, sample in enumerate(random_test_samples)
    }

    llm_response = llm.invoke(
        [PROMPT_MBPP, HumanMessage(content=str(test_samples_dict))]
    )

    # The model wraps its JSON answer in a ```json fenced block; strip it
    # before parsing.
    json_str = (
        llm_response.content.removeprefix("```json").removesuffix("```").strip()
    )
    synthetic_data = json.loads(json_str)
    # The original computed this and silently dropped it; surface the result.
    logger.info("Parsed %d synthetic samples", len(synthetic_data))


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        app()
    except Exception as exc:
        logger.exception(exc)
        raise
|
||||
Loading…
Reference in New Issue