"""Scrape the AVAP developer docs page and dump its content blocks to JSON.

The page is loaded in headless Chrome, the rendered HTML is parsed with
BeautifulSoup, and every ``div.body-md`` block under
``#contentDoc .col-md-12`` is written to ``avap_docs.json`` as a record with
its 1-based index, the id of its nearest ``div`` ancestor that has an id, the
list of tagged text items it contains, and their joined plain text.
"""
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Tags whose text is collected from each content block.
CONTENT_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"]

# Fixed pause, in seconds, between requesting the page and reading its source.
RENDER_WAIT_SECONDS = 5


def _has_matched_ancestor(elem, root):
    """Return True if *elem* sits inside another CONTENT_TAGS element of *root*.

    The walk goes from *elem* upwards and stops at *root*, so tags that
    enclose the block itself are never taken into account.
    """
    for ancestor in elem.parents:
        if ancestor is root:
            return False
        if ancestor.name in CONTENT_TAGS:
            return True
    return False


url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    time.sleep(RENDER_WAIT_SECONDS)
    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# All content blocks inside col-md-12.
blocks = soup.select("#contentDoc .col-md-12 div.body-md")

data = []
for i, block in enumerate(blocks, start=1):
    parent_section = block.find_parent("div", id=True)
    content = []
    full_text_parts = []

    # Walk every matching element in document order.
    for elem in block.find_all(CONTENT_TAGS, recursive=True):
        text = elem.get_text(" ", strip=True)
        if not text:
            continue

        content.append({"tag": elem.name, "text": text})

        # A matched container (e.g. <pre>, <li>, <p>) already carries the text
        # of any matched element nested inside it (<code>, <p>, ...), so only
        # outermost matches feed full_text; otherwise that text is repeated.
        if not _has_matched_ancestor(elem, block):
            full_text_parts.append(text)

    data.append({
        "block_index": i,
        "section_id": parent_section.get("id") if parent_section else None,
        "content": content,
        "full_text": "\n".join(full_text_parts),
    })

with open("avap_docs.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Bloques encontrados:", len(data))