"""Scrape the AVAP developer docs page and convert it to Markdown.

Loads the JavaScript-rendered docs page in headless Chrome, extracts
headings, paragraphs, list items and code blocks from the main content
container, and writes the result to ``avap_docs.md``.
"""

import json  # NOTE(review): unused in this chunk; kept in case other parts of the project rely on it
import time  # NOTE(review): unused in this chunk

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://developer.avapframework.com/docs"
OUTPUT_PATH = "avap_docs.md"

# Markdown prefix per heading tag ("h1" -> "#", ..., "h6" -> "######"),
# built once instead of six copy-pasted if-branches.
_HEADING_PREFIX = {f"h{level}": "#" * level for level in range(1, 7)}

# Tags whose text is extracted, in document order via find_all.
_TAGS_TO_EXTRACT = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]


def fetch_rendered_html(url, timeout=15):
    """Return the fully rendered page source for *url*.

    The docs page is populated by JavaScript, so we wait until the
    ``#contentDoc`` container exists before grabbing the source.

    Raises:
        selenium.common.exceptions.TimeoutException: if the container does
            not appear within *timeout* seconds.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc")))
        return driver.page_source
    finally:
        # Always release the browser process, even when the wait times out.
        driver.quit()


def html_to_markdown(elem):
    """Convert a single BeautifulSoup element to a Markdown fragment.

    Returns ``None`` for elements with no visible text and for tags this
    converter does not handle.
    """
    text = elem.get_text(" ", strip=True)
    if not text:
        return None
    prefix = _HEADING_PREFIX.get(elem.name)
    if prefix:
        return f"{prefix} {text}\n"
    if elem.name == "p":
        return f"{text}\n"
    if elem.name == "li":
        return f"- {text}"
    if elem.name == "pre":
        # Re-extract with newline separators to preserve code line structure.
        code = elem.get_text("\n", strip=True)
        return f"\n```\n{code}\n```\n"
    return None


def main():
    """Fetch the docs page, convert it to Markdown, and write the output file."""
    html = fetch_rendered_html(URL)
    soup = BeautifulSoup(html, "html.parser")

    main_container = soup.select_one("#contentDoc .col-md-12")
    if main_container is None:
        # Fail with a clear message instead of an AttributeError on find_all
        # when the page layout changes.
        raise SystemExit("Could not find '#contentDoc .col-md-12' on the page.")

    markdown_lines = []
    for elem in main_container.find_all(_TAGS_TO_EXTRACT):
        md = html_to_markdown(elem)
        if md:
            markdown_lines.append(md)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write("\n".join(markdown_lines))


if __name__ == "__main__":
    # Guard so importing this module does not launch a browser or write files.
    main()