75 lines
1.8 KiB
Python
75 lines
1.8 KiB
Python
import json
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
from selenium import webdriver
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
# Documentation page to scrape.
url = "https://developer.avapframework.com/docs"

# Run Chrome without opening a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)

try:
    driver.get(url)

    # Wait (15 second timeout) until the #contentDoc element is present in
    # the DOM, then snapshot the page markup.
    wait = WebDriverWait(driver, 15)
    content_locator = (By.CSS_SELECTOR, "#contentDoc")
    wait.until(EC.presence_of_element_located(content_locator))

    html = driver.page_source
finally:
    # Always shut the browser down, even when loading or waiting failed.
    driver.quit()

# Parse the captured markup and pick the element the rest of the script
# extracts content from.
soup = BeautifulSoup(html, "html.parser")
main_container = soup.select_one("#contentDoc .col-md-12")
|
|
|
|
def html_to_markdown(elem):
    """Convert one parsed HTML element into a Markdown fragment.

    Args:
        elem: An element exposing a ``name`` attribute (the tag name) and a
            BeautifulSoup-compatible ``get_text(separator, strip=...)``.

    Returns:
        The Markdown text for headings (``h1``-``h6``), paragraphs (``p``),
        list items (``li``) and preformatted blocks (``pre``), or ``None``
        when the element has no visible text or its tag is not handled.
    """
    text = elem.get_text(" ", strip=True)
    if not text:
        return None

    name = elem.name

    # h1..h6 map to one to six leading '#' characters.
    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        hashes = "#" * int(name[1])
        return f"{hashes} {text}\n"

    if name == "p":
        return f"{text}\n"

    if name == "li":
        return f"- {text}"

    if name == "pre":
        # Read the raw text: stripping each text node and joining them with
        # newlines would destroy indentation and put every inline child
        # (e.g. syntax-highlighting spans) on its own line.
        code = elem.get_text().rstrip().lstrip("\r\n")
        return f"\n```\n{code}\n```\n"

    return None
|
|
|
|
# Tags that html_to_markdown() knows how to render.
tags_to_extract = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]

# select_one() yields None when the selector matches nothing; fail with a
# clear message instead of an AttributeError on the find_all() call below.
if main_container is None:
    raise RuntimeError(
        "Documentation container '#contentDoc .col-md-12' was not found in the page"
    )

# Convert every matching element in document order, dropping the ones that
# produce no Markdown (empty text or unhandled tag).
markdown_lines = [
    md
    for md in map(html_to_markdown, main_container.find_all(tags_to_extract))
    if md
]

markdown = "\n".join(markdown_lines)

with open("avap_docs.md", "w", encoding="utf-8") as f:
    f.write(markdown)