# assistance-engine/scratches/pseco/scrappy.py
# Scrapes the AVAP framework docs page (JS-rendered) and converts it to Markdown.
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Fetch the rendered docs page. The site builds its content with JavaScript,
# so a plain HTTP GET would return an empty shell — drive a headless Chrome
# and wait for the docs container to appear before grabbing the DOM.
url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Wait up to 15 s for the JS-rendered docs container to be in the DOM.
    wait = WebDriverWait(driver, 15)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc")))
    html = driver.page_source
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")
# NOTE(review): assumes the docs body lives under '#contentDoc .col-md-12';
# this is None if the site layout changes.
main_container = soup.select_one("#contentDoc .col-md-12")
def html_to_markdown(elem):
    """Convert a single parsed HTML element to a Markdown fragment.

    Supported tags: h1-h6 (heading markers), p (paragraph), li (bullet,
    no trailing newline — items are joined by the caller), and pre
    (fenced code block preserving internal line breaks).

    Returns:
        The Markdown string for a supported tag, or None for unsupported
        tags and elements with no visible text.
    """
    text = elem.get_text(" ", strip=True)
    if not text:
        return None
    name = elem.name
    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        # Heading level equals the digit in the tag name (h3 -> "###").
        return f"{'#' * int(name[1])} {text}\n"
    if name == "p":
        return f"{text}\n"
    if name == "li":
        # No trailing newline: the caller joins items with "\n".
        return f"- {text}"
    if name == "pre":
        # Re-extract with "\n" as separator so code keeps its line breaks.
        code = elem.get_text("\n", strip=True)
        return f"\n```\n{code}\n```\n"
    return None
# Walk the docs container in document order and emit one Markdown fragment
# per supported element, then write the joined result to disk.
if main_container is None:
    # Fail loudly with a clear message instead of an opaque AttributeError
    # from find_all() when the site's layout changes.
    raise SystemExit(
        "Could not find '#contentDoc .col-md-12' in the fetched page; "
        "the site layout may have changed."
    )

tags_to_extract = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]
markdown_lines = []
for elem in main_container.find_all(tags_to_extract):
    md = html_to_markdown(elem)
    if md:
        markdown_lines.append(md)

markdown = "\n".join(markdown_lines)
with open("avap_docs.md", "w", encoding="utf-8") as f:
    f.write(markdown)