# assistance-engine/scratches/pseco/scrappy.py
# Scrapes the AVAP framework docs page (JS-rendered) and converts it to Markdown.
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Fetch the rendered docs page. The site builds its content with JavaScript,
# so a plain HTTP GET would return an empty shell — drive a headless Chrome
# and wait for the docs container to appear before grabbing the DOM.
url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    # Wait up to 15 s for the JS-rendered docs container to be in the DOM.
    wait = WebDriverWait(driver, 15)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc")))
    html = driver.page_source
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")
# NOTE(review): assumes the docs body lives under '#contentDoc .col-md-12';
# this is None if the site layout changes.
main_container = soup.select_one("#contentDoc .col-md-12")
def html_to_markdown(elem):
    """Convert a single parsed HTML element to a Markdown fragment.

    Supported tags: h1-h6 (heading markers), p (paragraph), li (bullet,
    no trailing newline — items are joined by the caller), and pre
    (fenced code block preserving internal line breaks).

    Returns:
        The Markdown string for a supported tag, or None for unsupported
        tags and elements with no visible text.
    """
    text = elem.get_text(" ", strip=True)
    if not text:
        return None
    name = elem.name
    if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        # Heading level equals the digit in the tag name (h3 -> "###").
        return f"{'#' * int(name[1])} {text}\n"
    if name == "p":
        return f"{text}\n"
    if name == "li":
        # No trailing newline: the caller joins items with "\n".
        return f"- {text}"
    if name == "pre":
        # Re-extract with "\n" as separator so code keeps its line breaks.
        code = elem.get_text("\n", strip=True)
        return f"\n```\n{code}\n```\n"
    return None
# Walk the docs container in document order and emit one Markdown fragment
# per supported element, then write the joined result to disk.
if main_container is None:
    # Fail loudly with a clear message instead of an opaque AttributeError
    # from find_all() when the site's layout changes.
    raise SystemExit(
        "Could not find '#contentDoc .col-md-12' in the fetched page; "
        "the site layout may have changed."
    )

tags_to_extract = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]
markdown_lines = []
for elem in main_container.find_all(tags_to_extract):
    md = html_to_markdown(elem)
    if md:
        markdown_lines.append(md)

markdown = "\n".join(markdown_lines)
with open("avap_docs.md", "w", encoding="utf-8") as f:
    f.write(markdown)