feat(chunk): enhance file reading and processing logic
- Updated `read_files` function to return a list of dictionaries containing 'content' and 'title' keys.
- Added logic to handle concatenation of file contents and improved handling of file prefixes.
- Introduced `get_chunk_docs` function to chunk document contents using `SemanticChunker`.
- Added `convert_chunks_to_document` function to convert chunked content into `Document` objects.
- Integrated logging for the chunking process.
- Updated dependencies in `uv.lock` to include `chonkie` and other related packages.
This commit is contained in:
parent
f6bfba5561
commit
bf3c7f36d8
File diff suppressed because it is too large
Load Diff
|
|
@ -5,6 +5,7 @@ description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"chonkie[semantic]>=1.5.6",
|
||||||
"grpcio>=1.78.0",
|
"grpcio>=1.78.0",
|
||||||
"grpcio-reflection>=1.78.0",
|
"grpcio-reflection>=1.78.0",
|
||||||
"grpcio-tools>=1.78.0",
|
"grpcio-tools>=1.78.0",
|
||||||
|
|
@ -36,5 +37,6 @@ dev = [
|
||||||
"polars>=1.38.1",
|
"polars>=1.38.1",
|
||||||
"ragas>=0.4.3",
|
"ragas>=0.4.3",
|
||||||
"ruff>=0.15.1",
|
"ruff>=0.15.1",
|
||||||
|
"selenium>=4.41.0",
|
||||||
"tree-sitter-language-pack>=0.13.0",
|
"tree-sitter-language-pack>=0.13.0",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -1,59 +1,75 @@
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from bs4 import BeautifulSoup
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
url = "https://developer.avapframework.com/docs"
|
url = "https://developer.avapframework.com/docs"
|
||||||
|
|
||||||
chrome_options = Options()
|
chrome_options = Options()
|
||||||
chrome_options.add_argument("--headless")
|
chrome_options.add_argument("--headless")
|
||||||
chrome_options.add_argument("--no-sandbox")
|
|
||||||
chrome_options.add_argument("--disable-dev-shm-usage")
|
|
||||||
|
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
driver = webdriver.Chrome(options=chrome_options)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
time.sleep(5)
|
|
||||||
|
wait = WebDriverWait(driver, 15)
|
||||||
|
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc")))
|
||||||
|
|
||||||
html = driver.page_source
|
html = driver.page_source
|
||||||
finally:
|
finally:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
# Todos los bloques de contenido dentro de col-md-12
|
main_container = soup.select_one("#contentDoc .col-md-12")
|
||||||
blocks = soup.select("#contentDoc .col-md-12 div.body-md")
|
|
||||||
|
|
||||||
data = []
|
def html_to_markdown(elem):
|
||||||
|
text = elem.get_text(" ", strip=True)
|
||||||
|
|
||||||
for i, block in enumerate(blocks, start=1):
|
if not text:
|
||||||
parent_section = block.find_parent("div", id=True)
|
return None
|
||||||
|
|
||||||
content = []
|
if elem.name == "h1":
|
||||||
full_text_parts = []
|
return f"# {text}\n"
|
||||||
|
if elem.name == "h2":
|
||||||
|
return f"## {text}\n"
|
||||||
|
if elem.name == "h3":
|
||||||
|
return f"### {text}\n"
|
||||||
|
if elem.name == "h4":
|
||||||
|
return f"#### {text}\n"
|
||||||
|
if elem.name == "h5":
|
||||||
|
return f"##### {text}\n"
|
||||||
|
if elem.name == "h6":
|
||||||
|
return f"###### {text}\n"
|
||||||
|
|
||||||
# Recorremos todos los elementos en orden
|
if elem.name == "p":
|
||||||
for elem in block.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre", "code"], recursive=True):
|
return f"{text}\n"
|
||||||
text = elem.get_text(" ", strip=True)
|
|
||||||
if not text:
|
|
||||||
continue
|
|
||||||
|
|
||||||
item = {
|
if elem.name == "li":
|
||||||
"tag": elem.name,
|
return f"- {text}"
|
||||||
"text": text
|
|
||||||
}
|
|
||||||
content.append(item)
|
|
||||||
full_text_parts.append(text)
|
|
||||||
|
|
||||||
data.append({
|
if elem.name == "pre":
|
||||||
"block_index": i,
|
code = elem.get_text("\n", strip=True)
|
||||||
"section_id": parent_section.get("id") if parent_section else None,
|
return f"\n```\n{code}\n```\n"
|
||||||
"content": content,
|
|
||||||
"full_text": "\n".join(full_text_parts)
|
|
||||||
})
|
|
||||||
|
|
||||||
with open("avap_docs.json", "w", encoding="utf-8") as f:
|
return None
|
||||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
print("Bloques encontrados:", len(data))
|
markdown_lines = []
|
||||||
|
|
||||||
|
tags_to_extract = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]
|
||||||
|
|
||||||
|
for elem in main_container.find_all(tags_to_extract):
|
||||||
|
md = html_to_markdown(elem)
|
||||||
|
|
||||||
|
if md:
|
||||||
|
markdown_lines.append(md)
|
||||||
|
|
||||||
|
markdown = "\n".join(markdown_lines)
|
||||||
|
|
||||||
|
|
||||||
|
with open("avap_docs.md", "w", encoding="utf-8") as f:
|
||||||
|
f.write(markdown)
|
||||||
|
|
@ -7,12 +7,16 @@ import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
from langchain_elasticsearch import ElasticsearchStore
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||||
|
from langchain_experimental.text_splitter import SemanticChunker
|
||||||
|
|
||||||
|
from src.utils.emb_factory import create_embedding_model
|
||||||
|
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||||
|
|
@ -20,6 +24,7 @@ OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||||
|
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
class DistanceStrategy(str, Enum):
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
euclidean = "EUCLIDEAN_DISTANCE"
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,143 @@
|
||||||
|
from enum import Enum
|
||||||
|
import typer
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from langchain_elasticsearch import ElasticsearchStore
|
||||||
|
from chonkie import SemanticChunker
|
||||||
|
|
||||||
|
from src.utils.emb_factory import create_embedding_model
|
||||||
|
from scripts.pipelines.tasks.chunk import (
|
||||||
|
read_files,
|
||||||
|
get_chunk_docs,
|
||||||
|
convert_chunks_to_document
|
||||||
|
)
|
||||||
|
|
||||||
|
# Typer application object; commands are registered on it with @app.command().
app = typer.Typer()

# Runtime configuration, read once from the environment at import time.
# os.getenv returns None for any variable that is not set, so each of these
# may be None if the environment is incomplete.
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
OLLAMA_URL = os.getenv("OLLAMA_URL")
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||||
|
|
||||||
|
class DistanceStrategy(str, Enum):
    """Vector distance strategies selectable for the Elasticsearch index.

    Members are also ``str`` instances, so each one compares equal to its
    upper-case value and ``.value`` can be passed on as a plain string.
    """

    euclidean = "EUCLIDEAN_DISTANCE"
    max_inner_product = "MAX_INNER_PRODUCT"
    dot_product = "DOT_PRODUCT"
    jaccard = "JACCARD"
    cosine = "COSINE"
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
    chunk_size: int = 2048,
    chunk_threshold: float = 0.5,
    chunk_similarity_window: int = 3,
    chunk_skip_window: int = 1,
):
    """Read the AVAP docs, chunk them and (re)build the Elasticsearch index.

    The existing index named by ELASTICSEARCH_INDEX is deleted before the
    new documents are uploaded.

    Args:
        docs_folder_path: Root folder holding the ``avap_language_github_docs``,
            ``developer.avapframework.com`` and ``samples`` sub-folders.
        es_request_timeout: Passed as ``request_timeout`` to the Elasticsearch client.
        es_max_retries: Passed as ``max_retries`` to the Elasticsearch client.
        es_retry_on_timeout: Passed as ``retry_on_timeout`` to the Elasticsearch client.
        distance_strategy: Its ``.value`` is passed as ``distance_strategy`` to
            ``ElasticsearchStore.from_documents``.
        chunk_size: Passed as ``chunk_size`` to ``SemanticChunker``.
        chunk_threshold: Passed as ``threshold`` to ``SemanticChunker``.
        chunk_similarity_window: Passed as ``similarity_window`` to ``SemanticChunker``.
        chunk_skip_window: Passed as ``skip_window`` to ``SemanticChunker``.

    Raises:
        Exception: Any error raised while creating the Elasticsearch client or
            the embeddings model is logged and re-raised unchanged.
    """
    web_docs_folder = f"{docs_folder_path}/developer.avapframework.com"

    logger.info("Starting Elasticsearch ingestion pipeline...")
    logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
    avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
    avap_web_docs_intro = read_files(web_docs_folder, "intro", concatenate=True)

    # Check chapters in developer.avapframework.com folder and read and
    # concatenate files for each chapter. The chapter id is the file-name part
    # before the first underscore; the set removes duplicates across files.
    chapters = sorted({
        p.name.split("_")[0]
        for p in Path(web_docs_folder).glob("chapter*.md")
    })

    # The trailing underscore in the prefix keeps e.g. "chapter1_" from also
    # matching "chapter10_..." files.
    avap_web_docs_chapters = [
        item
        for chapter in chapters
        for item in read_files(
            web_docs_folder,
            f"{chapter}_",
            concatenate=True,
        )
    ]

    avap_web_docs_appendices = read_files(web_docs_folder, "appendices_", concatenate=False)
    avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)

    logger.info("Instantiating semantic chunker...")
    chunker = SemanticChunker(
        embedding_model=HF_EMB_MODEL_NAME,
        chunk_size=chunk_size,
        threshold=chunk_threshold,
        similarity_window=chunk_similarity_window,
        skip_window=chunk_skip_window,
    )

    logger.info("Chunking AVAP GitHub docs...")
    avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)

    logger.info("Chunking AVAP web docs chapters...")
    avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)

    # Only the GitHub docs and the web chapters go through the chunker; the
    # intro, appendices and samples are converted to documents as read.
    logger.info("Creating Langchain Document to index...")
    avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
    avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
    avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
    avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
    avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
    avap_documents = (
        avap_github_langchain_docs
        + avap_web_chapters_langchain_docs
        + avap_web_intro_langchain_docs
        + avap_web_appendices_langchain_docs
        + avap_samples_langchain_docs
    )

    logger.info("Connecting to Elasticsearch...")
    try:
        es = Elasticsearch(
            ELASTICSEARCH_LOCAL_URL,
            request_timeout=es_request_timeout,
            max_retries=es_max_retries,
            retry_on_timeout=es_retry_on_timeout,
        )
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not logged as connection failures.
        logger.exception("Failed to connect to Elasticsearch.")
        raise

    logger.info("Instantiating embeddings model...")
    try:
        embeddings = create_embedding_model(
            provider="ollama",
            model=OLLAMA_EMB_MODEL_NAME,
            base_url=OLLAMA_LOCAL_URL,
        )
    except Exception:
        logger.exception("Failed to instantiate embeddings model.")
        raise

    # The index is rebuilt from scratch on every run.
    logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
    if es.indices.exists(index=ELASTICSEARCH_INDEX):
        es.indices.delete(index=ELASTICSEARCH_INDEX)

    logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
    ElasticsearchStore.from_documents(
        avap_documents,
        embeddings,
        client=es,
        index_name=ELASTICSEARCH_INDEX,
        distance_strategy=distance_strategy.value,
    )
    logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Configure the standard-library logging module. This is separate from
    # the loguru ``logger`` used by this script's own messages.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        app()
    except Exception as exc:
        # Log the failure with its traceback, then re-raise it unchanged.
        logger.exception(exc)
        raise
|
||||||
|
|
@ -0,0 +1,326 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f8ea7a75",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"import hashlib\n",
|
||||||
|
"from typing import Any\n",
|
||||||
|
"from enum import Enum\n",
|
||||||
|
"import typer\n",
|
||||||
|
"import logging\n",
|
||||||
|
"import os\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"\n",
|
||||||
|
"from loguru import logger\n",
|
||||||
|
"from elasticsearch import Elasticsearch\n",
|
||||||
|
"from langchain_core.documents import Document\n",
|
||||||
|
"from langchain_elasticsearch import ElasticsearchStore\n",
|
||||||
|
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
|
||||||
|
"from chonkie import SemanticChunker\n",
|
||||||
|
"\n",
|
||||||
|
"from src.utils.emb_factory import create_embedding_model\n",
|
||||||
|
"from scripts.pipelines.tasks.chunk import read_concat_files, get_chunk_docs, chunks_to_document\n",
|
||||||
|
"from src.config import PROJ_ROOT\n",
|
||||||
|
"\n",
|
||||||
|
"ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
|
||||||
|
"OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
|
||||||
|
"ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
|
||||||
|
"OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n",
|
||||||
|
"OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
|
||||||
|
"AVAP_WEB_DOCS_URL = os.getenv(\"AVAP_WEB_DOCS_URL\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a8b8de3f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2026-03-10 13:58:32.657\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 13:58:32.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"1\n",
|
||||||
|
"1\n",
|
||||||
|
"14\n",
|
||||||
|
"24\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs_folder_path = \"docs\"\n",
|
||||||
|
"\n",
|
||||||
|
"logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
|
||||||
|
"logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
|
||||||
|
"avap_github_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
|
||||||
|
"avap_web_docs_intro = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"intro\", concatenate=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter\n",
|
||||||
|
"chapters = sorted({\n",
|
||||||
|
" p.name.split(\"_\")[0]\n",
|
||||||
|
" for p in Path(f\"{docs_folder_path}/developer.avapframework.com\").glob(\"chapter*.md\")\n",
|
||||||
|
"})\n",
|
||||||
|
"\n",
|
||||||
|
"avap_web_docs_chapters = [\n",
|
||||||
|
" item\n",
|
||||||
|
" for chapter in chapters\n",
|
||||||
|
" for item in read_concat_files(\n",
|
||||||
|
" f\"{docs_folder_path}/developer.avapframework.com\",\n",
|
||||||
|
" f\"{chapter}_\",\n",
|
||||||
|
" concatenate=True\n",
|
||||||
|
" )\n",
|
||||||
|
"]\n",
|
||||||
|
"avap_web_docs_appendices = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"appendices_\", concatenate=False)\n",
|
||||||
|
"avap_examples_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
|
||||||
|
"\n",
|
||||||
|
"print(len(avap_github_docs))\n",
|
||||||
|
"print(len(avap_web_docs_intro))\n",
|
||||||
|
"print(len(avap_web_docs_chapters))\n",
|
||||||
|
"print(len(avap_web_docs_appendices))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "36abc025",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'content': 'nivel = 5\\nes_admin = nivel >= 10\\naddResult(es_admin)',\n",
|
||||||
|
" 'title': 'asignacion_booleana.avap'},\n",
|
||||||
|
" {'content': 'subtotal = 150.50\\niva = subtotal * 0.21\\ntotal = subtotal + iva\\naddResult(total)',\n",
|
||||||
|
" 'title': 'asignacion_matematica.avap'},\n",
|
||||||
|
" {'content': 'startLoop(i,1,10)\\n item = \"item_%s\" % i\\n AddvariableToJSON(item,\\'valor_generado\\',mi_json)\\nendLoop()\\naddResult(mi_json)',\n",
|
||||||
|
" 'title': 'bucle_1_10.avap'},\n",
|
||||||
|
" {'content': \"registros = ['1','2','3']\\ngetListLen(registros, total)\\ncontador = 0\\nstartLoop(idx, 0, 2)\\n actual = registros[int(idx)]\\nendLoop()\\naddResult(actual)\",\n",
|
||||||
|
" 'title': 'bucle_longitud_de_datos.avap'},\n",
|
||||||
|
" {'content': 'getDateTime(\"\", 86400, \"UTC\", expira)\\naddResult(expira)',\n",
|
||||||
|
" 'title': 'calculo_de_expiracion.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"client_id\", id_interno)\\naddResult(id_interno)',\n",
|
||||||
|
" 'title': 'captura_de_id.avap'},\n",
|
||||||
|
" {'content': 'addParam(emails,emails)\\ngetQueryParamList(lista_correos)\\naddResult(lista_correos)',\n",
|
||||||
|
" 'title': 'captura_de_listas_multiples.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"lang\", l)\\nif(l, \"es\", \"=\")\\n addVar(msg, \"Hola\")\\nend()\\naddResult(msg)',\n",
|
||||||
|
" 'title': 'comparacion_simple.avap'},\n",
|
||||||
|
" {'content': 'nombre = \"Sistema\"\\nlog = \"Evento registrado por: %s\" % nombre\\naddResult(log)',\n",
|
||||||
|
" 'title': 'concatenacion_dinamica.avap'},\n",
|
||||||
|
" {'content': 'datos_cliente = \"datos\"\\naddVar(clave, \"cliente_vip\")\\nAddvariableToJSON(clave, datos_cliente, mi_json_final)\\naddResult(mi_json_final)',\n",
|
||||||
|
" 'title': 'construccion_dinamica_de_objeto.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"data_list\", mi_lista)\\ngetListLen(mi_lista, cantidad)\\naddResult(cantidad)',\n",
|
||||||
|
" 'title': 'contador_de_parametros.avap'},\n",
|
||||||
|
" {'content': 'stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\\naddResult(fecha_human)',\n",
|
||||||
|
" 'title': 'conversion_timestamp_legible.avap'},\n",
|
||||||
|
" {'content': 'addParam(sal_par,saldo)\\nif(saldo, 0, \">\")\\n permitir = True\\nelse()\\n permitir = False\\nend()\\naddResult(permitir)',\n",
|
||||||
|
" 'title': 'else_estandar.avap'},\n",
|
||||||
|
" {'content': 'addParam(userrype, user_type)\\naddParam(sells, compras)\\nif(None, None, \" user_type == \\'VIP\\' or compras > 100\")\\n addVar(descuento, 0.20)\\nend()\\naddResult(descuento)',\n",
|
||||||
|
" 'title': 'expresion_compleja.avap'},\n",
|
||||||
|
" {'content': 'getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\\naddResult(sql_date)',\n",
|
||||||
|
" 'title': 'fecha_para_base_de_datos.avap'},\n",
|
||||||
|
" {'content': 'function suma(a, b){\\n total = a + b\\n return(total)\\n }\\nresultado = suma(10, 20)\\naddResult(resultado)',\n",
|
||||||
|
" 'title': 'funcion_de_suma.avap'},\n",
|
||||||
|
" {'content': 'function es_valido(token){\\n response = False\\n if(token, \"SECRET\", \"=\")\\n response = True\\n end()\\n return(response)\\n }\\nautorizado = es_valido(\"SECRET\")\\naddResult(autorizado)',\n",
|
||||||
|
" 'title': 'funcion_validacion_acceso.avap'},\n",
|
||||||
|
" {'content': 'randomString(\"[A-Z]\\\\d\", 32, token_seguridad)\\naddResult(token_seguridad)',\n",
|
||||||
|
" 'title': 'generador_de_tokens_aleatorios.avap'},\n",
|
||||||
|
" {'content': 'encodeSHA256(\"payload_data\", checksum)\\naddResult(checksum)',\n",
|
||||||
|
" 'title': 'hash_SHA256_para_integridad.avap'},\n",
|
||||||
|
" {'content': 'addVar(mensaje, \"Hola mundo desde AVAP\")\\naddResult(mensaje)',\n",
|
||||||
|
" 'title': 'hola_mundo.avap'},\n",
|
||||||
|
" {'content': 'addParam(password,pass_nueva)\\npass_antigua = \"password\"\\nif(pass_nueva, pass_antigua, \"!=\")\\n addVar(cambio, \"Contraseña actualizada\")\\nend()\\naddResult(cambio)',\n",
|
||||||
|
" 'title': 'if_desigualdad.avap'},\n",
|
||||||
|
" {'content': 'replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\\naddResult(ref_actualizada)',\n",
|
||||||
|
" 'title': 'limpieza_de_strings.avap'},\n",
|
||||||
|
" {'content': 'try()\\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\\nexception(e)\\n addVar(_status,500)\\n addResult(\"Error de base de datos\")',\n",
|
||||||
|
" 'title': 'manejo_error_sql_critico.avap'},\n",
|
||||||
|
" {'content': 'getDateTime(\"\", 0, \"UTC\", ahora)\\naddResult(ahora)',\n",
|
||||||
|
" 'title': 'obtencion_timestamp.avap'},\n",
|
||||||
|
" {'content': 'ormCheckTable(tabla_pruebas,resultado_comprobacion)\\nif(resultado_comprobacion,False,\\'==\\')\\n ormCreateTable(\"username,age\",\\'VARCHAR,INTEGER\\',tabla_pruebas,resultado_creacion)\\nend()\\naddResult(resultado_comprobacion)\\naddResult(resultado_creacion)',\n",
|
||||||
|
" 'title': 'ormAccessCreate.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"page\", p)\\naddParam(\"size\", s)\\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\\noffset = int(p) * int(s)\\nlimite = offset + int(s)\\ncontador = 0\\naddResult(offset)\\naddResult(limite)\\nstartLoop(i, 2, limite)\\n actual = registros[int(i)]\\n titulo = \"reg_%s\" % i\\n AddvariableToJSON(titulo, actual, pagina_json)\\nendLoop()\\naddResult(pagina_json)',\n",
|
||||||
|
" 'title': 'paginacion_dinamica_recursos.avap'},\n",
|
||||||
|
" {'content': 'addVar(base, 1000)\\naddVar(copia, $base)\\naddResult(copia)',\n",
|
||||||
|
" 'title': 'referencia_por_valor.avap'},\n",
|
||||||
|
" {'content': 'addVar(code, 200)\\naddVar(status, \"Success\")\\naddResult(code)\\naddResult(status)',\n",
|
||||||
|
" 'title': 'respuesta_multiple.avap'},\n",
|
||||||
|
" {'content': 'encontrado = False\\nstartLoop(i, 1, 10)\\n if(i, 5, \"==\")\\n encontrado = True\\n i = 11 \\n end()\\nendLoop()\\naddResult(encontrado)',\n",
|
||||||
|
" 'title': 'salida_bucle_correcta.avap'},\n",
|
||||||
|
" {'content': 'try()\\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta)\\nexception(e)\\n addVar(error_trace, \"Fallo de conexión: %s\" % e)\\n addResult(error_trace)',\n",
|
||||||
|
" 'title': 'try_catch_request.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"api_key\", key)\\nif(key, None, \"==\")\\n addVar(_status, 403)\\n addVar(error, \"Acceso denegado: falta API KEY\")\\n addResult(error)\\nend()',\n",
|
||||||
|
" 'title': 'validacion_de_nulo.avap'},\n",
|
||||||
|
" {'content': 'addParam(\"rol\", r)\\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\\n acceso = True\\nend()\\naddResult(acceso)',\n",
|
||||||
|
" 'title': 'validacion_in_pertenece_a_lista.avap'}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"avap_examples_docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "16a9e8ce",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "679e5f8c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "27e5774d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2026-03-10 13:58:34.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"logger.info(\"Chunking documents...\")\n",
|
||||||
|
"chunker = SemanticChunker(\n",
|
||||||
|
" embedding_model=os.getenv(\"HF_EMB_MODEL_NAME\"),\n",
|
||||||
|
" chunk_size=2048,\n",
|
||||||
|
" threshold=0.5,\n",
|
||||||
|
" skip_window=1\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "a5ce984e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[32m2026-03-10 13:58:51.740\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:00.535\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:00.536\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:09.128\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter1_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:12.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter2_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:42.995\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter3_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter4_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter5_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:01:48.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter6_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:02:06.408\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter7_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:02:21.501\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter8_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:07:27.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter9_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:07:48.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter10_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:08:10.823\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter11_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:08:27.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter12_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:08:55.010\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter13_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter14_\u001b[0m\n",
|
||||||
|
"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"logger.info(\"Chunking AVAP GitHub docs...\")\n",
|
||||||
|
"avap_github_docs_chunks = chunk_docs(avap_github_docs, chunker)\n",
|
||||||
|
"\n",
|
||||||
|
"logger.info(\"Chunking AVAP web docs chapters...\")\n",
|
||||||
|
"avap_web_docs_chapters_chunks = chunk_docs(avap_web_docs_chapters, chunker)\n",
|
||||||
|
"\n",
|
||||||
|
"logger.info(\"Creating Langchain Document to index...\")\n",
|
||||||
|
"avap_github_langchain_docs = chunks_to_document(avap_github_docs_chunks)\n",
|
||||||
|
"avap_web_chapters_langchain_docs = chunks_to_document(avap_web_docs_chapters_chunks)\n",
|
||||||
|
"avap_web_intro_langchain_docs = chunks_to_document(avap_web_docs_intro)\n",
|
||||||
|
"avap_web_appendices_langchain_docs = chunks_to_document(avap_web_docs_appendices)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "dd1f4d79",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"avap_github_langchain_docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c24e8a8f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"avap_web_chapters_langchain_docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f6782a34",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"avap_web_intro_langchain_docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "78c1190e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"avap_web_appendices_langchain_docs"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "assistance-engine",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|
@ -1,5 +1,10 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from chonkie import Chunk, SemanticChunker
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
|
||||||
def replace_javascript_with_avap(text: str) -> str:
|
def replace_javascript_with_avap(text: str) -> str:
|
||||||
|
|
@ -29,33 +34,103 @@ def replace_javascript_with_avap(text: str) -> str:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def read_concat_files(folder_path: str, file_prefix: str, concatenate: bool = True) -> str | list[str]:
|
def read_files(
|
||||||
|
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
||||||
|
) -> list[dict]:
|
||||||
"""
|
"""
|
||||||
Read and concatenate all files in a folder whose names start with a given prefix.
|
Read files in a folder whose names start with a given prefix.
|
||||||
Replaces javascript language markers with avap.
|
Replaces javascript language markers with avap.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
folder_path: Path to the folder to search in.
|
folder_path: Path to the folder to search in.
|
||||||
file_prefix: The prefix that file names must start with.
|
file_prefix: The prefix that file names must start with.
|
||||||
|
If None, all files in the folder are included.
|
||||||
concatenate: Whether to concatenate the contents of the files.
|
concatenate: Whether to concatenate the contents of the files.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A single string with the concatenated contents of all matching files,
|
A list of dictionaries, each containing 'content' and 'title' keys.
|
||||||
with javascript markers replaced with avap, or a list of strings if concatenate is False.
|
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
||||||
|
If concatenate is False, returns one dict per file with filename as title.
|
||||||
"""
|
"""
|
||||||
contents = []
|
contents = []
|
||||||
|
filenames = []
|
||||||
|
|
||||||
for filename in sorted(os.listdir(folder_path)):
|
for filename in sorted(os.listdir(folder_path)):
|
||||||
if filename.startswith(file_prefix):
|
include_file = file_prefix is None or filename.startswith(file_prefix)
|
||||||
|
if include_file:
|
||||||
file_path = os.path.join(folder_path, filename)
|
file_path = os.path.join(folder_path, filename)
|
||||||
if os.path.isfile(file_path):
|
if os.path.isfile(file_path):
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
if content.strip():
|
cleaned_content = content.strip()
|
||||||
print(f"Reading file: {filename}") # Skip empty files
|
if cleaned_content:
|
||||||
contents.append(content)
|
contents.append(cleaned_content)
|
||||||
|
filenames.append(filename)
|
||||||
|
|
||||||
if concatenate:
|
if concatenate:
|
||||||
concatenated = "\n".join(contents)
|
concatenated = "\n".join(contents)
|
||||||
return replace_javascript_with_avap(concatenated)
|
processed_content = replace_javascript_with_avap(concatenated)
|
||||||
|
title = file_prefix if file_prefix is not None else "all_files"
|
||||||
|
return [{"content": processed_content, "title": title}]
|
||||||
else:
|
else:
|
||||||
return [replace_javascript_with_avap(content) for content in contents]
|
return [
|
||||||
|
{"content": replace_javascript_with_avap(content), "title": filename}
|
||||||
|
for content, filename in zip(contents, filenames)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
||||||
|
"""
|
||||||
|
Chunk the content of the documents using the provided chunker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs: A list of dictionaries, each containing 'content' and 'title' keys.
|
||||||
|
chunker: An instance of SemanticChunker to use for chunking the content.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of lists of Chunk objects, where each inner list corresponds to the chunks of a
|
||||||
|
single document.
|
||||||
|
"""
|
||||||
|
list_chunks = []
|
||||||
|
|
||||||
|
for doc in docs:
|
||||||
|
content = doc["content"]
|
||||||
|
chunks = chunker.chunk(content)
|
||||||
|
for chunk in chunks:
|
||||||
|
chunk.context = {"source": doc["title"]}
|
||||||
|
list_chunks.append(chunks)
|
||||||
|
logger.info(f"Finished chunking {doc['title']}")
|
||||||
|
|
||||||
|
return list_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
||||||
|
"""
|
||||||
|
Convert the chunked content into a list of Document objects.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunks: A list of dictionaries containing 'content' and 'title' keys.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of Document objects created from the chunked content.
|
||||||
|
"""
|
||||||
|
documents = []
|
||||||
|
|
||||||
|
if isinstance(chunks[0], dict):
|
||||||
|
for chunk in chunks:
|
||||||
|
content = chunk["content"]
|
||||||
|
title = chunk["title"]
|
||||||
|
documents.append(Document(id=str(uuid.uuid4()),
|
||||||
|
page_content=content,
|
||||||
|
metadata={"source": title}))
|
||||||
|
|
||||||
|
else:
|
||||||
|
for chunk_list in chunks:
|
||||||
|
for chunk in chunk_list:
|
||||||
|
content = chunk.text
|
||||||
|
title = chunk.context.get("source", "unknown")
|
||||||
|
documents.append(Document(id=str(uuid.uuid4()),
|
||||||
|
page_content=content,
|
||||||
|
metadata={"source": title}))
|
||||||
|
|
||||||
|
return documents
|
||||||
174
uv.lock
174
uv.lock
|
|
@ -250,6 +250,7 @@ name = "assistance-engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
{ name = "chonkie", extra = ["semantic"] },
|
||||||
{ name = "grpcio" },
|
{ name = "grpcio" },
|
||||||
{ name = "grpcio-reflection" },
|
{ name = "grpcio-reflection" },
|
||||||
{ name = "grpcio-tools" },
|
{ name = "grpcio-tools" },
|
||||||
|
|
@ -281,11 +282,13 @@ dev = [
|
||||||
{ name = "polars" },
|
{ name = "polars" },
|
||||||
{ name = "ragas" },
|
{ name = "ragas" },
|
||||||
{ name = "ruff" },
|
{ name = "ruff" },
|
||||||
|
{ name = "selenium" },
|
||||||
{ name = "tree-sitter-language-pack" },
|
{ name = "tree-sitter-language-pack" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
|
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
||||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||||
|
|
@ -317,6 +320,7 @@ dev = [
|
||||||
{ name = "polars", specifier = ">=1.38.1" },
|
{ name = "polars", specifier = ">=1.38.1" },
|
||||||
{ name = "ragas", specifier = ">=0.4.3" },
|
{ name = "ragas", specifier = ">=0.4.3" },
|
||||||
{ name = "ruff", specifier = ">=0.15.1" },
|
{ name = "ruff", specifier = ">=0.15.1" },
|
||||||
|
{ name = "selenium", specifier = ">=4.41.0" },
|
||||||
{ name = "tree-sitter-language-pack", specifier = ">=0.13.0" },
|
{ name = "tree-sitter-language-pack", specifier = ">=0.13.0" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -589,6 +593,62 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
|
{ url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chonkie"
|
||||||
|
version = "1.5.6"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "chonkie-core" },
|
||||||
|
{ name = "numpy" },
|
||||||
|
{ name = "tenacity" },
|
||||||
|
{ name = "tqdm" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.optional-dependencies]
|
||||||
|
semantic = [
|
||||||
|
{ name = "model2vec" },
|
||||||
|
{ name = "tokenizers" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chonkie-core"
|
||||||
|
version = "0.9.2"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "numpy" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/55/f5/547c836f488dc74116ea42a2b2355365f4829fe6d925564f4db7775e6d34/chonkie_core-0.9.2.tar.gz", hash = "sha256:a34f457016fb4bedf9d0a62e55afc334670d88f8316d50ba9af8df83be78b56a", size = 49480, upload-time = "2026-01-21T09:09:46.265Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c3/27/8a1f0efc87ef5d99760a462d6c3b17e4e765c77f52a944d56b676a83adfd/chonkie_core-0.9.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9a1bfe98b00e6f70fe97bd342759d67856677056f0cb4193e95f79e561ca35d8", size = 350021, upload-time = "2026-01-21T09:09:12.614Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/fb/04/dd2c768f0bad729b2efe4be3999349ee7164092d5acfcbeba12234457191/chonkie_core-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9b6ef38eb3385e71d1c47a4a48e394813de3d25a62faa8d9ffd49156e8f31155", size = 336542, upload-time = "2026-01-21T09:09:13.486Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/41/0e/bdf2863380373efb3f6c43e3361616d99c12cabd9e37f67949803809068f/chonkie_core-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b40a3b33b381dc3543483b8ade4e4176280fa858f8690bdeb28acc082e1dc7e", size = 369363, upload-time = "2026-01-21T09:09:14.379Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/34/d2/7316952edf5d7a7788659bb6dba23438e04a0268f93b21fb731204ceee58/chonkie_core-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cac5919f384e38792c742c51979c2dac57ba1ce079361a1d782bdf7d43b66f30", size = 388981, upload-time = "2026-01-21T09:09:15.295Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/6c/1d/8265df0af95651cdca7a64dc6e57f3ad6c562966c53ccf03915a60440eb0/chonkie_core-0.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:67cdf9aceef31ce4e02ecd82c4491c2a36bb70cc9230466f32c7d6cdb039285a", size = 224450, upload-time = "2026-01-21T09:09:16.522Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/52/55/8825b059e70a3c757c90efa319e35312a2650431aef1cec11b476ee8699b/chonkie_core-0.9.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d6d11337842ca90713d8b48d42ce823bcc82874437d4071a8aced9d47b66ec76", size = 347854, upload-time = "2026-01-21T09:09:17.49Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/11/51/abac8676470c7e7a7967964eb9066e2efc346339c338da7190a41f412bba/chonkie_core-0.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:77fe2d6b9a709245408b53923dd4ebe1b79e09fdcdc5916df9c97e90c8e13eda", size = 333582, upload-time = "2026-01-21T09:09:18.863Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/d4/8c/f62d4ff0efbc08d8c281051ce1752cd6bcb6a7f3e816f8b3c143741d1b86/chonkie_core-0.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0151a74791445985f30bc34cbe7d679e9a716d36e9acf67ed5dc3408be6a426", size = 365189, upload-time = "2026-01-21T09:09:19.884Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/7a/f2/cae3bf4174e7d2b8f0c9fe76a341bed8dc48e30069683854ca536fbed5bd/chonkie_core-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb5ba84745a7daa32749fe8cbdf691428c2bd4cef14e6555db4ce382b2edef05", size = 385232, upload-time = "2026-01-21T09:09:21.088Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/fe/1b/18323d5a7fa3638e9c0aaf00cb1fb1b678546466debb3ad57a6adea9d686/chonkie_core-0.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:ee5093fd6a3f78163445bab5907cc6fd883ccea0514f8866abead0f059683d45", size = 222786, upload-time = "2026-01-21T09:09:21.919Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/28/ee/f45c8cb237e5a55eac366c9ac7a4a831329f6cf6f33401609063c1ed660d/chonkie_core-0.9.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6a658cd4fc5cb7c12bc6587246eb545f84d6aa25b86001a92fdcb191cea632c8", size = 347713, upload-time = "2026-01-21T09:09:22.809Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/f9/31/0049eb4366cef2171404166e8ff1f39ffe350d7d8921247d262dbb3d4d6c/chonkie_core-0.9.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ebf47e86be6603cbb940529aed6880655ac7f0bac232952565160fdbea5283d0", size = 333290, upload-time = "2026-01-21T09:09:23.786Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/03/d9/3a082faa359e3b24826547bdc725dc9af92b4180b262d3ca6872724cbfbb/chonkie_core-0.9.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64645ff2c299b953b4a1ff951d1492b4a2b461c624b20604ced5612a8622b030", size = 364600, upload-time = "2026-01-21T09:09:24.64Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/04/0b/b89aa90c4f44ce4d82effc064031016bb791979cfd6147c155548e706ef7/chonkie_core-0.9.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:418748eeea1e09c539fd1d7f7f91c1611765c9d82a523e4c4ae0ed9e30a16b2d", size = 384806, upload-time = "2026-01-21T09:09:25.566Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/3d/d9/a7f8577b5550a4323aa9eda16336669b8ad6e8a5ea0c176c9baa25738436/chonkie_core-0.9.2-cp313-cp313-win_amd64.whl", hash = "sha256:f3718af3037480023423125e3b4a490c8f4cbf6de38d652169a97dd8ba391953", size = 222393, upload-time = "2026-01-21T09:09:26.897Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/72/73/cf6a32cfa9238f19a1d539a1d8371b7d90e21e42458a43fbc949c6476871/chonkie_core-0.9.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66dc990ac58471fbd12845402726ca69b510602abab7c1c3e52cf8e21f9552e3", size = 363201, upload-time = "2026-01-21T09:09:27.764Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/1f/1f/56029d9a557e983cf71d22365b4229c4cfaf09401faa6cbc7e912cef2213/chonkie_core-0.9.2-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:325c0c853268fbdc37f4a65c3cde68fa56e3a25d164eee9512ceb41edc819902", size = 347292, upload-time = "2026-01-21T09:09:28.676Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/21/5b/08b8230d9264007cc7920cf1b1576f2ee1a1ef20d3cd5f8adb5e043e0908/chonkie_core-0.9.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:be9ea3bee05564a721f5c6c8699e1ad5996cf353b2faea2217a08ddee29e2de7", size = 332693, upload-time = "2026-01-21T09:09:29.541Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/eb/72/255c918da43a96c90b2bb96f1951a6ea1c513c18b36caecb6e9192275b83/chonkie_core-0.9.2-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:926db6c26e78b2d84dfb8422073a3f0f20478160ab48204f306fa35f3e1e95d7", size = 364578, upload-time = "2026-01-21T09:09:30.673Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/60/80/1710844b9706cd44324446eb368e813ebb4a085e96f469f54b61ddff67dc/chonkie_core-0.9.2-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c861f147e5932e659dcbe770ca0779a470acc62e9242bff87f36d03eae29644d", size = 384173, upload-time = "2026-01-21T09:09:32.015Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/5c/5c/a31b259ea94620ea4e4100ed4cc952ad770f2c7af36293bbc9154efb5c9e/chonkie_core-0.9.2-cp314-cp314-win_amd64.whl", hash = "sha256:83473c708a23652d6dc70142b2e586f965af3031b1d2a5c6336f1fc78614b452", size = 222275, upload-time = "2026-01-21T09:09:34.148Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/48/8a/c15c88f59bc9cf6f7ac994689d048fd60fcb72247f6b67ca31dc4eadf2f8/chonkie_core-0.9.2-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d6c0b89ee5e21d255bcf7d11f17b07ef811a71292717f50889244474cfab8bc", size = 363813, upload-time = "2026-01-21T09:09:35.175Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/44/20/b16b9896065d2bfa175b238a23faa03531b80706f15de85ae6e5701b51fd/chonkie_core-0.9.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e08b78b0e208310b8aa523bcd13a0cd655c90751211236a1de99693845b8826d", size = 371578, upload-time = "2026-01-21T09:09:43.44Z" },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/1b/c3/844844bbadbfb6727d0f9b286fcb1398fbf2984c1348e8a8238dee335113/chonkie_core-0.9.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6cf9a691cbbf32ef00c22eec66dd9f59b92067f6153c62b766ce24948a2ffd", size = 390277, upload-time = "2026-01-21T09:09:44.364Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "click"
|
name = "click"
|
||||||
version = "8.3.1"
|
version = "8.3.1"
|
||||||
|
|
@ -2434,6 +2494,25 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9b/f7/4a5e785ec9fbd65146a27b6b70b6cdc161a66f2024e4b04ac06a67f5578b/mistune-3.2.0-py3-none-any.whl", hash = "sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1", size = 53598, upload-time = "2025-12-23T11:36:33.211Z" },
|
{ url = "https://files.pythonhosted.org/packages/9b/f7/4a5e785ec9fbd65146a27b6b70b6cdc161a66f2024e4b04ac06a67f5578b/mistune-3.2.0-py3-none-any.whl", hash = "sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1", size = 53598, upload-time = "2025-12-23T11:36:33.211Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "model2vec"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "jinja2" },
|
||||||
|
{ name = "joblib" },
|
||||||
|
{ name = "numpy" },
|
||||||
|
{ name = "rich" },
|
||||||
|
{ name = "safetensors" },
|
||||||
|
{ name = "setuptools" },
|
||||||
|
{ name = "tokenizers" },
|
||||||
|
{ name = "tqdm" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/2a/73/03badf0a639cdad59db887928f17a187f4240f021f2d3656ef39795058b7/model2vec-0.7.0.tar.gz", hash = "sha256:bf5d0420615e356dd8046794a057bb4f13c50253c44f0d0d1f4441bb489a6ed3", size = 2785837, upload-time = "2025-10-05T06:28:02.482Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/bd/3d/473b1d960a2e27b4115c7599b5d47adf3a487c44798468375d408b5fb825/model2vec-0.7.0-py3-none-any.whl", hash = "sha256:f7a6ebf1b9ca384ba1158c4ecc9a2d450407f63526c0a26e268711071e280c27", size = 53038, upload-time = "2025-10-05T06:28:00.533Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mpmath"
|
name = "mpmath"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
|
|
@ -3159,6 +3238,18 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/73/cd/29cee6007bddf7a834e6cd6f536754c0535fcb939d384f0f37a38b1cddb8/ormsgpack-1.12.2-cp314-cp314t-win_amd64.whl", hash = "sha256:837dd316584485b72ef451d08dd3e96c4a11d12e4963aedb40e08f89685d8ec2", size = 117232, upload-time = "2026-01-18T20:55:45.448Z" },
|
{ url = "https://files.pythonhosted.org/packages/73/cd/29cee6007bddf7a834e6cd6f536754c0535fcb939d384f0f37a38b1cddb8/ormsgpack-1.12.2-cp314-cp314t-win_amd64.whl", hash = "sha256:837dd316584485b72ef451d08dd3e96c4a11d12e4963aedb40e08f89685d8ec2", size = 117232, upload-time = "2026-01-18T20:55:45.448Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "outcome"
|
||||||
|
version = "1.3.0.post0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "attrs" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060, upload-time = "2023-10-26T04:26:04.361Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692, upload-time = "2023-10-26T04:26:02.532Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "overrides"
|
name = "overrides"
|
||||||
version = "7.7.0"
|
version = "7.7.0"
|
||||||
|
|
@ -3809,6 +3900,15 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
|
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pysocks"
|
||||||
|
version = "1.7.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429, upload-time = "2019-09-20T02:07:35.714Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725, upload-time = "2019-09-20T02:06:22.938Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "python-dateutil"
|
name = "python-dateutil"
|
||||||
version = "2.9.0.post0"
|
version = "2.9.0.post0"
|
||||||
|
|
@ -4631,6 +4731,23 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
|
{ url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "selenium"
|
||||||
|
version = "4.41.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "certifi" },
|
||||||
|
{ name = "trio" },
|
||||||
|
{ name = "trio-websocket" },
|
||||||
|
{ name = "typing-extensions" },
|
||||||
|
{ name = "urllib3", extra = ["socks"] },
|
||||||
|
{ name = "websocket-client" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/04/7c/133d00d6d013a17d3f39199f27f1a780ec2e95d7b9aa997dc1b8ac2e62a7/selenium-4.41.0.tar.gz", hash = "sha256:003e971f805231ad63e671783a2b91a299355d10cefb9de964c36ff3819115aa", size = 937872, upload-time = "2026-02-20T03:42:06.216Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/a8/d6/e4160989ef6b272779af6f3e5c43c3ba9be6687bdc21c68c3fb220e555b3/selenium-4.41.0-py3-none-any.whl", hash = "sha256:b8ccde8d2e7642221ca64af184a92c19eee6accf2e27f20f30472f5efae18eb1", size = 9532858, upload-time = "2026-02-20T03:42:03.218Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "send2trash"
|
name = "send2trash"
|
||||||
version = "2.1.0"
|
version = "2.1.0"
|
||||||
|
|
@ -4787,6 +4904,15 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sortedcontainers"
|
||||||
|
version = "2.4.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "soupsieve"
|
name = "soupsieve"
|
||||||
version = "2.8.3"
|
version = "2.8.3"
|
||||||
|
|
@ -5257,6 +5383,37 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/d8/c7/dcf3ea1c4f5da9b10353b9af4455d756c92d728a8f58f03c480d3ef0ead5/tree_sitter_yaml-0.7.2-cp310-abi3-win_arm64.whl", hash = "sha256:f63c227b18e7ce7587bce124578f0bbf1f890ac63d3e3cd027417574273642c4", size = 44065, upload-time = "2025-10-07T14:40:35.337Z" },
|
{ url = "https://files.pythonhosted.org/packages/d8/c7/dcf3ea1c4f5da9b10353b9af4455d756c92d728a8f58f03c480d3ef0ead5/tree_sitter_yaml-0.7.2-cp310-abi3-win_arm64.whl", hash = "sha256:f63c227b18e7ce7587bce124578f0bbf1f890ac63d3e3cd027417574273642c4", size = 44065, upload-time = "2025-10-07T14:40:35.337Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "trio"
|
||||||
|
version = "0.33.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "attrs" },
|
||||||
|
{ name = "cffi", marker = "implementation_name != 'pypy' and os_name == 'nt'" },
|
||||||
|
{ name = "idna" },
|
||||||
|
{ name = "outcome" },
|
||||||
|
{ name = "sniffio" },
|
||||||
|
{ name = "sortedcontainers" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/52/b6/c744031c6f89b18b3f5f4f7338603ab381d740a7f45938c4607b2302481f/trio-0.33.0.tar.gz", hash = "sha256:a29b92b73f09d4b48ed249acd91073281a7f1063f09caba5dc70465b5c7aa970", size = 605109, upload-time = "2026-02-14T18:40:55.386Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/1c/93/dab25dc87ac48da0fe0f6419e07d0bfd98799bed4e05e7b9e0f85a1a4b4b/trio-0.33.0-py3-none-any.whl", hash = "sha256:3bd5d87f781d9b0192d592aef28691f8951d6c2e41b7e1da4c25cde6c180ae9b", size = 510294, upload-time = "2026-02-14T18:40:53.313Z" },
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "trio-websocket"
|
||||||
|
version = "0.12.2"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "outcome" },
|
||||||
|
{ name = "trio" },
|
||||||
|
{ name = "wsproto" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549, upload-time = "2025-02-25T05:16:58.947Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221, upload-time = "2025-02-25T05:16:57.545Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "triton"
|
name = "triton"
|
||||||
version = "3.6.0"
|
version = "3.6.0"
|
||||||
|
|
@ -5415,6 +5572,11 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
|
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[package.optional-dependencies]
|
||||||
|
socks = [
|
||||||
|
{ name = "pysocks" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "uuid-utils"
|
name = "uuid-utils"
|
||||||
version = "0.14.1"
|
version = "0.14.1"
|
||||||
|
|
@ -5801,6 +5963,18 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
|
{ url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wsproto"
|
||||||
|
version = "1.3.2"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "h11" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/c7/79/12135bdf8b9c9367b8701c2c19a14c913c120b882d50b014ca0d38083c2c/wsproto-1.3.2.tar.gz", hash = "sha256:b86885dcf294e15204919950f666e06ffc6c7c114ca900b060d6e16293528294", size = 50116, upload-time = "2025-11-20T18:18:01.871Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xxhash"
|
name = "xxhash"
|
||||||
version = "3.6.0"
|
version = "3.6.0"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue