feat(chunk): enhance file reading and processing logic

- Updated `read_files` function to return a list of dictionaries containing 'content' and 'title' keys.
- Added logic to handle concatenation of file contents and improved handling of file prefixes.
- Introduced `get_chunk_docs` function to chunk document contents using `SemanticChunker`.
- Added `convert_chunks_to_document` function to convert chunked content into `Document` objects.
- Integrated logging for chunking process.
- Updated dependencies in `uv.lock` to include `chonkie` and other related packages.
This commit is contained in:
acano 2026-03-10 14:36:09 +01:00
parent f6bfba5561
commit bf3c7f36d8
10 changed files with 3125 additions and 149 deletions

2305
docs/AVAP_dev.md Normal file

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,7 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"chonkie[semantic]>=1.5.6",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
"grpcio-tools>=1.78.0",
@ -36,5 +37,6 @@ dev = [
"polars>=1.38.1",
"ragas>=0.4.3",
"ruff>=0.15.1",
"selenium>=4.41.0",
"tree-sitter-language-pack>=0.13.0",
]

File diff suppressed because one or more lines are too long

View File

@ -1,59 +1,75 @@
"""Scrape the AVAP developer docs page with Selenium and export it as Markdown.

NOTE(review): this span interleaved two revisions of the script (a diff
rendered without +/- markers); this is the reconstructed new revision,
which renders the page content to `avap_docs.md`.
"""
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

url = "https://developer.avapframework.com/docs"

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)
    time.sleep(5)  # give the single-page app time to start rendering
    wait = WebDriverWait(driver, 15)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contentDoc")))
    html = driver.page_source
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")
# Main content container inside #contentDoc.
main_container = soup.select_one("#contentDoc .col-md-12")
if main_container is None:
    # Fail loudly instead of an AttributeError on find_all below.
    raise RuntimeError("Could not find '#contentDoc .col-md-12' in the page")


def html_to_markdown(elem):
    """Convert one HTML element into a Markdown fragment, or None if empty/unhandled."""
    text = elem.get_text(" ", strip=True)
    if not text:
        return None
    if elem.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
        # Heading level equals the digit in the tag name.
        level = int(elem.name[1])
        return f"{'#' * level} {text}\n"
    if elem.name == "p":
        return f"{text}\n"
    if elem.name == "li":
        return f"- {text}"
    if elem.name == "pre":
        code = elem.get_text("\n", strip=True)
        return f"\n```\n{code}\n```\n"
    return None


markdown_lines = []
tags_to_extract = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "pre"]
# Walk the container in document order so the Markdown preserves page layout.
for elem in main_container.find_all(tags_to_extract):
    md = html_to_markdown(elem)
    if md:
        markdown_lines.append(md)

markdown = "\n".join(markdown_lines)
with open("avap_docs.md", "w", encoding="utf-8") as f:
    f.write(markdown)

View File

@ -7,12 +7,16 @@ import logging
import os
from pathlib import Path
from loguru import logger
from elasticsearch import Elasticsearch
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore
from src.utils.emb_factory import create_embedding_model
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from src.utils.emb_factory import create_embedding_model
from scripts.pipelines.tasks.chunk import scrape_avap_docs
logger = logging.getLogger(__name__)
app = typer.Typer()
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
@ -20,6 +24,7 @@ OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
OLLAMA_URL = os.getenv("OLLAMA_URL")
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
class DistanceStrategy(str, Enum):
euclidean = "EUCLIDEAN_DISTANCE"

View File

@ -0,0 +1,143 @@
from enum import Enum
import typer
import logging
import os
from pathlib import Path
from loguru import logger
from elasticsearch import Elasticsearch
from langchain_elasticsearch import ElasticsearchStore
from chonkie import SemanticChunker
from src.utils.emb_factory import create_embedding_model
from scripts.pipelines.tasks.chunk import (
read_files,
get_chunk_docs,
convert_chunks_to_document
)
app = typer.Typer()
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
OLLAMA_URL = os.getenv("OLLAMA_URL")
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
class DistanceStrategy(str, Enum):
    """Vector distance strategies selectable from the CLI.

    The string value is forwarded to ``ElasticsearchStore.from_documents``
    via ``.value`` — presumably these names match langchain's
    DistanceStrategy constants; verify against the langchain docs.
    """
    euclidean = "EUCLIDEAN_DISTANCE"
    max_inner_product = "MAX_INNER_PRODUCT"
    dot_product = "DOT_PRODUCT"
    jaccard = "JACCARD"
    cosine = "COSINE"
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
    chunk_size: int = 2048,
    chunk_threshold: float = 0.5,
    chunk_similarity_window: int = 3,
    chunk_skip_window: int = 1,
):
    """Read AVAP docs from disk, chunk them semantically, and (re)index them
    into Elasticsearch.

    Args:
        docs_folder_path: Root folder containing the docs subfolders.
        es_request_timeout: Per-request timeout (seconds) for the ES client.
        es_max_retries: Maximum retries for failed ES requests.
        es_retry_on_timeout: Whether the ES client retries timed-out requests.
        distance_strategy: Vector distance metric used by the ES store.
        chunk_size: Maximum chunk size passed to the semantic chunker.
        chunk_threshold: Similarity threshold for chunk boundaries.
        chunk_similarity_window: Similarity window passed to the chunker.
        chunk_skip_window: Skip window passed to the chunker.

    Raises:
        Exception: Re-raised (after logging) if the Elasticsearch connection
            or the embedding-model instantiation fails.
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")
    logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
    avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
    avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)

    # Discover the chapter prefixes (chapter1, chapter2, ...) present in the
    # folder, then read-and-concatenate the files of each chapter separately.
    chapters = sorted({
        p.name.split("_")[0]
        for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
    })
    avap_web_docs_chapters = [
        item
        for chapter in chapters
        for item in read_files(
            f"{docs_folder_path}/developer.avapframework.com",
            f"{chapter}_",
            concatenate=True,
        )
    ]
    avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
    avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)

    logger.info("Instantiating semantic chunker...")
    chunker = SemanticChunker(
        embedding_model=HF_EMB_MODEL_NAME,
        chunk_size=chunk_size,
        threshold=chunk_threshold,
        similarity_window=chunk_similarity_window,
        skip_window=chunk_skip_window,
    )

    logger.info("Chunking AVAP GitHub docs...")
    avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
    logger.info("Chunking AVAP web docs chapters...")
    avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)

    logger.info("Creating Langchain Document to index...")
    avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
    avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
    avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
    avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
    avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
    avap_documents = (
        avap_github_langchain_docs
        + avap_web_chapters_langchain_docs
        + avap_web_intro_langchain_docs
        + avap_web_appendices_langchain_docs
        + avap_samples_langchain_docs
    )

    logger.info("Connecting to Elasticsearch...")
    try:
        es = Elasticsearch(
            ELASTICSEARCH_LOCAL_URL,
            request_timeout=es_request_timeout,
            max_retries=es_max_retries,
            retry_on_timeout=es_retry_on_timeout,
        )
    # Was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
    except Exception:
        logger.exception("Failed to connect to Elasticsearch.")
        raise

    logger.info("Instantiating embeddings model...")
    try:
        embeddings = create_embedding_model(
            provider="ollama",
            model=OLLAMA_EMB_MODEL_NAME,
            base_url=OLLAMA_LOCAL_URL,
        )
    # Was a bare `except:` — narrowed to Exception.
    except Exception:
        logger.exception("Failed to instantiate embeddings model.")
        raise

    # Recreate the index from scratch so re-runs don't leave stale documents.
    logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
    if es.indices.exists(index=ELASTICSEARCH_INDEX):
        es.indices.delete(index=ELASTICSEARCH_INDEX)

    logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
    ElasticsearchStore.from_documents(
        avap_documents,
        embeddings,
        client=es,
        index_name=ELASTICSEARCH_INDEX,
        distance_strategy=distance_strategy.value,
    )
    logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
if __name__ == "__main__":
    # Configure stdlib logging before handing control to the typer app.
    log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    try:
        app()
    except Exception as err:
        # Record the full traceback, then propagate a non-zero exit.
        logger.exception(err)
        raise

View File

@ -0,0 +1,326 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f8ea7a75",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import hashlib\n",
"from typing import Any\n",
"from enum import Enum\n",
"import typer\n",
"import logging\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"from loguru import logger\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from chonkie import SemanticChunker\n",
"\n",
"from src.utils.emb_factory import create_embedding_model\n",
"from scripts.pipelines.tasks.chunk import read_concat_files, get_chunk_docs, chunks_to_document\n",
"from src.config import PROJ_ROOT\n",
"\n",
"ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
"OLLAMA_LOCAL_URL = os.getenv(\"OLLAMA_LOCAL_URL\")\n",
"ELASTICSEARCH_INDEX = os.getenv(\"ELASTICSEARCH_INDEX\")\n",
"OLLAMA_URL = os.getenv(\"OLLAMA_URL\")\n",
"OLLAMA_EMB_MODEL_NAME = os.getenv(\"OLLAMA_EMB_MODEL_NAME\")\n",
"AVAP_WEB_DOCS_URL = os.getenv(\"AVAP_WEB_DOCS_URL\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8b8de3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 13:58:32.657\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n",
"\u001b[32m2026-03-10 13:58:32.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"1\n",
"14\n",
"24\n"
]
}
],
"source": [
"docs_folder_path = \"docs\"\n",
"\n",
"logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
"logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
"avap_github_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
"avap_web_docs_intro = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"intro\", concatenate=True)\n",
"\n",
"# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter\n",
"chapters = sorted({\n",
" p.name.split(\"_\")[0]\n",
" for p in Path(f\"{docs_folder_path}/developer.avapframework.com\").glob(\"chapter*.md\")\n",
"})\n",
"\n",
"avap_web_docs_chapters = [\n",
" item\n",
" for chapter in chapters\n",
" for item in read_concat_files(\n",
" f\"{docs_folder_path}/developer.avapframework.com\",\n",
" f\"{chapter}_\",\n",
" concatenate=True\n",
" )\n",
"]\n",
"avap_web_docs_appendices = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"appendices_\", concatenate=False)\n",
"avap_examples_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
"\n",
"print(len(avap_github_docs))\n",
"print(len(avap_web_docs_intro))\n",
"print(len(avap_web_docs_chapters))\n",
"print(len(avap_web_docs_appendices))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "36abc025",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'content': 'nivel = 5\\nes_admin = nivel >= 10\\naddResult(es_admin)',\n",
" 'title': 'asignacion_booleana.avap'},\n",
" {'content': 'subtotal = 150.50\\niva = subtotal * 0.21\\ntotal = subtotal + iva\\naddResult(total)',\n",
" 'title': 'asignacion_matematica.avap'},\n",
" {'content': 'startLoop(i,1,10)\\n item = \"item_%s\" % i\\n AddvariableToJSON(item,\\'valor_generado\\',mi_json)\\nendLoop()\\naddResult(mi_json)',\n",
" 'title': 'bucle_1_10.avap'},\n",
" {'content': \"registros = ['1','2','3']\\ngetListLen(registros, total)\\ncontador = 0\\nstartLoop(idx, 0, 2)\\n actual = registros[int(idx)]\\nendLoop()\\naddResult(actual)\",\n",
" 'title': 'bucle_longitud_de_datos.avap'},\n",
" {'content': 'getDateTime(\"\", 86400, \"UTC\", expira)\\naddResult(expira)',\n",
" 'title': 'calculo_de_expiracion.avap'},\n",
" {'content': 'addParam(\"client_id\", id_interno)\\naddResult(id_interno)',\n",
" 'title': 'captura_de_id.avap'},\n",
" {'content': 'addParam(emails,emails)\\ngetQueryParamList(lista_correos)\\naddResult(lista_correos)',\n",
" 'title': 'captura_de_listas_multiples.avap'},\n",
" {'content': 'addParam(\"lang\", l)\\nif(l, \"es\", \"=\")\\n addVar(msg, \"Hola\")\\nend()\\naddResult(msg)',\n",
" 'title': 'comparacion_simple.avap'},\n",
" {'content': 'nombre = \"Sistema\"\\nlog = \"Evento registrado por: %s\" % nombre\\naddResult(log)',\n",
" 'title': 'concatenacion_dinamica.avap'},\n",
" {'content': 'datos_cliente = \"datos\"\\naddVar(clave, \"cliente_vip\")\\nAddvariableToJSON(clave, datos_cliente, mi_json_final)\\naddResult(mi_json_final)',\n",
" 'title': 'construccion_dinamica_de_objeto.avap'},\n",
" {'content': 'addParam(\"data_list\", mi_lista)\\ngetListLen(mi_lista, cantidad)\\naddResult(cantidad)',\n",
" 'title': 'contador_de_parametros.avap'},\n",
" {'content': 'stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\\naddResult(fecha_human)',\n",
" 'title': 'conversion_timestamp_legible.avap'},\n",
" {'content': 'addParam(sal_par,saldo)\\nif(saldo, 0, \">\")\\n permitir = True\\nelse()\\n permitir = False\\nend()\\naddResult(permitir)',\n",
" 'title': 'else_estandar.avap'},\n",
" {'content': 'addParam(userrype, user_type)\\naddParam(sells, compras)\\nif(None, None, \" user_type == \\'VIP\\' or compras > 100\")\\n addVar(descuento, 0.20)\\nend()\\naddResult(descuento)',\n",
" 'title': 'expresion_compleja.avap'},\n",
" {'content': 'getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\\naddResult(sql_date)',\n",
" 'title': 'fecha_para_base_de_datos.avap'},\n",
" {'content': 'function suma(a, b){\\n total = a + b\\n return(total)\\n }\\nresultado = suma(10, 20)\\naddResult(resultado)',\n",
" 'title': 'funcion_de_suma.avap'},\n",
" {'content': 'function es_valido(token){\\n response = False\\n if(token, \"SECRET\", \"=\")\\n response = True\\n end()\\n return(response)\\n }\\nautorizado = es_valido(\"SECRET\")\\naddResult(autorizado)',\n",
" 'title': 'funcion_validacion_acceso.avap'},\n",
" {'content': 'randomString(\"[A-Z]\\\\d\", 32, token_seguridad)\\naddResult(token_seguridad)',\n",
" 'title': 'generador_de_tokens_aleatorios.avap'},\n",
" {'content': 'encodeSHA256(\"payload_data\", checksum)\\naddResult(checksum)',\n",
" 'title': 'hash_SHA256_para_integridad.avap'},\n",
" {'content': 'addVar(mensaje, \"Hola mundo desde AVAP\")\\naddResult(mensaje)',\n",
" 'title': 'hola_mundo.avap'},\n",
" {'content': 'addParam(password,pass_nueva)\\npass_antigua = \"password\"\\nif(pass_nueva, pass_antigua, \"!=\")\\n addVar(cambio, \"Contraseña actualizada\")\\nend()\\naddResult(cambio)',\n",
" 'title': 'if_desigualdad.avap'},\n",
" {'content': 'replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\\naddResult(ref_actualizada)',\n",
" 'title': 'limpieza_de_strings.avap'},\n",
" {'content': 'try()\\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\\nexception(e)\\n addVar(_status,500)\\n addResult(\"Error de base de datos\")',\n",
" 'title': 'manejo_error_sql_critico.avap'},\n",
" {'content': 'getDateTime(\"\", 0, \"UTC\", ahora)\\naddResult(ahora)',\n",
" 'title': 'obtencion_timestamp.avap'},\n",
" {'content': 'ormCheckTable(tabla_pruebas,resultado_comprobacion)\\nif(resultado_comprobacion,False,\\'==\\')\\n ormCreateTable(\"username,age\",\\'VARCHAR,INTEGER\\',tabla_pruebas,resultado_creacion)\\nend()\\naddResult(resultado_comprobacion)\\naddResult(resultado_creacion)',\n",
" 'title': 'ormAccessCreate.avap'},\n",
" {'content': 'addParam(\"page\", p)\\naddParam(\"size\", s)\\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\\noffset = int(p) * int(s)\\nlimite = offset + int(s)\\ncontador = 0\\naddResult(offset)\\naddResult(limite)\\nstartLoop(i, 2, limite)\\n actual = registros[int(i)]\\n titulo = \"reg_%s\" % i\\n AddvariableToJSON(titulo, actual, pagina_json)\\nendLoop()\\naddResult(pagina_json)',\n",
" 'title': 'paginacion_dinamica_recursos.avap'},\n",
" {'content': 'addVar(base, 1000)\\naddVar(copia, $base)\\naddResult(copia)',\n",
" 'title': 'referencia_por_valor.avap'},\n",
" {'content': 'addVar(code, 200)\\naddVar(status, \"Success\")\\naddResult(code)\\naddResult(status)',\n",
" 'title': 'respuesta_multiple.avap'},\n",
" {'content': 'encontrado = False\\nstartLoop(i, 1, 10)\\n if(i, 5, \"==\")\\n encontrado = True\\n i = 11 \\n end()\\nendLoop()\\naddResult(encontrado)',\n",
" 'title': 'salida_bucle_correcta.avap'},\n",
" {'content': 'try()\\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta)\\nexception(e)\\n addVar(error_trace, \"Fallo de conexión: %s\" % e)\\n addResult(error_trace)',\n",
" 'title': 'try_catch_request.avap'},\n",
" {'content': 'addParam(\"api_key\", key)\\nif(key, None, \"==\")\\n addVar(_status, 403)\\n addVar(error, \"Acceso denegado: falta API KEY\")\\n addResult(error)\\nend()',\n",
" 'title': 'validacion_de_nulo.avap'},\n",
" {'content': 'addParam(\"rol\", r)\\nif(r, [\"admin\", \"editor\", \"root\"], \"in\")\\n acceso = True\\nend()\\naddResult(acceso)',\n",
" 'title': 'validacion_in_pertenece_a_lista.avap'}]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avap_examples_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16a9e8ce",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "679e5f8c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "27e5774d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 13:58:34.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n"
]
}
],
"source": [
"logger.info(\"Chunking documents...\")\n",
"chunker = SemanticChunker(\n",
" embedding_model=os.getenv(\"HF_EMB_MODEL_NAME\"),\n",
" chunk_size=2048,\n",
" threshold=0.5,\n",
" skip_window=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a5ce984e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2026-03-10 13:58:51.740\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:00.535\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:00.536\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:09.128\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter1_\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:12.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter2_\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:42.995\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter3_\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter4_\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter5_\u001b[0m\n",
"\u001b[32m2026-03-10 14:01:48.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter6_\u001b[0m\n",
"\u001b[32m2026-03-10 14:02:06.408\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter7_\u001b[0m\n",
"\u001b[32m2026-03-10 14:02:21.501\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter8_\u001b[0m\n",
"\u001b[32m2026-03-10 14:07:27.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter9_\u001b[0m\n",
"\u001b[32m2026-03-10 14:07:48.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter10_\u001b[0m\n",
"\u001b[32m2026-03-10 14:08:10.823\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter11_\u001b[0m\n",
"\u001b[32m2026-03-10 14:08:27.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter12_\u001b[0m\n",
"\u001b[32m2026-03-10 14:08:55.010\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter13_\u001b[0m\n",
"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter14_\u001b[0m\n",
"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n"
]
}
],
"source": [
"logger.info(\"Chunking AVAP GitHub docs...\")\n",
"avap_github_docs_chunks = chunk_docs(avap_github_docs, chunker)\n",
"\n",
"logger.info(\"Chunking AVAP web docs chapters...\")\n",
"avap_web_docs_chapters_chunks = chunk_docs(avap_web_docs_chapters, chunker)\n",
"\n",
"logger.info(\"Creating Langchain Document to index...\")\n",
"avap_github_langchain_docs = chunks_to_document(avap_github_docs_chunks)\n",
"avap_web_chapters_langchain_docs = chunks_to_document(avap_web_docs_chapters_chunks)\n",
"avap_web_intro_langchain_docs = chunks_to_document(avap_web_docs_intro)\n",
"avap_web_appendices_langchain_docs = chunks_to_document(avap_web_docs_appendices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd1f4d79",
"metadata": {},
"outputs": [],
"source": [
"avap_github_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c24e8a8f",
"metadata": {},
"outputs": [],
"source": [
"avap_web_chapters_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6782a34",
"metadata": {},
"outputs": [],
"source": [
"avap_web_intro_langchain_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78c1190e",
"metadata": {},
"outputs": [],
"source": [
"avap_web_appendices_langchain_docs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,5 +1,10 @@
import os
import re
import uuid
from loguru import logger
from chonkie import Chunk, SemanticChunker
from langchain_core.documents import Document
def replace_javascript_with_avap(text: str) -> str:
@ -29,33 +34,103 @@ def replace_javascript_with_avap(text: str) -> str:
return text
def read_files(
    folder_path: str, file_prefix: str | None = None, concatenate: bool = True
) -> list[dict]:
    """Read files in a folder whose names start with a given prefix.

    Replaces javascript language markers with avap. Empty files are skipped.

    NOTE(review): this span interleaved the old ``read_concat_files`` and the
    new ``read_files`` revisions (diff without markers); this is the
    reconstructed new revision.

    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
            If None, all files in the folder are included.
        concatenate: Whether to concatenate the contents of the files.

    Returns:
        A list of dictionaries, each containing 'content' and 'title' keys.
        If concatenate is True, returns a single dict with the concatenated
        content and the prefix (or 'all_files') as title.
        If concatenate is False, returns one dict per file with the filename
        as title.
    """
    contents = []
    filenames = []
    # Sorted listing keeps chapter/appendix ordering deterministic.
    for filename in sorted(os.listdir(folder_path)):
        include_file = file_prefix is None or filename.startswith(file_prefix)
        if include_file:
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                cleaned_content = content.strip()
                if cleaned_content:  # skip empty files
                    contents.append(cleaned_content)
                    filenames.append(filename)
    if concatenate:
        concatenated = "\n".join(contents)
        processed_content = replace_javascript_with_avap(concatenated)
        title = file_prefix if file_prefix is not None else "all_files"
        return [{"content": processed_content, "title": title}]
    else:
        return [
            {"content": replace_javascript_with_avap(content), "title": filename}
            for content, filename in zip(contents, filenames)
        ]
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
    """Run the semantic chunker over each document's content.

    Args:
        docs: Dictionaries with 'content' and 'title' keys.
        chunker: SemanticChunker instance used to split the content.

    Returns:
        One list of Chunk objects per input document, in input order. Each
        chunk carries its source title in ``chunk.context``.
    """
    per_doc_chunks = []
    for doc in docs:
        title = doc["title"]
        doc_chunks = chunker.chunk(doc["content"])
        # Tag every chunk with its originating document so the source
        # survives into downstream Document metadata.
        for piece in doc_chunks:
            piece.context = {"source": title}
        per_doc_chunks.append(doc_chunks)
        logger.info(f"Finished chunking {doc['title']}")
    return per_doc_chunks
def convert_chunks_to_document(
    chunks: "list[dict] | list[list[Chunk]]",
) -> "list[Document]":
    """Convert chunked content into a list of langchain Document objects.

    Args:
        chunks: Either a flat list of dicts with 'content' and 'title' keys
            (documents that were not semantically chunked), or a list of
            per-document Chunk lists as produced by ``get_chunk_docs``.

    Returns:
        One Document per chunk/dict, each with a fresh UUID id and the
        source title in metadata. An empty input yields an empty list.
    """
    # Guard: the original indexed chunks[0] unconditionally, raising
    # IndexError on an empty input (e.g. an empty docs folder).
    if not chunks:
        return []
    documents = []
    if isinstance(chunks[0], dict):
        for chunk in chunks:
            documents.append(
                Document(
                    id=str(uuid.uuid4()),
                    page_content=chunk["content"],
                    metadata={"source": chunk["title"]},
                )
            )
    else:
        for chunk_list in chunks:
            for chunk in chunk_list:
                documents.append(
                    Document(
                        id=str(uuid.uuid4()),
                        page_content=chunk.text,
                        # Fall back to 'unknown' if a chunk was never tagged.
                        metadata={"source": chunk.context.get("source", "unknown")},
                    )
                )
    return documents

174
uv.lock
View File

@ -250,6 +250,7 @@ name = "assistance-engine"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "chonkie", extra = ["semantic"] },
{ name = "grpcio" },
{ name = "grpcio-reflection" },
{ name = "grpcio-tools" },
@ -281,11 +282,13 @@ dev = [
{ name = "polars" },
{ name = "ragas" },
{ name = "ruff" },
{ name = "selenium" },
{ name = "tree-sitter-language-pack" },
]
[package.metadata]
requires-dist = [
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
{ name = "grpcio", specifier = ">=1.78.0" },
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
{ name = "grpcio-tools", specifier = ">=1.78.0" },
@ -317,6 +320,7 @@ dev = [
{ name = "polars", specifier = ">=1.38.1" },
{ name = "ragas", specifier = ">=0.4.3" },
{ name = "ruff", specifier = ">=0.15.1" },
{ name = "selenium", specifier = ">=4.41.0" },
{ name = "tree-sitter-language-pack", specifier = ">=0.13.0" },
]
@ -589,6 +593,62 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
]
[[package]]
name = "chonkie"
version = "1.5.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "chonkie-core" },
{ name = "numpy" },
{ name = "tenacity" },
{ name = "tqdm" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
]
[package.optional-dependencies]
semantic = [
{ name = "model2vec" },
{ name = "tokenizers" },
]
[[package]]
name = "chonkie-core"
version = "0.9.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/55/f5/547c836f488dc74116ea42a2b2355365f4829fe6d925564f4db7775e6d34/chonkie_core-0.9.2.tar.gz", hash = "sha256:a34f457016fb4bedf9d0a62e55afc334670d88f8316d50ba9af8df83be78b56a", size = 49480, upload-time = "2026-01-21T09:09:46.265Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/27/8a1f0efc87ef5d99760a462d6c3b17e4e765c77f52a944d56b676a83adfd/chonkie_core-0.9.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9a1bfe98b00e6f70fe97bd342759d67856677056f0cb4193e95f79e561ca35d8", size = 350021, upload-time = "2026-01-21T09:09:12.614Z" },
{ url = "https://files.pythonhosted.org/packages/fb/04/dd2c768f0bad729b2efe4be3999349ee7164092d5acfcbeba12234457191/chonkie_core-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9b6ef38eb3385e71d1c47a4a48e394813de3d25a62faa8d9ffd49156e8f31155", size = 336542, upload-time = "2026-01-21T09:09:13.486Z" },
{ url = "https://files.pythonhosted.org/packages/41/0e/bdf2863380373efb3f6c43e3361616d99c12cabd9e37f67949803809068f/chonkie_core-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b40a3b33b381dc3543483b8ade4e4176280fa858f8690bdeb28acc082e1dc7e", size = 369363, upload-time = "2026-01-21T09:09:14.379Z" },
{ url = "https://files.pythonhosted.org/packages/34/d2/7316952edf5d7a7788659bb6dba23438e04a0268f93b21fb731204ceee58/chonkie_core-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cac5919f384e38792c742c51979c2dac57ba1ce079361a1d782bdf7d43b66f30", size = 388981, upload-time = "2026-01-21T09:09:15.295Z" },
{ url = "https://files.pythonhosted.org/packages/6c/1d/8265df0af95651cdca7a64dc6e57f3ad6c562966c53ccf03915a60440eb0/chonkie_core-0.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:67cdf9aceef31ce4e02ecd82c4491c2a36bb70cc9230466f32c7d6cdb039285a", size = 224450, upload-time = "2026-01-21T09:09:16.522Z" },
{ url = "https://files.pythonhosted.org/packages/52/55/8825b059e70a3c757c90efa319e35312a2650431aef1cec11b476ee8699b/chonkie_core-0.9.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d6d11337842ca90713d8b48d42ce823bcc82874437d4071a8aced9d47b66ec76", size = 347854, upload-time = "2026-01-21T09:09:17.49Z" },
{ url = "https://files.pythonhosted.org/packages/11/51/abac8676470c7e7a7967964eb9066e2efc346339c338da7190a41f412bba/chonkie_core-0.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:77fe2d6b9a709245408b53923dd4ebe1b79e09fdcdc5916df9c97e90c8e13eda", size = 333582, upload-time = "2026-01-21T09:09:18.863Z" },
{ url = "https://files.pythonhosted.org/packages/d4/8c/f62d4ff0efbc08d8c281051ce1752cd6bcb6a7f3e816f8b3c143741d1b86/chonkie_core-0.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0151a74791445985f30bc34cbe7d679e9a716d36e9acf67ed5dc3408be6a426", size = 365189, upload-time = "2026-01-21T09:09:19.884Z" },
{ url = "https://files.pythonhosted.org/packages/7a/f2/cae3bf4174e7d2b8f0c9fe76a341bed8dc48e30069683854ca536fbed5bd/chonkie_core-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb5ba84745a7daa32749fe8cbdf691428c2bd4cef14e6555db4ce382b2edef05", size = 385232, upload-time = "2026-01-21T09:09:21.088Z" },
{ url = "https://files.pythonhosted.org/packages/fe/1b/18323d5a7fa3638e9c0aaf00cb1fb1b678546466debb3ad57a6adea9d686/chonkie_core-0.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:ee5093fd6a3f78163445bab5907cc6fd883ccea0514f8866abead0f059683d45", size = 222786, upload-time = "2026-01-21T09:09:21.919Z" },
{ url = "https://files.pythonhosted.org/packages/28/ee/f45c8cb237e5a55eac366c9ac7a4a831329f6cf6f33401609063c1ed660d/chonkie_core-0.9.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6a658cd4fc5cb7c12bc6587246eb545f84d6aa25b86001a92fdcb191cea632c8", size = 347713, upload-time = "2026-01-21T09:09:22.809Z" },
{ url = "https://files.pythonhosted.org/packages/f9/31/0049eb4366cef2171404166e8ff1f39ffe350d7d8921247d262dbb3d4d6c/chonkie_core-0.9.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ebf47e86be6603cbb940529aed6880655ac7f0bac232952565160fdbea5283d0", size = 333290, upload-time = "2026-01-21T09:09:23.786Z" },
{ url = "https://files.pythonhosted.org/packages/03/d9/3a082faa359e3b24826547bdc725dc9af92b4180b262d3ca6872724cbfbb/chonkie_core-0.9.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64645ff2c299b953b4a1ff951d1492b4a2b461c624b20604ced5612a8622b030", size = 364600, upload-time = "2026-01-21T09:09:24.64Z" },
{ url = "https://files.pythonhosted.org/packages/04/0b/b89aa90c4f44ce4d82effc064031016bb791979cfd6147c155548e706ef7/chonkie_core-0.9.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:418748eeea1e09c539fd1d7f7f91c1611765c9d82a523e4c4ae0ed9e30a16b2d", size = 384806, upload-time = "2026-01-21T09:09:25.566Z" },
{ url = "https://files.pythonhosted.org/packages/3d/d9/a7f8577b5550a4323aa9eda16336669b8ad6e8a5ea0c176c9baa25738436/chonkie_core-0.9.2-cp313-cp313-win_amd64.whl", hash = "sha256:f3718af3037480023423125e3b4a490c8f4cbf6de38d652169a97dd8ba391953", size = 222393, upload-time = "2026-01-21T09:09:26.897Z" },
{ url = "https://files.pythonhosted.org/packages/72/73/cf6a32cfa9238f19a1d539a1d8371b7d90e21e42458a43fbc949c6476871/chonkie_core-0.9.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66dc990ac58471fbd12845402726ca69b510602abab7c1c3e52cf8e21f9552e3", size = 363201, upload-time = "2026-01-21T09:09:27.764Z" },
{ url = "https://files.pythonhosted.org/packages/1f/1f/56029d9a557e983cf71d22365b4229c4cfaf09401faa6cbc7e912cef2213/chonkie_core-0.9.2-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:325c0c853268fbdc37f4a65c3cde68fa56e3a25d164eee9512ceb41edc819902", size = 347292, upload-time = "2026-01-21T09:09:28.676Z" },
{ url = "https://files.pythonhosted.org/packages/21/5b/08b8230d9264007cc7920cf1b1576f2ee1a1ef20d3cd5f8adb5e043e0908/chonkie_core-0.9.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:be9ea3bee05564a721f5c6c8699e1ad5996cf353b2faea2217a08ddee29e2de7", size = 332693, upload-time = "2026-01-21T09:09:29.541Z" },
{ url = "https://files.pythonhosted.org/packages/eb/72/255c918da43a96c90b2bb96f1951a6ea1c513c18b36caecb6e9192275b83/chonkie_core-0.9.2-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:926db6c26e78b2d84dfb8422073a3f0f20478160ab48204f306fa35f3e1e95d7", size = 364578, upload-time = "2026-01-21T09:09:30.673Z" },
{ url = "https://files.pythonhosted.org/packages/60/80/1710844b9706cd44324446eb368e813ebb4a085e96f469f54b61ddff67dc/chonkie_core-0.9.2-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c861f147e5932e659dcbe770ca0779a470acc62e9242bff87f36d03eae29644d", size = 384173, upload-time = "2026-01-21T09:09:32.015Z" },
{ url = "https://files.pythonhosted.org/packages/5c/5c/a31b259ea94620ea4e4100ed4cc952ad770f2c7af36293bbc9154efb5c9e/chonkie_core-0.9.2-cp314-cp314-win_amd64.whl", hash = "sha256:83473c708a23652d6dc70142b2e586f965af3031b1d2a5c6336f1fc78614b452", size = 222275, upload-time = "2026-01-21T09:09:34.148Z" },
{ url = "https://files.pythonhosted.org/packages/48/8a/c15c88f59bc9cf6f7ac994689d048fd60fcb72247f6b67ca31dc4eadf2f8/chonkie_core-0.9.2-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d6c0b89ee5e21d255bcf7d11f17b07ef811a71292717f50889244474cfab8bc", size = 363813, upload-time = "2026-01-21T09:09:35.175Z" },
{ url = "https://files.pythonhosted.org/packages/44/20/b16b9896065d2bfa175b238a23faa03531b80706f15de85ae6e5701b51fd/chonkie_core-0.9.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e08b78b0e208310b8aa523bcd13a0cd655c90751211236a1de99693845b8826d", size = 371578, upload-time = "2026-01-21T09:09:43.44Z" },
{ url = "https://files.pythonhosted.org/packages/1b/c3/844844bbadbfb6727d0f9b286fcb1398fbf2984c1348e8a8238dee335113/chonkie_core-0.9.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6cf9a691cbbf32ef00c22eec66dd9f59b92067f6153c62b766ce24948a2ffd", size = 390277, upload-time = "2026-01-21T09:09:44.364Z" },
]
[[package]]
name = "click"
version = "8.3.1"
@ -2434,6 +2494,25 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/9b/f7/4a5e785ec9fbd65146a27b6b70b6cdc161a66f2024e4b04ac06a67f5578b/mistune-3.2.0-py3-none-any.whl", hash = "sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1", size = 53598, upload-time = "2025-12-23T11:36:33.211Z" },
]
[[package]]
name = "model2vec"
version = "0.7.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jinja2" },
{ name = "joblib" },
{ name = "numpy" },
{ name = "rich" },
{ name = "safetensors" },
{ name = "setuptools" },
{ name = "tokenizers" },
{ name = "tqdm" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2a/73/03badf0a639cdad59db887928f17a187f4240f021f2d3656ef39795058b7/model2vec-0.7.0.tar.gz", hash = "sha256:bf5d0420615e356dd8046794a057bb4f13c50253c44f0d0d1f4441bb489a6ed3", size = 2785837, upload-time = "2025-10-05T06:28:02.482Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/bd/3d/473b1d960a2e27b4115c7599b5d47adf3a487c44798468375d408b5fb825/model2vec-0.7.0-py3-none-any.whl", hash = "sha256:f7a6ebf1b9ca384ba1158c4ecc9a2d450407f63526c0a26e268711071e280c27", size = 53038, upload-time = "2025-10-05T06:28:00.533Z" },
]
[[package]]
name = "mpmath"
version = "1.3.0"
@ -3159,6 +3238,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/73/cd/29cee6007bddf7a834e6cd6f536754c0535fcb939d384f0f37a38b1cddb8/ormsgpack-1.12.2-cp314-cp314t-win_amd64.whl", hash = "sha256:837dd316584485b72ef451d08dd3e96c4a11d12e4963aedb40e08f89685d8ec2", size = 117232, upload-time = "2026-01-18T20:55:45.448Z" },
]
[[package]]
name = "outcome"
version = "1.3.0.post0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "attrs" },
]
sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060, upload-time = "2023-10-26T04:26:04.361Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692, upload-time = "2023-10-26T04:26:02.532Z" },
]
[[package]]
name = "overrides"
version = "7.7.0"
@ -3809,6 +3900,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
]
[[package]]
name = "pysocks"
version = "1.7.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429, upload-time = "2019-09-20T02:07:35.714Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725, upload-time = "2019-09-20T02:06:22.938Z" },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@ -4631,6 +4731,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
]
[[package]]
name = "selenium"
version = "4.41.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "trio" },
{ name = "trio-websocket" },
{ name = "typing-extensions" },
{ name = "urllib3", extra = ["socks"] },
{ name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/04/7c/133d00d6d013a17d3f39199f27f1a780ec2e95d7b9aa997dc1b8ac2e62a7/selenium-4.41.0.tar.gz", hash = "sha256:003e971f805231ad63e671783a2b91a299355d10cefb9de964c36ff3819115aa", size = 937872, upload-time = "2026-02-20T03:42:06.216Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a8/d6/e4160989ef6b272779af6f3e5c43c3ba9be6687bdc21c68c3fb220e555b3/selenium-4.41.0-py3-none-any.whl", hash = "sha256:b8ccde8d2e7642221ca64af184a92c19eee6accf2e27f20f30472f5efae18eb1", size = 9532858, upload-time = "2026-02-20T03:42:03.218Z" },
]
[[package]]
name = "send2trash"
version = "2.1.0"
@ -4787,6 +4904,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
[[package]]
name = "sortedcontainers"
version = "2.4.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
]
[[package]]
name = "soupsieve"
version = "2.8.3"
@ -5257,6 +5383,37 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d8/c7/dcf3ea1c4f5da9b10353b9af4455d756c92d728a8f58f03c480d3ef0ead5/tree_sitter_yaml-0.7.2-cp310-abi3-win_arm64.whl", hash = "sha256:f63c227b18e7ce7587bce124578f0bbf1f890ac63d3e3cd027417574273642c4", size = 44065, upload-time = "2025-10-07T14:40:35.337Z" },
]
[[package]]
name = "trio"
version = "0.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "attrs" },
{ name = "cffi", marker = "implementation_name != 'pypy' and os_name == 'nt'" },
{ name = "idna" },
{ name = "outcome" },
{ name = "sniffio" },
{ name = "sortedcontainers" },
]
sdist = { url = "https://files.pythonhosted.org/packages/52/b6/c744031c6f89b18b3f5f4f7338603ab381d740a7f45938c4607b2302481f/trio-0.33.0.tar.gz", hash = "sha256:a29b92b73f09d4b48ed249acd91073281a7f1063f09caba5dc70465b5c7aa970", size = 605109, upload-time = "2026-02-14T18:40:55.386Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1c/93/dab25dc87ac48da0fe0f6419e07d0bfd98799bed4e05e7b9e0f85a1a4b4b/trio-0.33.0-py3-none-any.whl", hash = "sha256:3bd5d87f781d9b0192d592aef28691f8951d6c2e41b7e1da4c25cde6c180ae9b", size = 510294, upload-time = "2026-02-14T18:40:53.313Z" },
]
[[package]]
name = "trio-websocket"
version = "0.12.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "outcome" },
{ name = "trio" },
{ name = "wsproto" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549, upload-time = "2025-02-25T05:16:58.947Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221, upload-time = "2025-02-25T05:16:57.545Z" },
]
[[package]]
name = "triton"
version = "3.6.0"
@ -5415,6 +5572,11 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
]
[package.optional-dependencies]
socks = [
{ name = "pysocks" },
]
[[package]]
name = "uuid-utils"
version = "0.14.1"
@ -5801,6 +5963,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
]
[[package]]
name = "wsproto"
version = "1.3.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "h11" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c7/79/12135bdf8b9c9367b8701c2c19a14c913c120b882d50b014ca0d38083c2c/wsproto-1.3.2.tar.gz", hash = "sha256:b86885dcf294e15204919950f666e06ffc6c7c114ca900b060d6e16293528294", size = 50116, upload-time = "2025-11-20T18:18:01.871Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" },
]
[[package]]
name = "xxhash"
version = "3.6.0"