assistance-engine/scripts/pipelines/tasks/chunks.py

46 lines
1.0 KiB
Python

import re
import hashlib
from pathlib import Path
from typing import Any
from langchain_core.documents import Document
def clean_text(text: str) -> str:
    """Return *text* with whitespace normalized.

    Non-breaking spaces are turned into regular spaces, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    without_nbsp = text.replace("\u00a0", " ")
    collapsed = re.sub(r"\s+", " ", without_nbsp)
    return collapsed.strip()
def build_chunks_from_folder(
    folder_path: str,
) -> list[Document]:
    """Load each ``*.txt`` file directly inside a folder as one ``Document``.

    Every non-blank text file becomes a single document: its content is
    normalized with ``clean_text``, its ``id`` is the MD5 hex digest of the
    file name, and its metadata stores the file name under ``"source"``.
    Sub-folders are not searched.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        One ``Document`` per non-blank ``.txt`` file, ordered by path.

    Raises:
        ValueError: If ``folder_path`` does not exist or is not a directory.
    """
    folder = Path(folder_path)
    # is_dir() is already False for a missing path, so one check covers both.
    if not folder.is_dir():
        raise ValueError(f"Invalid folder path: {folder_path}")

    all_chunks: list[Document] = []
    # glob() yields entries in filesystem-dependent order; sort so repeated
    # runs over the same folder return the documents in the same order.
    for file_path in sorted(folder.glob("*.txt")):
        # A directory named "something.txt" matches the pattern as well;
        # reading it would raise and abort the whole load, so skip it.
        if not file_path.is_file():
            continue

        doc_text = file_path.read_text(encoding="utf-8")
        if not doc_text.strip():
            continue

        # The digest is only an identifier derived from the file name, not a
        # security primitive; flagging it keeps FIPS-restricted builds working
        # while producing the same value.
        digest = hashlib.md5(
            file_path.name.encode(), usedforsecurity=False
        ).hexdigest()

        all_chunks.append(
            Document(
                id=digest,
                page_content=clean_text(doc_text),
                metadata={"source": file_path.name},
            )
        )

    return all_chunks