import hashlib
import re
from pathlib import Path
from typing import Any

from langchain_core.documents import Document

# Compiled once at import time; clean_text() runs for every file that is loaded.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Return *text* with its whitespace normalised.

    Non-breaking spaces (U+00A0) are replaced with ordinary spaces, every run
    of whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    text = text.replace("\u00a0", " ")
    return _WHITESPACE_RUN.sub(" ", text).strip()


def build_chunks_from_folder(
    folder_path: str,
) -> list[Document]:
    """Build one ``Document`` per non-empty ``*.txt`` file directly in a folder.

    Only the folder's immediate entries are considered (the glob is not
    recursive). Each file is read as UTF-8 and passed through ``clean_text``;
    files that contain nothing but whitespace are skipped. Documents are
    returned sorted by file path so the result is reproducible.

    Args:
        folder_path: Path of the folder to scan.

    Returns:
        A list of documents whose ``page_content`` is the cleaned file text,
        whose ``metadata["source"]`` is the file name, and whose ``id`` is the
        hex MD5 digest of the file name.

    Raises:
        ValueError: If ``folder_path`` does not exist or is not a directory.
    """
    folder = Path(folder_path)
    # is_dir() is already False for a path that does not exist.
    if not folder.is_dir():
        raise ValueError(f"Invalid folder path: {folder_path}")

    all_chunks: list[Document] = []

    # glob() yields entries in arbitrary filesystem order; sort for a
    # deterministic result.
    for file_path in sorted(folder.glob("*.txt")):
        # A directory named "something.txt" also matches the pattern and would
        # make read_text() raise, so skip anything that is not a file.
        if not file_path.is_file():
            continue

        doc_text = clean_text(file_path.read_text(encoding="utf-8"))
        if not doc_text:
            # Whitespace-only file: nothing to index.
            continue

        metadata: dict[str, Any] = {
            "source": file_path.name,
        }

        # MD5 serves only as a stable identifier derived from the file name,
        # not as a security measure; flagging it as such keeps the call
        # working on FIPS-restricted Python builds.
        doc_id = hashlib.md5(
            file_path.name.encode(), usedforsecurity=False
        ).hexdigest()

        all_chunks.append(
            Document(
                id=doc_id,
                page_content=doc_text,
                metadata=metadata,
            )
        )

    return all_chunks