46 lines
1.0 KiB
Python
46 lines
1.0 KiB
Python
import re
|
|
import hashlib
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Non-breaking spaces (U+00A0) are replaced with ordinary spaces, every
    run of whitespace is collapsed into a single space, and leading and
    trailing whitespace is removed.
    """
    without_nbsp = text.replace("\u00a0", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
|
|
|
|
|
|
def build_chunks_from_folder(
    folder_path: str,
) -> list[Document]:
    """Load each ``*.txt`` file directly inside a folder as one ``Document``.

    Only the top level of the folder is scanned (the glob is not recursive).
    Files whose content is empty or whitespace-only are skipped. The text of
    every remaining file is passed through ``clean_text`` and wrapped in a
    single ``Document``; no further splitting is done here.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        One ``Document`` per non-empty text file, ordered by file path.
        Each document's ``id`` is the hex MD5 digest of the file *name*
        (not the full path), and its metadata holds that name under
        ``"source"``.

    Raises:
        ValueError: If ``folder_path`` does not exist or is not a directory.

    Errors raised by ``Path.read_text`` (for example when a file is not
    valid UTF-8) are not caught and propagate to the caller.
    """
    folder = Path(folder_path)

    # is_dir() is already False for a missing path, so one check covers both.
    if not folder.is_dir():
        raise ValueError(f"Invalid folder path: {folder_path}")

    all_chunks: list[Document] = []

    # glob() yields entries in a filesystem-dependent order; sorting makes
    # the returned list reproducible across runs and platforms.
    for file_path in sorted(folder.glob("*.txt")):
        # A directory named "something.txt" also matches the pattern and
        # would make read_text() fail, so only regular files are read.
        if not file_path.is_file():
            continue

        doc_text = file_path.read_text(encoding="utf-8")
        if not doc_text.strip():
            continue

        metadata: dict[str, Any] = {"source": file_path.name}

        # MD5 serves only as a stable identifier here, not for security;
        # saying so keeps the call usable on FIPS-restricted builds.
        doc_id = hashlib.md5(
            file_path.name.encode(), usedforsecurity=False
        ).hexdigest()

        all_chunks.append(
            Document(
                id=doc_id,
                page_content=clean_text(doc_text),
                metadata=metadata,
            )
        )

    return all_chunks
|