136 lines
4.6 KiB
Python
136 lines
4.6 KiB
Python
import os
|
|
import re
|
|
import uuid
|
|
|
|
from loguru import logger
|
|
from chonkie import Chunk, SemanticChunker
|
|
from langchain_core.documents import Document
|
|
|
|
|
|
def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Handles fenced code-block markers, language identifiers, and prose
    references, while leaving "javascript file" mentions intact.

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Replace ```javascript fences with ```avap.
    text = text.replace("```javascript", "```avap")

    # Replace ```js fences with ```avap. A word boundary is required so
    # that ```json fences are not mangled into ```avapon (the plain
    # str.replace("```js", ...) used previously had exactly that bug).
    text = re.sub(r"```js\b", "```avap", text)

    # Replace common phrases (case-insensitive).
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )
    # Bare "javascript" mentions, except "javascript file" references,
    # which are deliberately kept (negative lookahead).
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)

    return text
|
|
|
|
|
|
def read_files(
    folder_path: str, file_prefix: str | None = None, concatenate: bool = True
) -> list[dict]:
    """
    Read files in a folder whose names start with a given prefix.

    Every file's content is passed through replace_javascript_with_avap
    so that javascript language markers become avap.

    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
            If None, all files in the folder are included.
        concatenate: Whether to concatenate the contents of the files.

    Returns:
        A list of dictionaries, each containing 'content' and 'title' keys.
        If concatenate is True, a single dict holding all content joined
        with newlines, titled after the prefix (or 'all_files').
        If concatenate is False, one dict per file, titled by filename.
    """
    # Collected (filename, stripped content) pairs, in sorted name order.
    collected: list[tuple[str, str]] = []

    for name in sorted(os.listdir(folder_path)):
        # Guard clauses: skip non-matching names and non-regular files.
        if file_prefix is not None and not name.startswith(file_prefix):
            continue
        full_path = os.path.join(folder_path, name)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, "r", encoding="utf-8") as handle:
            stripped = handle.read().strip()
        # Empty (whitespace-only) files are silently dropped.
        if stripped:
            collected.append((name, stripped))

    if not concatenate:
        return [
            {"content": replace_javascript_with_avap(body), "title": name}
            for name, body in collected
        ]

    merged = "\n".join(body for _, body in collected)
    return [
        {
            "content": replace_javascript_with_avap(merged),
            "title": "all_files" if file_prefix is None else file_prefix,
        }
    ]
|
|
|
|
|
|
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
    """
    Chunk the content of the documents using the provided chunker.

    Args:
        docs: A list of dictionaries, each containing 'content' and
            'title' keys.
        chunker: An instance of SemanticChunker used to split content.

    Returns:
        A list of lists of Chunk objects; each inner list holds the
        chunks of one input document, in input order.
    """
    all_chunks: list[list[Chunk]] = []

    for document in docs:
        source = document["title"]
        doc_chunks = chunker.chunk(document["content"])
        # Tag each chunk with its originating document so downstream
        # conversion can record the source in metadata.
        for piece in doc_chunks:
            piece.context = {"source": source}
        all_chunks.append(doc_chunks)
        logger.info(f"Finished chunking {source}")

    return all_chunks
|
|
|
|
|
|
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
    """
    Convert chunked content into a list of Document objects.

    Accepts either pre-chunked dictionaries or nested lists of Chunk
    objects (the shape produced by get_chunk_docs), dispatching on the
    type of the first element.

    Args:
        chunks: Either a list of dicts with 'content' and 'title' keys,
            or a list of lists of Chunk objects whose .context carries
            a 'source' entry.

    Returns:
        A list of Document objects, one per chunk, each with a fresh
        UUID id and a 'source' metadata entry. An empty input yields
        an empty list.
    """
    # Guard: the isinstance dispatch below indexes chunks[0], which
    # would raise IndexError on an empty list.
    if not chunks:
        return []

    documents: list[Document] = []

    if isinstance(chunks[0], dict):
        for chunk in chunks:
            documents.append(_build_document(chunk["content"], chunk["title"]))
    else:
        for chunk_list in chunks:
            for chunk in chunk_list:
                # Fall back to "unknown" when the chunk was never tagged
                # with a source (see get_chunk_docs).
                title = chunk.context.get("source", "unknown")
                documents.append(_build_document(chunk.text, title))

    return documents


def _build_document(content: str, title: str) -> Document:
    """Create a Document with a fresh UUID id and a 'source' metadata entry."""
    return Document(
        id=str(uuid.uuid4()),
        page_content=content,
        metadata={"source": title},
    )