# assistance-engine/scripts/pipelines/tasks/chunk.py
import os
import re
import uuid
from loguru import logger
from chonkie import Chunk, SemanticChunker
from langchain_core.documents import Document
def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Handles fenced code-block language tags (```javascript / ```js) and prose
    references such as "javascript code" / "javascript example", while leaving
    phrases like "javascript file" untouched.

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Replace ```javascript with ```avap (must run before the ```js rule
    # so the "js" prefix of "javascript" is never matched on its own).
    text = text.replace("```javascript", "```avap")
    # Replace ```js with ```avap. The \b boundary prevents clobbering other
    # fence languages that merely start with "js" (```json, ```jsx, ...),
    # which a plain str.replace("```js", ...) would mangle.
    text = re.sub(r"```js\b", "```avap", text)
    # Replace common prose phrases (case-insensitive).
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )
    # Catch remaining standalone mentions, but keep "javascript file" intact.
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
    return text
def read_files(
    folder_path: str, file_prefix: str | None = None, concatenate: bool = True
) -> list[dict]:
    """
    Collect the text of files in a folder whose names start with a prefix,
    rewriting javascript language markers to avap.

    Args:
        folder_path: Directory to scan (non-recursive).
        file_prefix: Required filename prefix; None accepts every file.
        concatenate: Whether to merge all file contents into one entry.

    Returns:
        A list of dicts with 'content' and 'title' keys. With concatenate=True
        the list holds a single dict whose title is the prefix (or 'all_files'
        when no prefix was given); otherwise one dict per file, titled with
        its filename.
    """
    # Gather (filename, stripped text) pairs in sorted-name order,
    # skipping directories and files whose content is only whitespace.
    selected: list[tuple[str, str]] = []
    for name in sorted(os.listdir(folder_path)):
        if file_prefix is not None and not name.startswith(file_prefix):
            continue
        path = os.path.join(folder_path, name)
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read().strip()
        if text:
            selected.append((name, text))

    if concatenate:
        merged = "\n".join(text for _, text in selected)
        return [
            {
                "content": replace_javascript_with_avap(merged),
                "title": "all_files" if file_prefix is None else file_prefix,
            }
        ]
    return [
        {"content": replace_javascript_with_avap(text), "title": name}
        for name, text in selected
    ]
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
    """
    Run the semantic chunker over every document's content.

    Args:
        docs: Dicts carrying 'content' and 'title' keys.
        chunker: SemanticChunker instance used to split each content string.

    Returns:
        One list of Chunk objects per input document, in input order. Each
        chunk's ``context`` is set to ``{"source": <document title>}`` so the
        origin survives into later pipeline stages.
    """
    results: list[list[Chunk]] = []
    for doc in docs:
        title = doc["title"]
        doc_chunks = chunker.chunk(doc["content"])
        # Tag every chunk with its source document before collecting it.
        for piece in doc_chunks:
            piece.context = {"source": title}
        results.append(doc_chunks)
        logger.info(f"Finished chunking {title}")
    return results
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
    """
    Convert chunked content into a list of Document objects.

    Args:
        chunks: Either a list of dicts with 'content' and 'title' keys, or a
            list of lists of Chunk objects (as produced by get_chunk_docs).

    Returns:
        A list of Document objects, each with a fresh UUID id and a
        ``{"source": title}`` metadata entry. Empty input yields an empty list.
    """
    if not chunks:
        # Guard: the isinstance(chunks[0], ...) dispatch below would raise
        # IndexError on an empty list.
        return []
    documents = []
    if isinstance(chunks[0], dict):
        for chunk in chunks:
            documents.append(
                Document(
                    id=str(uuid.uuid4()),
                    page_content=chunk["content"],
                    metadata={"source": chunk["title"]},
                )
            )
    else:
        for chunk_list in chunks:
            for chunk in chunk_list:
                documents.append(
                    Document(
                        id=str(uuid.uuid4()),
                        page_content=chunk.text,
                        # Chunks that were never tagged with a source context
                        # fall back to "unknown".
                        metadata={"source": chunk.context.get("source", "unknown")},
                    )
                )
    return documents