# assistance-engine/scripts/pipelines/tasks/chunk.py
import os
import re
import uuid
from loguru import logger
from chonkie import Chunk, SemanticChunker
from langchain_core.documents import Document
def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Handles fenced code-block language tags (```javascript / ```js) and prose
    references such as "javascript code" / "javascript example", while leaving
    phrases like "javascript file" untouched.

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Replace ```javascript with ```avap (must run before the ```js rule
    # so the "js" prefix of "javascript" is never matched on its own).
    text = text.replace("```javascript", "```avap")
    # Replace ```js with ```avap. The \b boundary prevents clobbering other
    # fence languages that merely start with "js" (```json, ```jsx, ...),
    # which a plain str.replace("```js", ...) would mangle.
    text = re.sub(r"```js\b", "```avap", text)
    # Replace common prose phrases (case-insensitive).
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )
    # Catch remaining standalone mentions, but keep "javascript file" intact.
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
    return text
def read_files(
    folder_path: str, file_prefix: str | None = None, concatenate: bool = True
) -> list[dict]:
    """
    Collect the text of files in a folder whose names start with a prefix,
    rewriting javascript language markers to avap.

    Args:
        folder_path: Directory to scan (non-recursive).
        file_prefix: Required filename prefix; None accepts every file.
        concatenate: Whether to merge all file contents into one entry.

    Returns:
        A list of dicts with 'content' and 'title' keys. With concatenate=True
        the list holds a single dict whose title is the prefix (or 'all_files'
        when no prefix was given); otherwise one dict per file, titled with
        its filename.
    """
    # Gather (filename, stripped text) pairs in sorted-name order,
    # skipping directories and files whose content is only whitespace.
    selected: list[tuple[str, str]] = []
    for name in sorted(os.listdir(folder_path)):
        if file_prefix is not None and not name.startswith(file_prefix):
            continue
        path = os.path.join(folder_path, name)
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read().strip()
        if text:
            selected.append((name, text))

    if concatenate:
        merged = "\n".join(text for _, text in selected)
        return [
            {
                "content": replace_javascript_with_avap(merged),
                "title": "all_files" if file_prefix is None else file_prefix,
            }
        ]
    return [
        {"content": replace_javascript_with_avap(text), "title": name}
        for name, text in selected
    ]
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
    """
    Run the semantic chunker over every document's content.

    Args:
        docs: Dicts carrying 'content' and 'title' keys.
        chunker: SemanticChunker instance used to split each content string.

    Returns:
        One list of Chunk objects per input document, in input order. Each
        chunk's ``context`` is set to ``{"source": <document title>}`` so the
        origin survives into later pipeline stages.
    """
    results: list[list[Chunk]] = []
    for doc in docs:
        title = doc["title"]
        doc_chunks = chunker.chunk(doc["content"])
        # Tag every chunk with its source document before collecting it.
        for piece in doc_chunks:
            piece.context = {"source": title}
        results.append(doc_chunks)
        logger.info(f"Finished chunking {title}")
    return results
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
    """
    Convert chunked content into a list of Document objects.

    Args:
        chunks: Either a list of dicts with 'content' and 'title' keys, or a
            list of lists of Chunk objects (as produced by get_chunk_docs).

    Returns:
        A list of Document objects, each with a fresh UUID id and a
        ``{"source": title}`` metadata entry. Empty input yields an empty list.
    """
    if not chunks:
        # Guard: the isinstance(chunks[0], ...) dispatch below would raise
        # IndexError on an empty list.
        return []
    documents = []
    if isinstance(chunks[0], dict):
        for chunk in chunks:
            documents.append(
                Document(
                    id=str(uuid.uuid4()),
                    page_content=chunk["content"],
                    metadata={"source": chunk["title"]},
                )
            )
    else:
        for chunk_list in chunks:
            for chunk in chunk_list:
                documents.append(
                    Document(
                        id=str(uuid.uuid4()),
                        page_content=chunk.text,
                        # Chunks that were never tagged with a source context
                        # fall back to "unknown".
                        metadata={"source": chunk.context.get("source", "unknown")},
                    )
                )
    return documents