import os
import re
import uuid

from loguru import logger
from chonkie import Chunk, SemanticChunker
from langchain_core.documents import Document

# Compiled once at import time; reused by replace_javascript_with_avap.
# NOTE: the fence pattern requires a word boundary after "js" so that
# ```json / ```jsx fences are NOT corrupted (a raw str.replace("```js", ...)
# would turn ```json into ```avapon).
_JS_FENCE_RE = re.compile(r"```js\b")
_JS_CODE_RE = re.compile(r"\bjavascript\s+code\b", re.IGNORECASE)
_JS_EXAMPLE_RE = re.compile(r"\bjavascript\s+example\b", re.IGNORECASE)
_JS_WORD_RE = re.compile(r"\bjavascript\b(?!\s+file)", re.IGNORECASE)


def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Handles fenced code blocks (```javascript / ```js), common phrases
    ("javascript code", "javascript example"), and bare mentions of the
    word, while leaving "javascript file" and unrelated fences such as
    ```json untouched.

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Fenced code blocks first, so the phrase patterns below never see them.
    text = text.replace("```javascript", "```avap")
    # \b prevents matching ```json / ```jsx (bug fix vs. plain str.replace).
    text = _JS_FENCE_RE.sub("```avap", text)
    # Common phrases (case-insensitive).
    text = _JS_CODE_RE.sub("avap code", text)
    text = _JS_EXAMPLE_RE.sub("avap example", text)
    # Bare word, except when it is part of "javascript file".
    text = _JS_WORD_RE.sub("avap", text)
    return text


def read_files(
    folder_path: str, file_prefix: str | None = None, concatenate: bool = True
) -> list[dict]:
    """
    Read files in a folder whose names start with a given prefix.

    Replaces javascript language markers with avap. Files whose content is
    empty after stripping whitespace are skipped. Files are visited in
    sorted filename order.

    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
            If None, all files in the folder are included.
        concatenate: Whether to concatenate the contents of the files.

    Returns:
        A list of dictionaries, each containing 'content' and 'title' keys.
        If concatenate is True, returns a single dict with concatenated
        content and title set to the prefix (or 'all_files' when no prefix
        was given). If concatenate is False, returns one dict per file with
        the filename as title.
    """
    contents = []
    filenames = []
    for filename in sorted(os.listdir(folder_path)):
        if file_prefix is not None and not filename.startswith(file_prefix):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        cleaned_content = content.strip()
        if cleaned_content:
            contents.append(cleaned_content)
            filenames.append(filename)

    if concatenate:
        processed_content = replace_javascript_with_avap("\n".join(contents))
        title = file_prefix if file_prefix is not None else "all_files"
        return [{"content": processed_content, "title": title}]
    return [
        {"content": replace_javascript_with_avap(content), "title": filename}
        for content, filename in zip(contents, filenames)
    ]


def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
    """
    Chunk the content of the documents using the provided chunker.

    Each produced chunk gets a ``context`` dict with a 'source' key set to
    the originating document's title, so provenance survives chunking.

    Args:
        docs: A list of dictionaries, each containing 'content' and 'title'
            keys.
        chunker: An instance of SemanticChunker to use for chunking the
            content.

    Returns:
        A list of lists of Chunk objects, where each inner list corresponds
        to the chunks of a single document.
    """
    list_chunks = []
    for doc in docs:
        chunks = chunker.chunk(doc["content"])
        for chunk in chunks:
            chunk.context = {"source": doc["title"]}
        list_chunks.append(chunks)
        logger.info(f"Finished chunking {doc['title']}")
    return list_chunks


def convert_chunks_to_document(
    chunks: list[dict] | list[list[Chunk]],
) -> list[Document]:
    """
    Convert the chunked content into a list of Document objects.

    Accepts either the dict shape produced by ``read_files`` (list of
    {'content', 'title'} dicts) or the nested shape produced by
    ``get_chunk_docs`` (list of lists of Chunk). The input shape is
    detected from the first element.

    Args:
        chunks: A list of dictionaries containing 'content' and 'title'
            keys, or a list of lists of Chunk objects.

    Returns:
        A list of Document objects created from the chunked content, each
        with a fresh UUID id and the source title in its metadata. Returns
        an empty list for empty input.
    """
    # Guard: chunks[0] below would raise IndexError on an empty list.
    if not chunks:
        return []

    documents = []
    if isinstance(chunks[0], dict):
        for chunk in chunks:
            documents.append(
                Document(
                    id=str(uuid.uuid4()),
                    page_content=chunk["content"],
                    metadata={"source": chunk["title"]},
                )
            )
    else:
        for chunk_list in chunks:
            for chunk in chunk_list:
                documents.append(
                    Document(
                        id=str(uuid.uuid4()),
                        page_content=chunk.text,
                        # Default to 'unknown' if provenance was never attached.
                        metadata={"source": chunk.context.get("source", "unknown")},
                    )
                )
    return documents