Refactor Elasticsearch ingestion and document processing functions for improved clarity and functionality

2026-03-12 09:50:30 +01:00 · 2026-03-12 09:50:30 +01:00 · 189e404d21
parent 9f3564ab2a
commit 189e404d21
2 changed files with 56 additions and 38 deletions
--- a/scripts/pipelines/flows/elasticsearch_ingestion.py
+++ b/scripts/pipelines/flows/elasticsearch_ingestion.py
@ -2,31 +2,28 @@ import typer
 import logging
 from loguru import logger
 from chonkie import FileFetcher
-from src.config import settings
+from scripts.pipelines.tasks.chunk import fetch_documents, process_documents, ingest_documents
 from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
 app = typer.Typer()
@app.command()
 def elasticsearch_ingestion(
-    docs_folder_path: str = "docs/LRM",
+    docs_folder_path: str = "docs/samples",
-    docs_extension: str = ".md",
+    docs_extension: list[str] = [".md", ".avap"],
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
-    delete_es_index: bool = False
+    delete_es_index: bool = True
 ):  
    logger.info("Starting Elasticsearch ingestion pipeline...")
    logger.info(f"Fetching files from {docs_folder_path}...")
-    fetcher = FileFetcher()
+    docs_path = fetch_documents(docs_folder_path, docs_extension)
    docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
    logger.info("Processing docs...")
-    chunked_docs = process_documents(docs_path, docs_extension)
+    chunked_docs = process_documents(docs_path)
    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries, 
--- a/scripts/pipelines/tasks/chunk.py
+++ b/scripts/pipelines/tasks/chunk.py
@ -5,9 +5,11 @@ from pathlib import Path
 from chonkie import (
    Chunk,
    ElasticHandshake,
    FileFetcher,
    MarkdownChef,
    TextChef,
    TokenChunker,
    MarkdownDocument
 )
 from elasticsearch import Elasticsearch
 from loguru import logger
@ -27,16 +29,16 @@ def _get_text(element) -> str:
    )
-def _merge_markdown_document(doc):
+def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
    elements = []
-    for chunk in doc.chunks:
+    for chunk in processed_doc.chunks:
        elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
-    for code in doc.code:
+    for code in processed_doc.code:
        elements.append(("code", code.start_index, code.end_index, code))
-    for table in doc.tables:
+    for table in processed_doc.tables:
        elements.append(("table", table.start_index, table.end_index, table))
    elements.sort(key=lambda item: (item[1], item[2]))
@ -87,41 +89,60 @@ def _merge_markdown_document(doc):
    flush()
-    new_doc = deepcopy(doc)
+    fused_processed_doc = deepcopy(processed_doc)
-    new_doc.chunks = merged_chunks
+    fused_processed_doc.chunks = merged_chunks
-    new_doc.code = doc.code
+    fused_processed_doc.code = processed_doc.code
-    new_doc.tables = doc.tables
+    fused_processed_doc.tables = processed_doc.tables
-    return new_doc
+    return fused_processed_doc
-def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
+def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
    """
    Fetch files from a folder that match the specified extensions.
    Args:
        docs_folder_path (str): Path to the folder containing documents
        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"])
    Returns:
        List of Paths to the fetched documents
    """
    fetcher = FileFetcher()
    docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}", ext=docs_extension)
    return docs_path
 def process_documents(docs_path: list[Path]) -> list[Chunk]:
    """
    Process documents by applying appropriate chefs and chunking strategies based on file type.
    Args:
        docs_path (list[Path]): List of Paths to the documents to be processed
    Returns:
        List of chunks ready for ingestion
    """
    processed_docs = []
    chunked_docs = []
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
-
+    chef_md = MarkdownChef(tokenizer=custom_tokenizer)
-    if docs_extension == ".md":
+    chef_txt = TextChef()
        chef = MarkdownChef(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            processed_docs.append((processed_doc, doc.name))
        for processed_doc, filename in processed_docs:
            fused_doc = _merge_markdown_document(processed_doc)
            chunked_docs.extend(fused_doc.chunks)
    elif docs_extension == ".avap":
        chef = TextChef()
    chunker = TokenChunker(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            processed_docs.append((processed_doc, doc.name))
-        for processed_doc, filename in processed_docs:
+    for doc_path in docs_path:
        doc_extension = doc_path.suffix.lower()
        if doc_extension == ".md":
            processed_doc = chef_md.process(doc_path)
            fused_doc = _merge_markdown_document(processed_doc)
            processed_docs.extend(fused_doc.chunks)
        elif doc_extension == ".avap":
            processed_doc = chef_txt.process(doc_path)
            chunked_doc = chunker.chunk(processed_doc.content)
-            chunked_docs.extend(chunked_doc)
+            processed_docs.extend(chunked_doc)
-    return chunked_docs
+    return processed_docs
 def ingest_documents(