From 46a6344c45e8fb6e9194d50808993ed6c6f08eeb Mon Sep 17 00:00:00 2001
From: acano <acturnes@mrhouston.net>
Date: Thu, 12 Mar 2026 09:53:56 +0100
Subject: [PATCH] Add docstrings to elasticsearch_ingestion and
 ingest_documents functions for improved documentation

---
 .../pipelines/flows/elasticsearch_ingestion.py | 18 +++++++++++++++++-
 scripts/pipelines/tasks/chunk.py               | 13 +++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/scripts/pipelines/flows/elasticsearch_ingestion.py b/scripts/pipelines/flows/elasticsearch_ingestion.py
index 80a49d3..a14e4b0 100644
--- a/scripts/pipelines/flows/elasticsearch_ingestion.py
+++ b/scripts/pipelines/flows/elasticsearch_ingestion.py
@@ -17,7 +17,23 @@ def elasticsearch_ingestion(
     es_max_retries: int = 5,
     es_retry_on_timeout: bool = True,
     delete_es_index: bool = True
-):  
+) -> None:  
+    """
+    Pipeline to ingest documents into an Elasticsearch index.
+    The pipeline fetches documents from the specified folder, processes them into chunks, and then ingests those chunks into the specified Elasticsearch index.
+
+    Args:
+        docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
+        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".md", ".avap"].
+        es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-docs-test-v3".
+        es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
+        es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
+        es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
+        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.
+
+    Returns:
+        None
+    """
     logger.info("Starting Elasticsearch ingestion pipeline...")
     logger.info(f"Fetching files from {docs_folder_path}...")
     docs_path = fetch_documents(docs_folder_path, docs_extension)
diff --git a/scripts/pipelines/tasks/chunk.py b/scripts/pipelines/tasks/chunk.py
index 452e9a6..fcae43e 100644
--- a/scripts/pipelines/tasks/chunk.py
+++ b/scripts/pipelines/tasks/chunk.py
@@ -153,7 +153,20 @@ def ingest_documents(
     es_retry_on_timeout: bool,
     delete_es_index: bool,
 ) -> None:
+    """
+    Ingest processed documents into an Elasticsearch index.
 
+    Args:
+        chunked_docs (list[Chunk]): List of processed document chunks to be ingested.
+        es_index (str): Name of the Elasticsearch index to ingest into.
+        es_request_timeout (int): Timeout for Elasticsearch requests in seconds.
+        es_max_retries (int): Maximum number of retries for Elasticsearch requests.
+        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts.
+        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion.
+
+    Returns:
+        None
+    """
     logger.info(
         f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
     )