From 46a6344c45e8fb6e9194d50808993ed6c6f08eeb Mon Sep 17 00:00:00 2001 From: acano Date: Thu, 12 Mar 2026 09:53:56 +0100 Subject: [PATCH] Add docstrings to elasticsearch_ingestion and ingest_documents functions for improved documentation --- .../pipelines/flows/elasticsearch_ingestion.py | 18 +++++++++++++++++- scripts/pipelines/tasks/chunk.py | 13 +++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/scripts/pipelines/flows/elasticsearch_ingestion.py b/scripts/pipelines/flows/elasticsearch_ingestion.py index 80a49d3..a14e4b0 100644 --- a/scripts/pipelines/flows/elasticsearch_ingestion.py +++ b/scripts/pipelines/flows/elasticsearch_ingestion.py @@ -17,7 +17,23 @@ def elasticsearch_ingestion( es_max_retries: int = 5, es_retry_on_timeout: bool = True, delete_es_index: bool = True -): +) -> None: + """ + Pipeline to ingest documents into an Elasticsearch index. + The pipeline includes fetching documents from a specified folder, processing them into chunks, and then ingesting those chunks into the specified Elasticsearch index. + + Args: + docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples". + docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".md", ".avap"]. + es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-docs-test-v3". + es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds. + es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries. + es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True. + delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True. + + Returns: + None + """ logger.info("Starting Elasticsearch ingestion pipeline...") logger.info(f"Fetching files from {docs_folder_path}...") docs_path = fetch_documents(docs_folder_path, docs_extension) diff --git a/scripts/pipelines/tasks/chunk.py b/scripts/pipelines/tasks/chunk.py index 452e9a6..fcae43e 100644 --- a/scripts/pipelines/tasks/chunk.py +++ b/scripts/pipelines/tasks/chunk.py @@ -153,7 +153,20 @@ def ingest_documents( es_retry_on_timeout: bool, delete_es_index: bool, ) -> None: + """ + Ingest processed documents into an Elasticsearch index. + Args: + chunked_docs (list[Chunk]): List of processed document chunks to be ingested + es_index (str): Name of the Elasticsearch index to ingest into + es_request_timeout (int): Timeout for Elasticsearch requests in seconds + es_max_retries (int): Maximum number of retries for Elasticsearch requests + es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts + delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion + + Returns: + None + """ logger.info( f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..." )