Add docstrings to elasticsearch_ingestion and ingest_documents functions for improved documentation
This commit is contained in:
parent
189e404d21
commit
46a6344c45
|
|
@@ -17,7 +17,23 @@ def elasticsearch_ingestion(
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
delete_es_index: bool = True
|
delete_es_index: bool = True
|
||||||
):
|
) -> None:
|
||||||
|
"""
|
||||||
|
Pipeline to ingest documents into an Elasticsearch index.
|
||||||
|
The pipeline includes fetching documents from a specified folder, processing them into chunks, and then ingesting those chunks into the specified Elasticsearch index.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
|
||||||
|
docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".md", ".avap"].
|
||||||
|
es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-docs-test-v3".
|
||||||
|
es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
|
||||||
|
es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
|
||||||
|
es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
|
||||||
|
delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
docs_path = fetch_documents(docs_folder_path, docs_extension)
|
docs_path = fetch_documents(docs_folder_path, docs_extension)
|
||||||
|
|
|
||||||
|
|
@@ -153,7 +153,20 @@ def ingest_documents(
|
||||||
es_retry_on_timeout: bool,
|
es_retry_on_timeout: bool,
|
||||||
delete_es_index: bool,
|
delete_es_index: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""
|
||||||
|
Ingest processed documents into an Elasticsearch index.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunked_docs (list[Chunk]): List of processed document chunks to be ingested
|
||||||
|
es_index (str): Name of the Elasticsearch index to ingest into
|
||||||
|
es_request_timeout (int): Timeout for Elasticsearch requests in seconds
|
||||||
|
es_max_retries (int): Maximum number of retries for Elasticsearch requests
|
||||||
|
es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
|
||||||
|
delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
|
f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue