assistance-engine/scripts/pipelines/flows/elasticsearch_ingestion.py

44 lines
1.3 KiB
Python

import typer
import logging
from loguru import logger
from scripts.pipelines.tasks.chunk import fetch_documents, process_documents, ingest_documents
# Typer application object; the ingestion command below registers itself on it
# via @app.command(), and the __main__ guard invokes it.
app = typer.Typer()
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/samples",  # folder handed to fetch_documents
    docs_extension: list[str] = [".md", ".avap"],  # extensions handed to fetch_documents
    es_index: str = "avap-docs-test-v3",  # index name handed to ingest_documents
    es_request_timeout: int = 120,  # forwarded unchanged to ingest_documents
    es_max_retries: int = 5,  # forwarded unchanged to ingest_documents
    es_retry_on_timeout: bool = True,  # forwarded unchanged to ingest_documents
    delete_es_index: bool = True,  # forwarded unchanged to ingest_documents
) -> None:
    """Fetch documents, process them into chunks and ingest them into Elasticsearch.

    Runs the three pipeline tasks in order (fetch_documents, process_documents,
    ingest_documents) and logs progress before and after each stage.
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    logger.info(f"Fetching files from {docs_folder_path}...")
    # Hand the task a copy: the signature default is a single shared list
    # object, so a downstream mutation would otherwise leak into later calls
    # made from the same process.
    docs_path = fetch_documents(docs_folder_path, list(docs_extension))

    logger.info("Processing docs...")
    chunked_docs = process_documents(docs_path)

    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    # Arguments stay positional because the task's parameter names are not
    # visible from this module.
    ingest_documents(
        chunked_docs,
        es_index,
        es_request_timeout,
        es_max_retries,
        es_retry_on_timeout,
        delete_es_index,
    )

    logger.info(f"Finished ingesting in {es_index}.")
if __name__ == "__main__":
    # Configure the standard-library logging root at INFO; this is separate
    # from the loguru logger used for this script's own messages.
    stdlib_log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    logging.basicConfig(level=logging.INFO, format=stdlib_log_format)

    try:
        app()
    except Exception as err:
        # Record the failure with its traceback, then re-raise so the error
        # still propagates to the caller.
        logger.exception(err)
        raise