import logging

import typer
from loguru import logger
from chonkie import FileFetcher

from src.config import settings
from scripts.pipelines.tasks.chunk import process_documents, ingest_documents

# Typer application object; the command below is registered on it.
app = typer.Typer()


@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/LRM",
    docs_extension: str = ".md",
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True,
):
    """Fetch documents, chunk them and ingest the chunks into Elasticsearch.

    Files are fetched from ``docs_folder_path`` (resolved under
    ``settings.proj_root``), passed through ``process_documents`` and the
    result is handed to ``ingest_documents`` for the given index.

    Args:
        docs_folder_path: Folder to fetch from, relative to the project root.
        docs_extension: Extension forwarded to ``process_documents``.
        es_index: Name of the Elasticsearch index to ingest into.
        es_request_timeout: Forwarded to ``ingest_documents`` (presumably
            the client request timeout in seconds; confirm there).
        es_max_retries: Forwarded to ``ingest_documents``.
        es_retry_on_timeout: Forwarded to ``ingest_documents``.
        delete_es_index: Forwarded to ``ingest_documents`` (presumably
            drops an existing index first; confirm there). Defaults to True.
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    # Step 1: collect the source files below the project root.
    logger.info(f"Fetching files from {docs_folder_path}...")
    fetched_files = FileFetcher().fetch(
        dir=f"{settings.proj_root}/{docs_folder_path}"
    )

    # Step 2: turn the fetched files into chunks.
    logger.info("Processing docs...")
    chunks = process_documents(fetched_files, docs_extension)

    # Step 3: push the chunks to the index. The client options are forwarded
    # positionally, in the order ingest_documents is called with upstream.
    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    es_options = (
        es_request_timeout,
        es_max_retries,
        es_retry_on_timeout,
        delete_es_index,
    )
    ingest_documents(chunks, es_index, *es_options)

    logger.info(f"Finished ingesting in {es_index}.")


if __name__ == "__main__":
    # Configure the stdlib root logger at INFO. The script itself logs through
    # loguru, so this presumably exists to surface records emitted by
    # libraries that use the standard logging module.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        level=logging.INFO,
    )

    try:
        app()
    except Exception as err:
        # Top-level boundary: record the traceback, then let the failure
        # propagate so the process still exits with an error.
        logger.exception(err)
        raise