import logging

import typer
from chonkie import FileFetcher
from loguru import logger

from scripts.pipelines.tasks.chunk import ingest_documents, process_documents
from src.config import settings

app = typer.Typer()


@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/LRM",
    docs_extension: str = ".md",
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = False,
):
    """Fetch documentation files, chunk them, and ingest the chunks into Elasticsearch.

    Args:
        docs_folder_path: Docs folder, resolved relative to ``settings.proj_root``.
        docs_extension: File extension of the documents to process.
        es_index: Name of the target Elasticsearch index.
        es_request_timeout: Per-request timeout (seconds) for the ES client.
        es_max_retries: Maximum number of retries for failed ES requests.
        es_retry_on_timeout: Whether timed-out ES requests are retried.
        delete_es_index: If True, drop the index before ingesting.
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    # Resolve the docs directory against the project root and collect the files.
    logger.info(f"Fetching files from {docs_folder_path}...")
    source_dir = f"{settings.proj_root}/{docs_folder_path}"
    fetched_docs = FileFetcher().fetch(dir=source_dir)

    # Split the raw documents into index-ready chunks.
    logger.info("Processing docs...")
    chunks = process_documents(fetched_docs, docs_extension)

    # Push the chunks into Elasticsearch with the configured client settings.
    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    ingest_documents(chunks, es_index, es_request_timeout, es_max_retries,
                     es_retry_on_timeout, delete_es_index)

    logger.info(f"Finished ingesting in {es_index}.")


def _configure_stdlib_logging() -> None:
    """Route stdlib ``logging`` records (e.g. from third-party libraries) through
    a uniform timestamped format at INFO level; app logs themselves use loguru."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )


if __name__ == "__main__":
    _configure_stdlib_logging()
    try:
        app()
    except Exception as err:
        # Capture the full traceback via loguru, then re-raise so the process
        # still exits with a non-zero status.
        logger.exception(err)
        raise