feat: Update dependencies and enhance Elasticsearch ingestion pipeline

- Added new dependencies, including chonkie and markdown-it-py, to requirements.txt.
- Refactored the Elasticsearch ingestion script to read and concatenate documents from specified folders.
- Implemented semantic chunking for documents using the chonkie library.
- Removed the old elasticsearch_ingestion_from_docs.py script, as its functionality has been integrated into the main ingestion pipeline.
- Updated README.md to reflect the new project structure and environment variables.
- Added a new changelog entry for version 1.4.0 detailing recent changes and enhancements.
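For reference, a minimal sketch of how the chonkie `SemanticChunker` introduced by this commit is driven. The constructor arguments mirror the refactored ingestion script below; the model name is this repo's `HF_EMB_MODEL_NAME` choice, and the sample text and printed fields are illustrative only:

```python
# Minimal sketch (not part of this commit): exercising chonkie's SemanticChunker
# with the same arguments the refactored pipeline passes to it.
from chonkie import SemanticChunker

chunker = SemanticChunker(
    embedding_model="Qwen/Qwen3-Embedding-0.6B",  # HF_EMB_MODEL_NAME in this repo
    chunk_size=2048,       # script default: max tokens per chunk
    threshold=0.5,         # script default: semantic-similarity split threshold
    similarity_window=3,
    skip_window=1,
)

# chunker.chunk() returns chunk objects exposing the chunked text.
chunks = chunker.chunk("AVAP is a virtual API language. It defines commands ...")
for chunk in chunks:
    print(len(chunk.text), chunk.text[:60])
```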
Parent: f5b2df94d2
Commit: 2ad09cc77f
requirements.txt

@@ -26,6 +26,10 @@ certifi==2026.1.4
     # requests
 charset-normalizer==3.4.4
     # via requests
+chonkie==1.5.6
+    # via assistance-engine
+chonkie-core==0.9.2
+    # via chonkie
 click==8.3.1
     # via nltk
 colorama==0.4.6 ; sys_platform == 'win32'
@@ -81,12 +85,16 @@ idna==3.11
     # httpx
     # requests
     # yarl
+jinja2==3.1.6
+    # via model2vec
 jmespath==1.1.0
     # via
     #   boto3
     #   botocore
 joblib==1.5.3
-    # via nltk
+    # via
+    #   model2vec
+    #   nltk
 jsonpatch==1.33
     # via langchain-core
 jsonpointer==3.0.0
@@ -137,8 +145,16 @@ langsmith==0.7.6
     # langchain-core
 loguru==0.7.3
     # via assistance-engine
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via jinja2
 marshmallow==3.26.2
     # via dataclasses-json
+mdurl==0.1.2
+    # via markdown-it-py
+model2vec==0.7.0
+    # via chonkie
 multidict==6.7.1
     # via
     #   aiohttp
@@ -150,9 +166,12 @@ nltk==3.9.3
 numpy==2.4.2
     # via
     #   assistance-engine
+    #   chonkie
+    #   chonkie-core
     #   elasticsearch
     #   langchain-aws
     #   langchain-community
+    #   model2vec
     #   pandas
 ollama==0.6.1
     # via langchain-ollama
@@ -192,6 +211,8 @@ pydantic-core==2.41.5
     # via pydantic
 pydantic-settings==2.13.1
     # via langchain-community
+pygments==2.19.2
+    # via rich
 python-dateutil==2.9.0.post0
     # via
     #   botocore
@@ -220,10 +241,16 @@ requests==2.32.5
     #   requests-toolbelt
 requests-toolbelt==1.0.0
     # via langsmith
+rich==14.3.3
+    # via model2vec
 s3transfer==0.16.0
     # via boto3
+safetensors==0.7.0
+    # via model2vec
 setuptools==82.0.0
-    # via grpcio-tools
+    # via
+    #   grpcio-tools
+    #   model2vec
 simsimd==6.5.13
     # via elasticsearch
 six==1.17.0
@@ -234,14 +261,20 @@ sqlalchemy==2.0.46
     #   langchain-community
 tenacity==9.1.4
     # via
+    #   chonkie
     #   langchain-community
     #   langchain-core
 tokenizers==0.22.2
-    # via langchain-huggingface
+    # via
+    #   chonkie
+    #   langchain-huggingface
+    #   model2vec
 tqdm==4.67.3
     # via
     #   assistance-engine
+    #   chonkie
     #   huggingface-hub
+    #   model2vec
     #   nltk
 typing-extensions==4.15.0
     # via
README.md (35 changes)
@@ -47,26 +47,26 @@ graph TD
 ├── changelog                      # Version tracking and release history
 ├── pyproject.toml                 # Python project configuration
 ├── Docker/
+│   ├── protos/
+│   │   └── brunix.proto           # Protocol Buffers: The source of truth for the API
+│   ├── src/
+│   │   ├── graph.py               # Workflow graph orchestration
+│   │   ├── prompts.py             # Centralized prompt definitions
+│   │   ├── server.py              # gRPC Server & RAG Orchestration
+│   │   ├── state.py               # Shared state management
+│   │   └── utils/                 # Utility modules
 │   ├── Dockerfile                 # Container definition for the Engine
 │   ├── docker-compose.yaml        # Local orchestration for dev environment
-│   ├── requirements.txt           # Python dependencies for Docker
-│   ├── protos/
-│   │   └── brunix.proto           # Protocol Buffers: The source of truth for the API
-│   └── src/
-│       ├── graph.py               # Workflow graph orchestration
-│       ├── prompts.py             # Centralized prompt definitions
-│       ├── server.py              # gRPC Server & RAG Orchestration
-│       ├── state.py               # Shared state management
-│       └── utils/                 # Utility modules
-├── ingestion/
-│   └── docs/                      # AVAP documentation chunks
+│   ├── .dockerignore              # Files to ignore by Docker
+│   └── requirements.txt           # Python dependencies for Docker
+├── docs/                          # AVAP documentation
 ├── kubernetes/
 │   └── kubeconfig.yaml            # Kubernetes cluster configuration
-├── scripts/
-│   └── pipelines/
-│       └── flows/                 # Data processing flows
-└── src/
-    └── server.py                  # Core Logic: gRPC Server & RAG Orchestration
+└── scripts/
+    └── pipelines/
+        ├── flows/                 # Data processing flows
+        ├── tasks/                 # Function modules used by the flows
+        └── input/                 # Inputs used by the flows
 ```
 
 ---
@@ -136,6 +136,7 @@ OLLAMA_URL=http://host.docker.internal:11434
 OLLAMA_LOCAL_URL=http://localhost:11434
 OLLAMA_MODEL_NAME=qwen2.5:1.5b
 OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
+HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
 ```
 
 | Variable | Required | Description | Example |
@@ -151,6 +152,8 @@ OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
 | `OLLAMA_LOCAL_URL` | Yes | Ollama endpoint used for text generation/embeddings in local | `http://localhost:11434` |
 | `OLLAMA_MODEL_NAME` | Yes | Ollama model name for generation | `qwen2.5:1.5b` |
 | `OLLAMA_EMB_MODEL_NAME` | Yes | Ollama embeddings model name | `qwen3-0.6B-emb:latest` |
+| `HF_TOKEN` | Yes | Hugging Face secret token | `hf_...` |
+| `HF_EMB_MODEL_NAME` | Yes | Hugging Face embeddings model name | `Qwen/Qwen3-Embedding-0.6B` |
 
 > Never commit real secret values. Use placeholder values when sharing configuration examples.
 
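For completeness, a sketch of how the two new variables reach the code, mirroring the `os.getenv` pattern the refactored script already uses. Note that `HF_TOKEN` is not read explicitly in this commit; huggingface_hub honors it from the environment when it needs to authenticate:

```python
# Sketch of how the new variables are consumed (mirrors the refactored script).
import os

HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")  # e.g. "Qwen/Qwen3-Embedding-0.6B"

# HF_TOKEN is honored implicitly by huggingface_hub when downloading models,
# so it only needs to be present in the environment.
if not HF_EMB_MODEL_NAME:
    raise RuntimeError("HF_EMB_MODEL_NAME must be set (see the table above)")
```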
changelog (32 changes)
@@ -4,16 +4,36 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
 
 ---
 
+## [1.4.0] - 2026-03-11
+
+### Added
+- IMPLEMENTED:
+  - `scripts/pipelines/flows/translate_mbpp.py`: pipeline to generate a synthetic dataset from the mbpp dataset.
+  - `scripts/input/prompts.py`: module containing prompts for pipelines.
+  - `scripts/tasks/chunk.py`: module containing functions related to chunk management.
+  - `synthethic_datasets`: folder containing generated synthetic datasets.
+  - `src/config.py`: environment variables configuration file.
+
+### Changed
+- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre-chunked files.
+- RENAMED: `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
+- REMOVED: `Makefile` file.
+- REMOVED: `scripts/start-tunnels.sh` script.
+- REMOVED: `ingestion` folder.
+- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
+
+
+
 ## [1.3.0] - 2026-03-05
 
 ### Added
 - IMPLEMENTED:
-  - `src/utils/emb_factory`: factory modules created for embedding model generation.
-  - `src/utils/llm_factory`: factory modules created for LLM generation.
-  - `src/graph.py`: workflow graph orchestration module added.
-  - `src/prompts.py`: centralized prompt definitions added.
-  - `src/state.py`: shared state management module added.
-  - `pipelines/flows/elasticsearch_ingestion.py`: pipeline to populate the elasticsearch vector database.
+  - `Docker/src/utils/emb_factory`: factory modules created for embedding model generation.
+  - `Docker/src/utils/llm_factory`: factory modules created for LLM generation.
+  - `Docker/src/graph.py`: workflow graph orchestration module added.
+  - `Docker/src/prompts.py`: centralized prompt definitions added.
+  - `Docker/src/state.py`: shared state management module added.
+  - `scripts/pipelines/flows/elasticsearch_ingestion.py`: pipeline to populate the elasticsearch vector database.
   - `ingestion/docs`: folder containing all chunked AVAP documents.
 
 ### Changed
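The `translate_mbpp.py` pipeline named in the changelog is not part of this diff; a minimal sketch of the kind of loop it implies, with the dataset loaded from the Hugging Face hub and the prompt/translation step deliberately left abstract (every name below is illustrative, not the repo's actual code):

```python
# Illustrative-only sketch of a translate-MBPP flow; the actual
# scripts/pipelines/flows/translate_mbpp.py is not shown in this commit view.
from datasets import load_dataset

mbpp = load_dataset("mbpp", split="train")  # MBPP task descriptions + Python code

synthetic_rows = []
for row in mbpp:
    prompt = f"Translate this task into AVAP terms:\n{row['text']}"  # hypothetical prompt
    # translation = some_llm(prompt)  # LLM call intentionally left abstract
    synthetic_rows.append({"source": row["text"], "prompt": prompt})
```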
@@ -0,0 +1,129 @@
+import re
+import hashlib
+from typing import Any
+from enum import Enum
+import typer
+import logging
+import os
+from pathlib import Path
+
+from loguru import logger
+from elasticsearch import Elasticsearch
+from langchain_core.documents import Document
+from langchain_elasticsearch import ElasticsearchStore
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_experimental.text_splitter import SemanticChunker
+
+from src.utils.emb_factory import create_embedding_model
+from scripts.pipelines.tasks.chunk import scrape_avap_docs
+
+app = typer.Typer()
+
+ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
+OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
+ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
+OLLAMA_URL = os.getenv("OLLAMA_URL")
+OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
+AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
+
+
+class DistanceStrategy(str, Enum):
+    euclidean = "EUCLIDEAN_DISTANCE"
+    max_inner_product = "MAX_INNER_PRODUCT"
+    dot_product = "DOT_PRODUCT"
+    jaccard = "JACCARD"
+    cosine = "COSINE"
+
+
+def clean_text(text: str) -> str:
+    text = text.replace("\u00a0", " ")
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def build_documents_from_folder(
+    folder_path: str,
+) -> list[Document]:
+
+    folder = Path(folder_path)
+
+    if not folder.exists() or not folder.is_dir():
+        raise ValueError(f"Invalid folder path: {folder_path}")
+
+    all_documents: list[Document] = []
+
+    for file_path in folder.glob("*.txt"):
+        doc_text = file_path.read_text(encoding="utf-8")
+
+        if not doc_text.strip():
+            continue
+
+        metadata: dict[str, Any] = {
+            "source": file_path.name,
+        }
+
+        doc_text = clean_text(doc_text)
+        document = Document(
+            id=hashlib.md5(file_path.name.encode()).hexdigest(),
+            page_content=doc_text,
+            metadata={**metadata}
+        )
+
+        all_documents.append(document)
+
+    return all_documents
+
+
+@app.command()
+def elasticsearch_ingestion(
+    docs_folder_path: str = "ingestion/docs",
+    es_request_timeout: int = 120,
+    es_max_retries: int = 5,
+    es_retry_on_timeout: bool = True,
+    distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
+):
+    logger.info("Starting Elasticsearch ingestion pipeline...")
+    logger.info(f"Using docs folder path: {docs_folder_path}")
+    documents = build_documents_from_folder(folder_path=docs_folder_path)
+
+    logger.info("Connecting to Elasticsearch...")
+    try:
+        es = Elasticsearch(
+            ELASTICSEARCH_LOCAL_URL,
+            request_timeout=es_request_timeout,
+            max_retries=es_max_retries,
+            retry_on_timeout=es_retry_on_timeout,
+        )
+    except:
+        logger.exception("Failed to connect to Elasticsearch.")
+        raise
+
+    logger.info("Instantiating embeddings model...")
+    try:
+        embeddings = create_embedding_model(
+            provider="ollama",
+            model=OLLAMA_EMB_MODEL_NAME,
+            base_url=OLLAMA_LOCAL_URL,
+        )
+    except:
+        logger.exception("Failed to instantiate embeddings model.")
+        raise
+
+    logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
+    ElasticsearchStore.from_documents(
+        documents,
+        embeddings,
+        client=es,
+        index_name=ELASTICSEARCH_INDEX,
+        distance_strategy=distance_strategy.value,
+    )
+    logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    )
+    try:
+        app()
+    except Exception as exc:
+        logger.exception(exc)
+        raise
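This view does not show the added file's path; assuming it is importable, the Typer app above can be exercised programmatically like so (the module path and flag values are illustrative):

```python
# Illustrative only: the module path below is assumed, not shown in this view.
from typer.testing import CliRunner

from scripts.pipelines.flows.elasticsearch_ingestion import app  # assumed path

runner = CliRunner()
# Typer maps each keyword argument of the command to a --kebab-case option.
result = runner.invoke(app, ["--docs-folder-path", "ingestion/docs", "--es-max-retries", "3"])
print(result.exit_code, result.output)
```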
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "id": "f8ea7a75",
 "metadata": {},
 "outputs": [],
@@ -24,7 +24,7 @@
 "from chonkie import SemanticChunker\n",
 "\n",
 "from src.utils.emb_factory import create_embedding_model\n",
-"from scripts.pipelines.tasks.chunk import read_concat_files, get_chunk_docs, chunks_to_document\n",
+"from scripts.pipelines.tasks.chunk import read_files, get_chunk_docs, convert_chunks_to_document\n",
 "from src.config import PROJ_ROOT\n",
 "\n",
 "ELASTICSEARCH_LOCAL_URL = os.getenv(\"ELASTICSEARCH_LOCAL_URL\")\n",
@@ -37,7 +37,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 2,
 "id": "a8b8de3f",
 "metadata": {},
 "outputs": [
@@ -45,8 +45,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2026-03-10 13:58:32.657\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n",
-"\u001b[32m2026-03-10 13:58:32.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n"
+"\u001b[32m2026-03-10 15:15:53.994\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mStarting Elasticsearch ingestion pipeline...\u001b[0m\n",
+"\u001b[32m2026-03-10 15:15:53.996\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mReading and concatenating files from folder: docs/developer.avapframework.com\u001b[0m\n"
 ]
 },
 {
@@ -55,7 +55,7 @@
 "text": [
 "1\n",
 "1\n",
-"14\n",
+"0\n",
 "24\n"
 ]
 }
@@ -65,8 +65,8 @@
 "\n",
 "logger.info(\"Starting Elasticsearch ingestion pipeline...\")\n",
 "logger.info(f\"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com\")\n",
-"avap_github_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
-"avap_web_docs_intro = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"intro\", concatenate=True)\n",
+"avap_github_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/avap_language_github_docs\", \"AVAP\", concatenate=False)\n",
+"avap_web_docs_intro = read_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"intro\", concatenate=True)\n",
 "\n",
 "# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter\n",
 "chapters = sorted({\n",
@@ -77,14 +77,14 @@
 "avap_web_docs_chapters = [\n",
 "    item\n",
 "    for chapter in chapters\n",
-"    for item in read_concat_files(\n",
+"    for item in read_files(\n",
 "        f\"{docs_folder_path}/developer.avapframework.com\",\n",
 "        f\"{chapter}_\",\n",
 "        concatenate=True\n",
 "    )\n",
 "]\n",
-"avap_web_docs_appendices = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"appendices_\", concatenate=False)\n",
-"avap_examples_docs = read_concat_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
+"avap_web_docs_appendices = read_files(PROJ_ROOT / f\"{docs_folder_path}/developer.avapframework.com\", \"appendices_\", concatenate=False)\n",
+"avap_examples_docs = read_files(PROJ_ROOT / f\"{docs_folder_path}/samples\", concatenate=False)\n",
 "\n",
 "print(len(avap_github_docs))\n",
 "print(len(avap_web_docs_intro))\n",
@@ -94,7 +94,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 3,
 "id": "36abc025",
 "metadata": {},
 "outputs": [
@@ -167,7 +167,7 @@
 " 'title': 'validacion_in_pertenece_a_lista.avap'}]"
 ]
 },
-"execution_count": 12,
+"execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -194,7 +194,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 4,
 "id": "27e5774d",
 "metadata": {},
 "outputs": [
@@ -202,7 +202,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2026-03-10 13:58:34.531\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n"
+"\u001b[32m2026-03-10 15:15:54.053\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking documents...\u001b[0m\n"
 ]
 }
 ],
@@ -218,7 +218,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 5,
 "id": "a5ce984e",
 "metadata": {},
 "outputs": [
@@ -226,39 +226,25 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2026-03-10 13:58:51.740\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:00.535\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:00.536\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:09.128\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter1_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:12.763\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter2_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:42.995\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter3_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter4_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:48.772\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter5_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:01:48.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter6_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:02:06.408\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter7_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:02:21.501\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter8_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:07:27.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter9_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:07:48.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter10_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:08:10.823\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter11_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:08:27.335\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter12_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:08:55.010\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter13_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mchunk_docs\u001b[0m:\u001b[36m181\u001b[0m - \u001b[1mFinished chunking chapter14_\u001b[0m\n",
-"\u001b[32m2026-03-10 14:09:10.211\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n"
+"\u001b[32m2026-03-10 15:16:04.305\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mChunking AVAP GitHub docs...\u001b[0m\n",
+"\u001b[32m2026-03-10 15:20:42.896\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mscripts.pipelines.tasks.chunk\u001b[0m:\u001b[36mget_chunk_docs\u001b[0m:\u001b[36m102\u001b[0m - \u001b[1mFinished chunking AVAP.md\u001b[0m\n",
+"\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mChunking AVAP web docs chapters...\u001b[0m\n",
+"\u001b[32m2026-03-10 15:20:42.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m7\u001b[0m - \u001b[1mCreating Langchain Document to index...\u001b[0m\n"
 ]
 }
 ],
 "source": [
 "logger.info(\"Chunking AVAP GitHub docs...\")\n",
-"avap_github_docs_chunks = chunk_docs(avap_github_docs, chunker)\n",
+"avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)\n",
 "\n",
 "logger.info(\"Chunking AVAP web docs chapters...\")\n",
-"avap_web_docs_chapters_chunks = chunk_docs(avap_web_docs_chapters, chunker)\n",
+"# avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)\n",
 "\n",
 "logger.info(\"Creating Langchain Document to index...\")\n",
-"avap_github_langchain_docs = chunks_to_document(avap_github_docs_chunks)\n",
-"avap_web_chapters_langchain_docs = chunks_to_document(avap_web_docs_chapters_chunks)\n",
-"avap_web_intro_langchain_docs = chunks_to_document(avap_web_docs_intro)\n",
-"avap_web_appendices_langchain_docs = chunks_to_document(avap_web_docs_appendices)"
+"avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)\n",
+"# avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)\n",
+"avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)\n",
+"avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)"
 ]
 },
 {
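The `scripts/pipelines/tasks/chunk.py` module referenced throughout the notebook is not part of this view; a plausible shape for `convert_chunks_to_document`, assuming chonkie chunks carrying a `.text` attribute and items carrying a `title` key (as the notebook output above suggests), might look like this. It is a hypothetical sketch, not the repo's actual implementation:

```python
# Hypothetical sketch of convert_chunks_to_document (the real module is not
# shown in this diff); it maps chunker output onto LangChain Documents.
import hashlib

from langchain_core.documents import Document


def convert_chunks_to_document(chunked_items: list[dict]) -> list[Document]:
    documents = []
    for item in chunked_items:
        for i, chunk in enumerate(item["chunks"]):  # assumed structure: chonkie chunks
            documents.append(Document(
                id=hashlib.md5(f"{item['title']}-{i}".encode()).hexdigest(),
                page_content=chunk.text,
                metadata={"title": item["title"]},
            ))
    return documents
```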
@@ -1,6 +1,3 @@
-import re
-import hashlib
-from typing import Any
 from enum import Enum
 import typer
 import logging
@@ -9,13 +6,15 @@ from pathlib import Path
 
 from loguru import logger
 from elasticsearch import Elasticsearch
-from langchain_core.documents import Document
 from langchain_elasticsearch import ElasticsearchStore
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_experimental.text_splitter import SemanticChunker
+from chonkie import SemanticChunker
 
 from src.utils.emb_factory import create_embedding_model
-from scripts.pipelines.tasks.chunk import scrape_avap_docs
+from scripts.pipelines.tasks.chunk import (
+    read_files,
+    get_chunk_docs,
+    convert_chunks_to_document
+)
 
 app = typer.Typer()
@@ -25,6 +24,7 @@ ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
 OLLAMA_URL = os.getenv("OLLAMA_URL")
 OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
 AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
+HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
 
 class DistanceStrategy(str, Enum):
     euclidean = "EUCLIDEAN_DISTANCE"
@@ -33,55 +33,65 @@ class DistanceStrategy(str, Enum):
     jaccard = "JACCARD"
     cosine = "COSINE"
 
-def clean_text(text: str) -> str:
-    text = text.replace("\u00a0", " ")
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-
-
-def build_documents_from_folder(
-    folder_path: str,
-) -> list[Document]:
-
-    folder = Path(folder_path)
-
-    if not folder.exists() or not folder.is_dir():
-        raise ValueError(f"Invalid folder path: {folder_path}")
-
-    all_documents: list[Document] = []
-
-    for file_path in folder.glob("*.txt"):
-        doc_text = file_path.read_text(encoding="utf-8")
-
-        if not doc_text.strip():
-            continue
-
-        metadata: dict[str, Any] = {
-            "source": file_path.name,
-        }
-
-        doc_text = clean_text(doc_text)
-        document = Document(
-            id=hashlib.md5(file_path.name.encode()).hexdigest(),
-            page_content=doc_text,
-            metadata={**metadata}
-        )
-
-        all_documents.append(document)
-
-    return all_documents
-
-
 @app.command()
 def elasticsearch_ingestion(
-    docs_folder_path: str = "ingestion/docs",
+    docs_folder_path: str = "docs",
     es_request_timeout: int = 120,
     es_max_retries: int = 5,
     es_retry_on_timeout: bool = True,
     distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
-):
+    chunk_size: int = 2048,
+    chunk_threshold: float = 0.5,
+    chunk_similarity_window: int = 3,
+    chunk_skip_window: int = 1,
+):
     logger.info("Starting Elasticsearch ingestion pipeline...")
-    logger.info(f"Using docs folder path: {docs_folder_path}")
-    documents = build_documents_from_folder(folder_path=docs_folder_path)
+    logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
+    avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
+    avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
+
+    # Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
+    chapters = sorted({
+        p.name.split("_")[0]
+        for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
+    })
+
+    avap_web_docs_chapters = [
+        item
+        for chapter in chapters
+        for item in read_files(
+            f"{docs_folder_path}/developer.avapframework.com",
+            f"{chapter}_",
+            concatenate=True
+        )
+    ]
+
+    avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
+    avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
+
+    logger.info("Instantiating semantic chunker...")
+    chunker = SemanticChunker(
+        embedding_model=HF_EMB_MODEL_NAME,
+        chunk_size=chunk_size,
+        threshold=chunk_threshold,
+        similarity_window=chunk_similarity_window,
+        skip_window=chunk_skip_window
+    )
+
+    logger.info("Chunking AVAP GitHub docs...")
+    avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
+
+    logger.info("Chunking AVAP web docs chapters...")
+    avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
+
+    logger.info("Creating Langchain Document to index...")
+    avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
+    avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
+    avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
+    avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
+    avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
+    avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
+
     logger.info("Connecting to Elasticsearch...")
     try:
@@ -105,10 +115,14 @@ def elasticsearch_ingestion(
     except:
         logger.exception("Failed to instantiate embeddings model.")
         raise
 
+    logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
+    if es.indices.exists(index=ELASTICSEARCH_INDEX):
+        es.indices.delete(index=ELASTICSEARCH_INDEX)
+
     logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
     ElasticsearchStore.from_documents(
-        documents,
+        avap_documents,
         embeddings,
         client=es,
         index_name=ELASTICSEARCH_INDEX,
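Since the refactor now drops and recreates the index on every run, a quick way to sanity-check an ingestion afterwards, using only stock Elasticsearch client calls and the same environment variables as the script:

```python
# Post-ingestion sanity check using the standard Elasticsearch client API.
import os

from elasticsearch import Elasticsearch

es = Elasticsearch(os.getenv("ELASTICSEARCH_LOCAL_URL"))
index = os.getenv("ELASTICSEARCH_INDEX")

es.indices.refresh(index=index)  # make just-indexed documents searchable
print(es.count(index=index)["count"], "documents in", index)
```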
@@ -1,143 +0,0 @@
-from enum import Enum
-import typer
-import logging
-import os
-from pathlib import Path
-
-from loguru import logger
-from elasticsearch import Elasticsearch
-from langchain_elasticsearch import ElasticsearchStore
-from chonkie import SemanticChunker
-
-from src.utils.emb_factory import create_embedding_model
-from scripts.pipelines.tasks.chunk import (
-    read_files,
-    get_chunk_docs,
-    convert_chunks_to_document
-)
-
-app = typer.Typer()
-
-ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
-OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
-ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
-OLLAMA_URL = os.getenv("OLLAMA_URL")
-OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
-AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
-HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
-
-
-class DistanceStrategy(str, Enum):
-    euclidean = "EUCLIDEAN_DISTANCE"
-    max_inner_product = "MAX_INNER_PRODUCT"
-    dot_product = "DOT_PRODUCT"
-    jaccard = "JACCARD"
-    cosine = "COSINE"
-
-
-@app.command()
-def elasticsearch_ingestion(
-    docs_folder_path: str = "docs",
-    es_request_timeout: int = 120,
-    es_max_retries: int = 5,
-    es_retry_on_timeout: bool = True,
-    distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
-    chunk_size: int = 2048,
-    chunk_threshold: float = 0.5,
-    chunk_similarity_window: int = 3,
-    chunk_skip_window: int = 1,
-):
-    logger.info("Starting Elasticsearch ingestion pipeline...")
-    logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
-    avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
-    avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
-
-    # Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
-    chapters = sorted({
-        p.name.split("_")[0]
-        for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
-    })
-
-    avap_web_docs_chapters = [
-        item
-        for chapter in chapters
-        for item in read_files(
-            f"{docs_folder_path}/developer.avapframework.com",
-            f"{chapter}_",
-            concatenate=True
-        )
-    ]
-
-    avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
-    avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
-
-    logger.info("Instantiating semantic chunker...")
-    chunker = SemanticChunker(
-        embedding_model=HF_EMB_MODEL_NAME,
-        chunk_size=chunk_size,
-        threshold=chunk_threshold,
-        similarity_window=chunk_similarity_window,
-        skip_window=chunk_skip_window
-    )
-
-    logger.info("Chunking AVAP GitHub docs...")
-    avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
-
-    logger.info("Chunking AVAP web docs chapters...")
-    avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
-
-    logger.info("Creating Langchain Document to index...")
-    avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
-    avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
-    avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
-    avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
-    avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
-    avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
-
-    logger.info("Connecting to Elasticsearch...")
-    try:
-        es = Elasticsearch(
-            ELASTICSEARCH_LOCAL_URL,
-            request_timeout=es_request_timeout,
-            max_retries=es_max_retries,
-            retry_on_timeout=es_retry_on_timeout,
-        )
-    except:
-        logger.exception("Failed to connect to Elasticsearch.")
-        raise
-
-    logger.info("Instantiating embeddings model...")
-    try:
-        embeddings = create_embedding_model(
-            provider="ollama",
-            model=OLLAMA_EMB_MODEL_NAME,
-            base_url=OLLAMA_LOCAL_URL,
-        )
-    except:
-        logger.exception("Failed to instantiate embeddings model.")
-        raise
-
-    logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
-    if es.indices.exists(index=ELASTICSEARCH_INDEX):
-        es.indices.delete(index=ELASTICSEARCH_INDEX)
-
-    logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
-    ElasticsearchStore.from_documents(
-        avap_documents,
-        embeddings,
-        client=es,
-        index_name=ELASTICSEARCH_INDEX,
-        distance_strategy=distance_strategy.value,
-    )
-    logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
-    )
-    try:
-        app()
-    except Exception as exc:
-        logger.exception(exc)
-        raise