Refactor Elasticsearch ingestion pipeline and add MBPP generation script

- Updated `elasticsearch_ingestion.py` to streamline document processing and ingestion into Elasticsearch.
- Introduced `generate_mbap.py` for generating benchmark problems in the AVAP language from a provided LRM.
- Created `prompts.py` to define prompts for converting Python problems to AVAP.
- Enhanced chunk processing in `chunk.py` to support markdown and AVAP documents.
- Added `OllamaEmbeddings` class in `embeddings.py` for handling embeddings with Ollama model.
- Updated dependencies in `uv.lock` to include new packages and versions.
This commit is contained in:
acano 2026-03-11 17:17:44 +01:00
parent 3caed4deb6
commit 5f21544e0b
13 changed files with 486 additions and 5808 deletions

View File

@ -47,11 +47,12 @@ graph TD
├── changelog # Version tracking and release history ├── changelog # Version tracking and release history
├── pyproject.toml # Python project configuration ├── pyproject.toml # Python project configuration
├── docs/ ├── docs/
| ├── AVAP Language: ... # AVAP DSL Documentation ├── AVAP Language: ... # AVAP DSL Documentation
| | └── AVAP.md │ │ └── AVAP.md
│ ├── developer.avapfr... # Documents on developer web page │ ├── developer.avapfr... # Documents on developer web page
| └── LRM/ # AVAP LRM documentation │ ├── LRM/ # AVAP LRM documentation
| └── avap.md │ │ └── avap.md
│ └── samples/ # AVAP code samples
├── Docker/ ├── Docker/
│ ├── protos/ │ ├── protos/
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API │ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
@ -64,26 +65,17 @@ graph TD
│ ├── Dockerfile # Container definition for the Engine │ ├── Dockerfile # Container definition for the Engine
│ ├── docker-compose.yaml # Local orchestration for dev environment │ ├── docker-compose.yaml # Local orchestration for dev environment
│ ├── requirements.txt # Python dependencies for Docker │ ├── requirements.txt # Python dependencies for Docker
│ ├── protos/ │ └── .dockerignore # Docker ignore files
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
│ └── src/
│ ├── graph.py # Workflow graph orchestration
│ ├── prompts.py # Centralized prompt definitions
│ ├── server.py # gRPC Server & RAG Orchestration
│ ├── state.py # Shared state management
│ └── utils/ # Utility modules
├── ingestion/
│ └── docs/ # AVAP documentation chunks
├── kubernetes/ ├── kubernetes/
│ └── kubeconfig.yaml # Kubernetes cluster configuration │ └── kubeconfig.yaml # Kubernetes cluster configuration
├── scripts/ ├── scripts/
│ └── pipelines/ │ └── pipelines/
| ├── samples_generator/ # AVAP Sample generator │ ├── flows/ # Processing pipelines
| | └─ generate_mbap.py │ ├── tasks/ # Modules used by the flows
│ └── flows/ # Data processing flows │ └── inputs/ # Inputs used by the flows
| └─ elasticsearch_ingestion.py
└── src/ └── src/
├── __init__.py ├── __init__.py
├── config.py # Environment variables configuration file
└── utils/ └── utils/
├── emb_factory.py # Embedding model factory ├── emb_factory.py # Embedding model factory
├── llm_factory.py # LLM model factory ├── llm_factory.py # LLM model factory
@ -157,6 +149,7 @@ OLLAMA_URL=http://host.docker.internal:11434
OLLAMA_LOCAL_URL=http://localhost:11434 OLLAMA_LOCAL_URL=http://localhost:11434
OLLAMA_MODEL_NAME=qwen2.5:1.5b OLLAMA_MODEL_NAME=qwen2.5:1.5b
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
HF_TOKEN=hf_...
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
``` ```
@ -183,13 +176,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
```bash ```bash
# 1. AI Model Tunnel (Ollama) # 1. AI Model Tunnel (Ollama)
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml & kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
# 2. Knowledge Base Tunnel (Elasticsearch) # 2. Knowledge Base Tunnel (Elasticsearch)
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml & kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
# 3. Observability DB Tunnel (PostgreSQL) # 3. Observability DB Tunnel (PostgreSQL)
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml & kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
``` ```
### 5. Launch the Engine ### 5. Launch the Engine

View File

@ -15,13 +15,13 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
- `src/config.py`: environment variables configuration file. - `src/config.py`: environment variables configuration file.
### Changed ### Changed
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files. - REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`. - RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
- REMOVED: `Makefile` file. - REMOVED: `Makefile` file.
- REMOVED: `scripts/start-tunnels.sh` script. - REMOVED: `scripts/start-tunnels.sh` script.
- REMOVED `ingestion` folder. - REMOVED `ingestion` folder.
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules. - DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py` - MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
## [1.4.0] - 2026-03-10 ## [1.4.0] - 2026-03-10

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,6 @@ description = "Add your description here"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = ">=3.11"
dependencies = [ dependencies = [
"chonkie[semantic]>=1.5.6",
"grpcio>=1.78.0", "grpcio>=1.78.0",
"grpcio-reflection>=1.78.0", "grpcio-reflection>=1.78.0",
"grpcio-tools>=1.78.0", "grpcio-tools>=1.78.0",
@ -28,7 +27,9 @@ dependencies = [
dev = [ dev = [
"beir>=2.2.0", "beir>=2.2.0",
"boto3>=1.42.58", "boto3>=1.42.58",
"chonkie[elastic,semantic]>=1.6.0",
"evidently>=0.7.20", "evidently>=0.7.20",
"flatbuffers>=25.12.19",
"jupyter>=1.1.1", "jupyter>=1.1.1",
"langfuse<3", "langfuse<3",
"litellm>=1.82.0", "litellm>=1.82.0",

View File

@ -1,30 +1,29 @@
import re
import hashlib
from typing import Any
from enum import Enum from enum import Enum
import typer import typer
import logging import logging
import os import os
from pathlib import Path
from loguru import logger from loguru import logger
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore from langchain_elasticsearch import ElasticsearchStore
from langchain_community.embeddings import HuggingFaceEmbeddings from chonkie import SemanticChunker, MarkdownChef
from langchain_experimental.text_splitter import SemanticChunker from transformers import AutoTokenizer
from src.utils.emb_factory import create_embedding_model from src.utils.emb_factory import create_embedding_model
from scripts.pipelines.tasks.chunk import scrape_avap_docs from scripts.pipelines.tasks.chunk import (
read_files,
get_chunk_docs,
convert_chunks_to_document
)
app = typer.Typer() app = typer.Typer()
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL") ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL") OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
OLLAMA_URL = os.getenv("OLLAMA_URL") OLLAMA_URL = os.getenv("OLLAMA_URL")
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME") OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL") AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
class DistanceStrategy(str, Enum): class DistanceStrategy(str, Enum):
euclidean = "EUCLIDEAN_DISTANCE" euclidean = "EUCLIDEAN_DISTANCE"
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
jaccard = "JACCARD" jaccard = "JACCARD"
cosine = "COSINE" cosine = "COSINE"
def clean_text(text: str) -> str:
text = text.replace("\u00a0", " ")
text = re.sub(r"\s+", " ", text).strip()
return text
def build_documents_from_folder(
folder_path: str,
) -> list[Document]:
folder = Path(folder_path)
if not folder.exists() or not folder.is_dir():
raise ValueError(f"Invalid folder path: {folder_path}")
all_documents: list[Document] = []
for file_path in folder.glob("*.txt"):
doc_text = file_path.read_text(encoding="utf-8")
if not doc_text.strip():
continue
metadata: dict[str, Any] = {
"source": file_path.name,
}
doc_text = clean_text(doc_text)
document = Document(
id=hashlib.md5(file_path.name.encode()).hexdigest(),
page_content=doc_text,
metadata={**metadata}
)
all_documents.append(document)
return all_documents
@app.command() @app.command()
def elasticsearch_ingestion( def elasticsearch_ingestion(
docs_folder_path: str = "ingestion/docs", docs_folder_path: str = "docs",
es_index: str = "avap-docs-test-v2",
es_request_timeout: int = 120, es_request_timeout: int = 120,
es_max_retries: int = 5, es_max_retries: int = 5,
es_retry_on_timeout: bool = True, es_retry_on_timeout: bool = True,
distance_strategy: DistanceStrategy = DistanceStrategy.cosine, distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
chunk_size: int = 2048,
chunk_threshold: float = 0.5,
chunk_similarity_window: int = 3,
chunk_skip_window: int = 1,
): ):
logger.info("Starting Elasticsearch ingestion pipeline...") logger.info("Starting Elasticsearch ingestion pipeline...")
logger.info(f"Using docs folder path: {docs_folder_path}") logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
documents = build_documents_from_folder(folder_path=docs_folder_path) avap_code_docs = read_files(f"{docs_folder_path}/samples")
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
logger.info("Instantiating semantic chunker and chef...")
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
chef = MarkdownChef(tokenizer=custom_tokenizer)
chunker = SemanticChunker(
embedding_model=HF_EMB_MODEL_NAME,
chunk_size=chunk_size,
threshold=chunk_threshold,
similarity_window=chunk_similarity_window,
skip_window=chunk_skip_window
)
logger.info("Processing Markdown docs with chef...")
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
logger.info("Chunking AVAP Language docs...")
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
logger.info("Creating Langchain Document to index...")
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
logger.info("Connecting to Elasticsearch...") logger.info("Connecting to Elasticsearch...")
try: try:
@ -106,15 +95,19 @@ def elasticsearch_ingestion(
logger.exception("Failed to instantiate embeddings model.") logger.exception("Failed to instantiate embeddings model.")
raise raise
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...") logger.info(f"Checking if index {es_index} exists and deleting if it does...")
if es.indices.exists(index=es_index):
es.indices.delete(index=es_index)
logger.info(f"Uploading documents to index {es_index}...")
ElasticsearchStore.from_documents( ElasticsearchStore.from_documents(
documents, avap_documents,
embeddings, embeddings,
client=es, client=es,
index_name=ELASTICSEARCH_INDEX, index_name=es_index,
distance_strategy=distance_strategy.value, distance_strategy=distance_strategy.value,
) )
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.") logger.info(f"Finished uploading documents to index {es_index}.")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -0,0 +1,122 @@
import typer
import logging
from loguru import logger
from elasticsearch import Elasticsearch
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
from transformers import AutoTokenizer
from src.config import settings
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
from scripts.pipelines.tasks.chunk import merge_markdown_document
app = typer.Typer()
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
chunk_threshold: float | None,
chunk_similarity_window: int| None,
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
"""
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
Args:
docs_extension (str): The file extension of the documents to be ingested.
chunk_size (int): The size of the chunks to be created.
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
chunk_similarity_window (int, optional): The similarity window for semantic chunking
chunk_skip_window (int, optional): The skip window for semantic chunking.
Returns:
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
"""
if docs_extension == ".md":
process_type = "markdown"
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
process_kwargs = {"tokenizer": custom_tokenizer}
# process_type = "text"
# process_kwargs = {}
chunk_strat = "semantic"
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
elif docs_extension == ".avap":
process_type = "text"
process_kwargs = {}
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
chunk_kwargs = {"chunk_size": chunk_size}
return process_type, process_kwargs, chunk_strat, chunk_kwargs
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/LRM",
    docs_extension: str = ".md",
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True,
    chunk_size: int = 2048,
    chunk_threshold: float | None = 0.5,
    chunk_similarity_window: int | None = 3,
    chunk_skip_window: int | None = 1,
):
    """
    Ingest AVAP documentation into Elasticsearch.

    Fetches the files under ``docs_folder_path``, processes and chunks them
    according to ``docs_extension``, and writes the resulting chunks (embedded
    with the Ollama embedding model) into ``es_index``, optionally deleting any
    pre-existing index with the same name first.

    Args:
        docs_folder_path: Folder (relative to the project root) to ingest.
        docs_extension: Extension of the documents (".md" or ".avap").
        es_index: Target Elasticsearch index name.
        es_request_timeout: Per-request timeout for the Elasticsearch client.
        es_max_retries: Max retries for the Elasticsearch client.
        es_retry_on_timeout: Whether the client retries on timeouts.
        delete_es_index: Delete the index before ingesting, if it exists.
        chunk_size: Chunk size passed to the chunking configuration.
        chunk_threshold: Semantic-chunking threshold (markdown only).
        chunk_similarity_window: Semantic-chunking similarity window.
        chunk_skip_window: Semantic-chunking skip window.
    """
    logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )
    # Recreate the index from scratch so re-runs do not duplicate documents.
    if delete_es_index and es.indices.exists(index=es_index):
        logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
        es.indices.delete(index=es_index)

    logger.info("Starting Elasticsearch ingestion pipeline...")
    (process_type,
     process_kwargs,
     chunk_strat,
     chunk_kwargs) = get_processing_and_chunking_config(
        docs_extension, chunk_size, chunk_threshold,
        chunk_similarity_window, chunk_skip_window,
    )

    logger.info(f"Fetching files from {docs_folder_path}...")
    fetcher = FileFetcher()
    docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")

    logger.info(f"Processing documents with process_type: {process_type}...")
    # Reuse the tokenizer the config helper already loaded instead of calling
    # AutoTokenizer.from_pretrained a second time (the previous version did both).
    tokenizer = process_kwargs.get("tokenizer") or AutoTokenizer.from_pretrained(
        settings.hf_emb_model_name
    )
    # NOTE(review): only MarkdownChef is imported here, so documents are
    # processed as markdown even when process_type is "text" (.avap) — confirm
    # whether a TextChef path should be wired in.
    chef = MarkdownChef(tokenizer=tokenizer)
    processed_docs = [chef.process(doc) for doc in docs]

    logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
    # NOTE(review): chunk_strat/chunk_kwargs are computed but not used — the
    # actual chunking comes from MarkdownChef + merge_markdown_document.
    fused_docs = [merge_markdown_document(processed_doc) for processed_doc in processed_docs]

    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    handshake = ElasticHandshake(
        client=es,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
    )
    for fused_doc in fused_docs:
        handshake.write(fused_doc.chunks)

    logger.info(f"Finished ingesting in {es_index}.")
if __name__ == "__main__":
    # Route stdlib logging through a timestamped format so records emitted by
    # third-party libraries are readable alongside loguru's output.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        level=logging.INFO,
    )
    try:
        app()
    except Exception as exc:
        # Log the full traceback before letting the process exit non-zero.
        logger.exception(exc)
        raise

View File

@ -1,134 +1,38 @@
from enum import Enum
import typer import typer
import logging import logging
import os
from pathlib import Path
from loguru import logger from loguru import logger
from elasticsearch import Elasticsearch from chonkie import FileFetcher
from langchain_elasticsearch import ElasticsearchStore
from chonkie import SemanticChunker
from src.utils.emb_factory import create_embedding_model from src.config import settings
from scripts.pipelines.tasks.chunk import ( from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
read_files,
get_chunk_docs,
convert_chunks_to_document
)
app = typer.Typer() app = typer.Typer()
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
OLLAMA_URL = os.getenv("OLLAMA_URL")
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
class DistanceStrategy(str, Enum):
euclidean = "EUCLIDEAN_DISTANCE"
max_inner_product = "MAX_INNER_PRODUCT"
dot_product = "DOT_PRODUCT"
jaccard = "JACCARD"
cosine = "COSINE"
@app.command() @app.command()
def elasticsearch_ingestion( def elasticsearch_ingestion(
docs_folder_path: str = "docs", docs_folder_path: str = "docs/LRM",
docs_extension: str = ".md",
es_index: str = "avap-docs-test-v3",
es_request_timeout: int = 120, es_request_timeout: int = 120,
es_max_retries: int = 5, es_max_retries: int = 5,
es_retry_on_timeout: bool = True, es_retry_on_timeout: bool = True,
distance_strategy: DistanceStrategy = DistanceStrategy.cosine, delete_es_index: bool = True
chunk_size: int = 2048,
chunk_threshold: float = 0.5,
chunk_similarity_window: int = 3,
chunk_skip_window: int = 1,
): ):
logger.info("Starting Elasticsearch ingestion pipeline...") logger.info("Starting Elasticsearch ingestion pipeline...")
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com") logger.info(f"Fetching files from {docs_folder_path}...")
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False) fetcher = FileFetcher()
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True) docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter logger.info("Processing docs...")
chapters = sorted({ chunked_docs = process_documents(docs_path, docs_extension)
p.name.split("_")[0]
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
})
avap_web_docs_chapters = [ logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
item ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
for chapter in chapters es_retry_on_timeout, delete_es_index)
for item in read_files(
f"{docs_folder_path}/developer.avapframework.com",
f"{chapter}_",
concatenate=True
)
]
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False) logger.info(f"Finished ingesting in {es_index}.")
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
logger.info("Instantiating semantic chunker...")
chunker = SemanticChunker(
embedding_model=HF_EMB_MODEL_NAME,
chunk_size=chunk_size,
threshold=chunk_threshold,
similarity_window=chunk_similarity_window,
skip_window=chunk_skip_window
)
logger.info("Chunking AVAP GitHub docs...")
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
logger.info("Chunking AVAP web docs chapters...")
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
logger.info("Creating Langchain Document to index...")
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
logger.info("Connecting to Elasticsearch...")
try:
es = Elasticsearch(
ELASTICSEARCH_LOCAL_URL,
request_timeout=es_request_timeout,
max_retries=es_max_retries,
retry_on_timeout=es_retry_on_timeout,
)
except:
logger.exception("Failed to connect to Elasticsearch.")
raise
logger.info("Instantiating embeddings model...")
try:
embeddings = create_embedding_model(
provider="ollama",
model=OLLAMA_EMB_MODEL_NAME,
base_url=OLLAMA_LOCAL_URL,
)
except:
logger.exception("Failed to instantiate embeddings model.")
raise
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
if es.indices.exists(index=ELASTICSEARCH_INDEX):
es.indices.delete(index=ELASTICSEARCH_INDEX)
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
ElasticsearchStore.from_documents(
avap_documents,
embeddings,
client=es,
index_name=ELASTICSEARCH_INDEX,
distance_strategy=distance_strategy.value,
)
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,136 +1,159 @@
import os from copy import deepcopy
import re from dataclasses import replace
import uuid from pathlib import Path
from loguru import logger from chonkie import (
from chonkie import Chunk, SemanticChunker Chunk,
from langchain_core.documents import Document ElasticHandshake,
MarkdownChef,
TextChef,
def replace_javascript_with_avap(text: str) -> str: TokenChunker,
"""
Replace mentions of javascript language with avap in the text.
Handles code blocks, language identifiers, and references.
Args:
text: The text to process.
Returns:
The text with javascript references replaced with avap.
"""
# Replace ```javascript with ```avap
text = text.replace("```javascript", "```avap")
# Replace ```js with ```avap
text = text.replace("```js", "```avap")
# Replace common phrases (case-insensitive)
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
text = re.sub(
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
) )
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE) from elasticsearch import Elasticsearch
from loguru import logger
from transformers import AutoTokenizer
return text from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
from src.config import settings
def read_files( def _get_text(element) -> str:
folder_path: str, file_prefix: str | None = None, concatenate: bool = True for attr in ("text", "content", "markdown"):
) -> list[dict]: value = getattr(element, attr, None)
""" if isinstance(value, str):
Read files in a folder whose names start with a given prefix. return value
Replaces javascript language markers with avap. raise AttributeError(
f"Could not extract text from element of type {type(element).__name__}"
Args: )
folder_path: Path to the folder to search in.
file_prefix: The prefix that file names must start with.
If None, all files in the folder are included.
concatenate: Whether to concatenate the contents of the files.
Returns:
A list of dictionaries, each containing 'content' and 'title' keys.
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
If concatenate is False, returns one dict per file with filename as title.
"""
contents = []
filenames = []
for filename in sorted(os.listdir(folder_path)):
include_file = file_prefix is None or filename.startswith(file_prefix)
if include_file:
file_path = os.path.join(folder_path, filename)
if os.path.isfile(file_path):
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
cleaned_content = content.strip()
if cleaned_content:
contents.append(cleaned_content)
filenames.append(filename)
if concatenate:
concatenated = "\n".join(contents)
processed_content = replace_javascript_with_avap(concatenated)
title = file_prefix if file_prefix is not None else "all_files"
return [{"content": processed_content, "title": title}]
else:
return [
{"content": replace_javascript_with_avap(content), "title": filename}
for content, filename in zip(contents, filenames)
]
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]: def _merge_markdown_document(doc):
""" elements = []
Chunk the content of the documents using the provided chunker.
Args: for chunk in doc.chunks:
docs: A list of dictionaries, each containing 'content' and 'title' keys. elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
chunker: An instance of SemanticChunker to use for chunking the content.
Returns: for code in doc.code:
A list of lists of Chunk objects, where each inner list corresponds to the chunks of a elements.append(("code", code.start_index, code.end_index, code))
single document.
"""
list_chunks = []
for doc in docs: for table in doc.tables:
content = doc["content"] elements.append(("table", table.start_index, table.end_index, table))
chunks = chunker.chunk(content)
for chunk in chunks:
chunk.context = {"source": doc["title"]}
list_chunks.append(chunks)
logger.info(f"Finished chunking {doc['title']}")
return list_chunks elements.sort(key=lambda item: (item[1], item[2]))
merged_chunks = []
current_chunk = None
current_parts = []
current_end_index = None
current_token_count = None
def flush():
nonlocal current_chunk, current_parts, current_end_index, current_token_count
if current_chunk is None:
return
merged_text = "\n\n".join(part for part in current_parts if part)
merged_chunks.append(
replace(
current_chunk,
text=merged_text,
end_index=current_end_index,
token_count=current_token_count,
)
)
current_chunk = None
current_parts = []
current_end_index = None
current_token_count = None
for kind, _, _, element in elements:
if kind == "chunk":
flush()
current_chunk = element
current_parts = [_get_text(element)]
current_end_index = element.end_index
current_token_count = element.token_count
continue
if current_chunk is None:
continue
current_parts.append(_get_text(element))
current_end_index = max(current_end_index, element.end_index)
current_token_count += getattr(element, "token_count", 0)
flush()
new_doc = deepcopy(doc)
new_doc.chunks = merged_chunks
new_doc.code = doc.code
new_doc.tables = doc.tables
return new_doc
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]: def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
""" processed_docs = []
Convert the chunked content into a list of Document objects. chunked_docs = []
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
Args: if docs_extension == ".md":
chunks: A list of dictionaries containing 'content' and 'title' keys. chef = MarkdownChef(tokenizer=custom_tokenizer)
for doc in docs_path:
processed_doc = chef.process(doc)
processed_docs.append((processed_doc, doc.name))
Returns: for processed_doc, filename in processed_docs:
A list of Document objects created from the chunked content. fused_doc = _merge_markdown_document(processed_doc)
""" chunked_docs.extend(fused_doc.chunks)
documents = []
if isinstance(chunks[0], dict): elif docs_extension == ".avap":
for chunk in chunks: chef = TextChef()
content = chunk["content"] chunker = TokenChunker(tokenizer=custom_tokenizer)
title = chunk["title"] for doc in docs_path:
documents.append(Document(id=str(uuid.uuid4()), processed_doc = chef.process(doc)
page_content=content, processed_docs.append((processed_doc, doc.name))
metadata={"source": title}))
else: for processed_doc, filename in processed_docs:
for chunk_list in chunks: chunked_doc = chunker.chunk(processed_doc.content)
for chunk in chunk_list: chunked_docs.extend(chunked_doc)
content = chunk.text
title = chunk.context.get("source", "unknown")
documents.append(Document(id=str(uuid.uuid4()),
page_content=content,
metadata={"source": title}))
return documents return chunked_docs
def ingest_documents(
    chunked_docs: list[Chunk],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> None:
    """
    Write pre-chunked documents into an Elasticsearch index.

    Connects to the Elasticsearch instance configured in ``settings``,
    optionally deletes an existing index with the same name, and bulk-writes
    the chunks through chonkie's ElasticHandshake using the Ollama embedding
    model for vectorization.

    Args:
        chunked_docs: The chunks to ingest.
        es_index: Name of the target Elasticsearch index.
        es_request_timeout: Per-request timeout for the Elasticsearch client.
        es_max_retries: Max retries for the Elasticsearch client.
        es_retry_on_timeout: Whether the client retries on timeouts.
        delete_es_index: If True and the index exists, delete it first so
            re-runs do not duplicate documents.
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )
    # Drop the index before writing so the ingest is idempotent across runs.
    if delete_es_index and es.indices.exists(index=es_index):
        logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
        es.indices.delete(index=es_index)
    # The handshake embeds each chunk via Ollama and indexes it into es_index.
    handshake = ElasticHandshake(
        client=es,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
    )
    logger.info(
        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
    )
    handshake.write(chunked_docs)

View File

@ -0,0 +1,125 @@
import requests
from typing import Any, Callable
import numpy as np
from chonkie.embeddings import BaseEmbeddings
from src.config import settings
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model.

    Wraps Ollama's ``/api/embed`` HTTP endpoint so chonkie components (e.g.
    ElasticHandshake) can use a locally served model for embedding and token
    counting.
    """

    def __init__(
        self,
        model: str,
        # NOTE: this default is evaluated once at class-definition time from
        # settings — confirm that is the intended binding if settings can change.
        base_url: str = settings.ollama_local_url,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        """Store connection parameters; no network call is made here.

        Args:
            model: Name of the Ollama embedding model to use.
            base_url: Base URL of the Ollama server (trailing slash stripped).
            timeout: Per-request timeout in seconds for embed calls.
            truncate: Forwarded to Ollama; controls input truncation server-side.
            keep_alive: Forwarded to Ollama; how long to keep the model loaded.
        """
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality, discovered lazily from the first response.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Embedding vector dimensionality (triggers one embed call if unknown)."""
        if self._dimension is None:
            # Lazy-load the dimension from a real embedding response.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single string and return a float32 vector."""
        embeddings = self._embed_api(text)
        vector = np.asarray(embeddings[0], dtype=np.float32)
        # Cache the dimension as a side effect so the property needs no extra call.
        if self._dimension is None:
            self._dimension = int(vector.shape[0])
        return vector

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of strings in one API request; returns float32 vectors."""
        if not texts:
            return []
        embeddings = self._embed_api(texts)
        vectors = [np.asarray(vector, dtype=np.float32) for vector in embeddings]
        if vectors and self._dimension is None:
            self._dimension = int(vectors[0].shape[0])
        return vectors

    def count_tokens(self, text: str) -> int:
        """Count tokens in *text* via Ollama's prompt_eval_count.

        NOTE(review): this performs a full embedding request per call, which is
        expensive if used in a tight loop — confirm acceptable for chunking.
        """
        payload = self._build_payload(text)
        response = self._post_embed(payload)
        return int(response["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Count tokens for each text individually (one request per text)."""
        # Ollama returns a single prompt_eval_count for the whole request,
        # not one count per input item, so we compute them individually.
        return [self.count_tokens(text) for text in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a token-counting callable (not a real tokenizer).

        # Chonkie mainly needs something usable for token counting —
        # presumably a str -> int counter satisfies its tokenizer protocol;
        # TODO confirm against the chonkie BaseEmbeddings contract.
        """
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the configured Ollama server answers /api/tags."""
        try:
            response = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            response.raise_for_status()
            return True
        except requests.RequestException:
            return False

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings("
            f"model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r}"
            f")"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Build the JSON payload for Ollama's /api/embed endpoint."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST the payload to /api/embed and return the parsed JSON response.

        Raises:
            RuntimeError: If the HTTP call fails or the response lacks the
                'embeddings' key.
        """
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at "
                f"{self.base_url}/api/embed"
            ) from exc
        if "embeddings" not in data:
            raise RuntimeError(
                "Ollama response did not include 'embeddings'. "
                f"Response keys: {list(data.keys())}"
            )
        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Embed one string or a list of strings; returns a list of vectors."""
        payload = self._build_payload(text_or_texts)
        data = self._post_embed(payload)
        return data["embeddings"]

29
uv.lock
View File

@ -250,7 +250,6 @@ name = "assistance-engine"
version = "0.1.0" version = "0.1.0"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "chonkie", extra = ["semantic"] },
{ name = "grpcio" }, { name = "grpcio" },
{ name = "grpcio-reflection" }, { name = "grpcio-reflection" },
{ name = "grpcio-tools" }, { name = "grpcio-tools" },
@ -273,7 +272,9 @@ dependencies = [
dev = [ dev = [
{ name = "beir" }, { name = "beir" },
{ name = "boto3" }, { name = "boto3" },
{ name = "chonkie", extra = ["elastic", "semantic"] },
{ name = "evidently" }, { name = "evidently" },
{ name = "flatbuffers" },
{ name = "jupyter" }, { name = "jupyter" },
{ name = "langfuse" }, { name = "langfuse" },
{ name = "litellm" }, { name = "litellm" },
@ -288,7 +289,6 @@ dev = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
{ name = "grpcio", specifier = ">=1.78.0" }, { name = "grpcio", specifier = ">=1.78.0" },
{ name = "grpcio-reflection", specifier = ">=1.78.0" }, { name = "grpcio-reflection", specifier = ">=1.78.0" },
{ name = "grpcio-tools", specifier = ">=1.78.0" }, { name = "grpcio-tools", specifier = ">=1.78.0" },
@ -311,7 +311,9 @@ requires-dist = [
dev = [ dev = [
{ name = "beir", specifier = ">=2.2.0" }, { name = "beir", specifier = ">=2.2.0" },
{ name = "boto3", specifier = ">=1.42.58" }, { name = "boto3", specifier = ">=1.42.58" },
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
{ name = "evidently", specifier = ">=0.7.20" }, { name = "evidently", specifier = ">=0.7.20" },
{ name = "flatbuffers", specifier = ">=25.12.19" },
{ name = "jupyter", specifier = ">=1.1.1" }, { name = "jupyter", specifier = ">=1.1.1" },
{ name = "langfuse", specifier = "<3" }, { name = "langfuse", specifier = "<3" },
{ name = "litellm", specifier = ">=1.82.0" }, { name = "litellm", specifier = ">=1.82.0" },
@ -595,7 +597,7 @@ wheels = [
[[package]] [[package]]
name = "chonkie" name = "chonkie"
version = "1.5.6" version = "1.6.0"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "chonkie-core" }, { name = "chonkie-core" },
@ -603,12 +605,15 @@ dependencies = [
{ name = "tenacity" }, { name = "tenacity" },
{ name = "tqdm" }, { name = "tqdm" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" } sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" }, { url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
] ]
[package.optional-dependencies] [package.optional-dependencies]
elastic = [
{ name = "elasticsearch" },
]
semantic = [ semantic = [
{ name = "model2vec" }, { name = "model2vec" },
{ name = "tokenizers" }, { name = "tokenizers" },
@ -1061,6 +1066,14 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" }, { url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
] ]
[[package]]
name = "flatbuffers"
version = "25.12.19"
source = { registry = "https://pypi.org/simple" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
]
[[package]] [[package]]
name = "fqdn" name = "fqdn"
version = "1.5.1" version = "1.5.1"
@ -3112,14 +3125,14 @@ wheels = [
[[package]] [[package]]
name = "opentelemetry-proto" name = "opentelemetry-proto"
version = "1.39.1" version = "1.40.0"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "protobuf" }, { name = "protobuf" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" } sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" }, { url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
] ]
[[package]] [[package]]