Refactor Elasticsearch ingestion pipeline and add MBPP generation script
- Updated `elasticsearch_ingestion.py` to streamline document processing and ingestion into Elasticsearch. - Introduced `generate_mbap.py` for generating benchmark problems in AVAP language from a provided LRM. - Created `prompts.py` to define prompts for converting Python problems to AVAP. - Enhanced chunk processing in `chunk.py` to support markdown and AVAP documents. - Added `OllamaEmbeddings` class in `embeddings.py` for handling embeddings with Ollama model. - Updated dependencies in `uv.lock` to include new packages and versions.
This commit is contained in:
parent
3caed4deb6
commit
5f21544e0b
35
README.md
35
README.md
|
|
@ -47,11 +47,12 @@ graph TD
|
|||
├── changelog # Version tracking and release history
|
||||
├── pyproject.toml # Python project configuration
|
||||
├── docs/
|
||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
| | └── AVAP.md
|
||||
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
│ │ └── AVAP.md
|
||||
│ ├── developer.avapfr... # Documents on developer web page
|
||||
| └── LRM/ # AVAP LRM documentation
|
||||
| └── avap.md
|
||||
│ ├── LRM/ # AVAP LRM documentation
|
||||
│ │ └── avap.md
|
||||
│ └── samples/ # AVAP code samples
|
||||
├── Docker/
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
|
|
@ -64,26 +65,17 @@ graph TD
|
|||
│ ├── Dockerfile # Container definition for the Engine
|
||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||
│ ├── requirements.txt # Python dependencies for Docker
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
│ └── src/
|
||||
│ ├── graph.py # Workflow graph orchestration
|
||||
│ ├── prompts.py # Centralized prompt definitions
|
||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
||||
│ ├── state.py # Shared state management
|
||||
│ └── utils/ # Utility modules
|
||||
├── ingestion/
|
||||
│ └── docs/ # AVAP documentation chunks
|
||||
│ └── .dockerignore # Docker ignore files
|
||||
├── kubernetes/
|
||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
||||
├── scripts/
|
||||
│ └── pipelines/
|
||||
| ├── samples_generator/ # AVAP Sample generator
|
||||
| | └─ generate_mbap.py
|
||||
│ └── flows/ # Data processing flows
|
||||
| └─ elasticsearch_ingestion.py
|
||||
│ ├── flows/ # Processing pipelines
|
||||
│ ├── tasks/ # Modules used by the flows
|
||||
│ └── inputs/ # Inputs used by the flows
|
||||
└── src/
|
||||
├── __init__.py
|
||||
├── config.py # Environment variables configuration file
|
||||
└── utils/
|
||||
├── emb_factory.py # Embedding model factory
|
||||
├── llm_factory.py # LLM model factory
|
||||
|
|
@ -157,6 +149,7 @@ OLLAMA_URL=http://host.docker.internal:11434
|
|||
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||
HF_TOKEN=hf_...
|
||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||
```
|
||||
|
||||
|
|
@ -183,13 +176,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
|||
|
||||
```bash
|
||||
# 1. AI Model Tunnel (Ollama)
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 3. Observability DB Tunnel (PostgreSQL)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
```
|
||||
|
||||
### 5. Launch the Engine
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
|||
- `src/config.py`: environment variables configuration file.
|
||||
|
||||
### Changed
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||
- REMOVED: `Makefile` file.
|
||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||
- REMOVED `ingestion` folder.
|
||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
||||
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||
|
||||
|
||||
## [1.4.0] - 2026-03-10
|
||||
|
|
|
|||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
|||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"chonkie[semantic]>=1.5.6",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
"grpcio-tools>=1.78.0",
|
||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
|||
dev = [
|
||||
"beir>=2.2.0",
|
||||
"boto3>=1.42.58",
|
||||
"chonkie[elastic,semantic]>=1.6.0",
|
||||
"evidently>=0.7.20",
|
||||
"flatbuffers>=25.12.19",
|
||||
"jupyter>=1.1.1",
|
||||
"langfuse<3",
|
||||
"litellm>=1.82.0",
|
||||
|
|
|
|||
|
|
@ -1,30 +1,29 @@
|
|||
import re
|
||||
import hashlib
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_core.documents import Document
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||
from langchain_experimental.text_splitter import SemanticChunker
|
||||
from chonkie import SemanticChunker, MarkdownChef
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
|||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
text = text.replace("\u00a0", " ")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
def build_documents_from_folder(
|
||||
folder_path: str,
|
||||
) -> list[Document]:
|
||||
|
||||
folder = Path(folder_path)
|
||||
|
||||
if not folder.exists() or not folder.is_dir():
|
||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
||||
|
||||
all_documents: list[Document] = []
|
||||
|
||||
for file_path in folder.glob("*.txt"):
|
||||
doc_text = file_path.read_text(encoding="utf-8")
|
||||
|
||||
if not doc_text.strip():
|
||||
continue
|
||||
|
||||
metadata: dict[str, Any] = {
|
||||
"source": file_path.name,
|
||||
}
|
||||
|
||||
doc_text = clean_text(doc_text)
|
||||
document = Document(
|
||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
||||
page_content=doc_text,
|
||||
metadata={**metadata}
|
||||
)
|
||||
|
||||
all_documents.append(document)
|
||||
|
||||
return all_documents
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "ingestion/docs",
|
||||
docs_folder_path: str = "docs",
|
||||
es_index: str = "avap-docs-test-v2",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
):
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
):
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
||||
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||
|
||||
logger.info("Instantiating semantic chunker and chef...")
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
logger.info("Processing Markdown docs with chef...")
|
||||
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||
|
||||
logger.info("Chunking AVAP Language docs...")
|
||||
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
|
|
@ -105,16 +94,20 @@ def elasticsearch_ingestion(
|
|||
except:
|
||||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||
if es.indices.exists(index=es_index):
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
logger.info(f"Uploading documents to index {es_index}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
documents,
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
index_name=es_index,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -0,0 +1,122 @@
|
|||
import typer
|
||||
import logging
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.config import settings
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||
chunk_threshold: float | None,
|
||||
chunk_similarity_window: int| None,
|
||||
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||
"""
|
||||
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||
|
||||
Args:
|
||||
docs_extension (str): The file extension of the documents to be ingested.
|
||||
chunk_size (int): The size of the chunks to be created.
|
||||
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||
|
||||
Returns:
|
||||
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||
"""
|
||||
if docs_extension == ".md":
|
||||
process_type = "markdown"
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||
# process_type = "text"
|
||||
# process_kwargs = {}
|
||||
chunk_strat = "semantic"
|
||||
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||
|
||||
elif docs_extension == ".avap":
|
||||
process_type = "text"
|
||||
process_kwargs = {}
|
||||
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||
chunk_kwargs = {"chunk_size": chunk_size}
|
||||
|
||||
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "docs/LRM",
|
||||
docs_extension: str = ".md",
|
||||
es_index: str = "avap-docs-test-v3",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
delete_es_index: bool = True,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float | None = 0.5,
|
||||
chunk_similarity_window: int | None = 3,
|
||||
chunk_skip_window: int | None = 1
|
||||
):
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
processed_docs = []
|
||||
fused_docs = []
|
||||
logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
|
||||
es = Elasticsearch(
|
||||
hosts=settings.elasticsearch_local_url,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
|
||||
if delete_es_index and es.indices.exists(index=es_index):
|
||||
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
(process_type,
|
||||
process_kwargs,
|
||||
chunk_strat,
|
||||
chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)
|
||||
|
||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||
fetcher = FileFetcher()
|
||||
docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||
|
||||
logger.info(f"Processing documents with process_type: {process_type}...")
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
for doc in docs:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append(processed_doc)
|
||||
|
||||
logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
|
||||
for processed_doc in processed_docs:
|
||||
fused_doc = merge_markdown_document(processed_doc)
|
||||
fused_docs.append(fused_doc)
|
||||
|
||||
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||
handshake = ElasticHandshake(
|
||||
client=es,
|
||||
index_name=es_index,
|
||||
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
|
||||
)
|
||||
for fused_doc in fused_docs:
|
||||
handshake.write(fused_doc.chunks)
|
||||
|
||||
logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||
)
|
||||
try:
|
||||
app()
|
||||
except Exception as exc:
|
||||
logger.exception(exc)
|
||||
raise
|
||||
|
|
@ -1,134 +1,38 @@
|
|||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from chonkie import SemanticChunker
|
||||
from chonkie import FileFetcher
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
)
|
||||
from src.config import settings
|
||||
from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
max_inner_product = "MAX_INNER_PRODUCT"
|
||||
dot_product = "DOT_PRODUCT"
|
||||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "docs",
|
||||
docs_folder_path: str = "docs/LRM",
|
||||
docs_extension: str = ".md",
|
||||
es_index: str = "avap-docs-test-v3",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
delete_es_index: bool = True
|
||||
):
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
||||
|
||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
||||
chapters = sorted({
|
||||
p.name.split("_")[0]
|
||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
||||
})
|
||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||
fetcher = FileFetcher()
|
||||
docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||
|
||||
avap_web_docs_chapters = [
|
||||
item
|
||||
for chapter in chapters
|
||||
for item in read_files(
|
||||
f"{docs_folder_path}/developer.avapframework.com",
|
||||
f"{chapter}_",
|
||||
concatenate=True
|
||||
)
|
||||
]
|
||||
logger.info("Processing docs...")
|
||||
chunked_docs = process_documents(docs_path, docs_extension)
|
||||
|
||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
||||
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||
ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||
es_retry_on_timeout, delete_es_index)
|
||||
|
||||
logger.info("Instantiating semantic chunker...")
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
|
||||
logger.info("Chunking AVAP GitHub docs...")
|
||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
||||
|
||||
logger.info("Chunking AVAP web docs chapters...")
|
||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
es = Elasticsearch(
|
||||
ELASTICSEARCH_LOCAL_URL,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to connect to Elasticsearch.")
|
||||
raise
|
||||
|
||||
logger.info("Instantiating embeddings model...")
|
||||
try:
|
||||
embeddings = create_embedding_model(
|
||||
provider="ollama",
|
||||
model=OLLAMA_EMB_MODEL_NAME,
|
||||
base_url=OLLAMA_LOCAL_URL,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -140,4 +44,4 @@ if __name__ == "__main__":
|
|||
app()
|
||||
except Exception as exc:
|
||||
logger.exception(exc)
|
||||
raise
|
||||
raise
|
||||
|
|
@ -1,136 +1,159 @@
|
|||
import os
|
||||
import re
|
||||
import uuid
|
||||
from copy import deepcopy
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
|
||||
from chonkie import (
|
||||
Chunk,
|
||||
ElasticHandshake,
|
||||
MarkdownChef,
|
||||
TextChef,
|
||||
TokenChunker,
|
||||
)
|
||||
from elasticsearch import Elasticsearch
|
||||
from loguru import logger
|
||||
from chonkie import Chunk, SemanticChunker
|
||||
from langchain_core.documents import Document
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from src.config import settings
|
||||
|
||||
|
||||
def replace_javascript_with_avap(text: str) -> str:
|
||||
"""
|
||||
Replace mentions of javascript language with avap in the text.
|
||||
Handles code blocks, language identifiers, and references.
|
||||
|
||||
Args:
|
||||
text: The text to process.
|
||||
|
||||
Returns:
|
||||
The text with javascript references replaced with avap.
|
||||
"""
|
||||
# Replace ```javascript with ```avap
|
||||
text = text.replace("```javascript", "```avap")
|
||||
|
||||
# Replace ```js with ```avap
|
||||
text = text.replace("```js", "```avap")
|
||||
|
||||
# Replace common phrases (case-insensitive)
|
||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
||||
text = re.sub(
|
||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
||||
def _get_text(element) -> str:
|
||||
for attr in ("text", "content", "markdown"):
|
||||
value = getattr(element, attr, None)
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
raise AttributeError(
|
||||
f"Could not extract text from element of type {type(element).__name__}"
|
||||
)
|
||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def read_files(
|
||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Read files in a folder whose names start with a given prefix.
|
||||
Replaces javascript language markers with avap.
|
||||
def _merge_markdown_document(doc):
|
||||
elements = []
|
||||
|
||||
Args:
|
||||
folder_path: Path to the folder to search in.
|
||||
file_prefix: The prefix that file names must start with.
|
||||
If None, all files in the folder are included.
|
||||
concatenate: Whether to concatenate the contents of the files.
|
||||
for chunk in doc.chunks:
|
||||
elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
|
||||
|
||||
Returns:
|
||||
A list of dictionaries, each containing 'content' and 'title' keys.
|
||||
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
||||
If concatenate is False, returns one dict per file with filename as title.
|
||||
"""
|
||||
contents = []
|
||||
filenames = []
|
||||
for code in doc.code:
|
||||
elements.append(("code", code.start_index, code.end_index, code))
|
||||
|
||||
for filename in sorted(os.listdir(folder_path)):
|
||||
include_file = file_prefix is None or filename.startswith(file_prefix)
|
||||
if include_file:
|
||||
file_path = os.path.join(folder_path, filename)
|
||||
if os.path.isfile(file_path):
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
cleaned_content = content.strip()
|
||||
if cleaned_content:
|
||||
contents.append(cleaned_content)
|
||||
filenames.append(filename)
|
||||
for table in doc.tables:
|
||||
elements.append(("table", table.start_index, table.end_index, table))
|
||||
|
||||
if concatenate:
|
||||
concatenated = "\n".join(contents)
|
||||
processed_content = replace_javascript_with_avap(concatenated)
|
||||
title = file_prefix if file_prefix is not None else "all_files"
|
||||
return [{"content": processed_content, "title": title}]
|
||||
else:
|
||||
return [
|
||||
{"content": replace_javascript_with_avap(content), "title": filename}
|
||||
for content, filename in zip(contents, filenames)
|
||||
]
|
||||
elements.sort(key=lambda item: (item[1], item[2]))
|
||||
|
||||
merged_chunks = []
|
||||
current_chunk = None
|
||||
current_parts = []
|
||||
current_end_index = None
|
||||
current_token_count = None
|
||||
|
||||
def flush():
|
||||
nonlocal current_chunk, current_parts, current_end_index, current_token_count
|
||||
|
||||
if current_chunk is None:
|
||||
return
|
||||
|
||||
merged_text = "\n\n".join(part for part in current_parts if part)
|
||||
|
||||
merged_chunks.append(
|
||||
replace(
|
||||
current_chunk,
|
||||
text=merged_text,
|
||||
end_index=current_end_index,
|
||||
token_count=current_token_count,
|
||||
)
|
||||
)
|
||||
|
||||
current_chunk = None
|
||||
current_parts = []
|
||||
current_end_index = None
|
||||
current_token_count = None
|
||||
|
||||
for kind, _, _, element in elements:
|
||||
if kind == "chunk":
|
||||
flush()
|
||||
current_chunk = element
|
||||
current_parts = [_get_text(element)]
|
||||
current_end_index = element.end_index
|
||||
current_token_count = element.token_count
|
||||
continue
|
||||
|
||||
if current_chunk is None:
|
||||
continue
|
||||
|
||||
current_parts.append(_get_text(element))
|
||||
current_end_index = max(current_end_index, element.end_index)
|
||||
current_token_count += getattr(element, "token_count", 0)
|
||||
|
||||
flush()
|
||||
|
||||
new_doc = deepcopy(doc)
|
||||
new_doc.chunks = merged_chunks
|
||||
new_doc.code = doc.code
|
||||
new_doc.tables = doc.tables
|
||||
|
||||
return new_doc
|
||||
|
||||
|
||||
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
||||
"""
|
||||
Chunk the content of the documents using the provided chunker.
|
||||
def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
|
||||
processed_docs = []
|
||||
chunked_docs = []
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
|
||||
Args:
|
||||
docs: A list of dictionaries, each containing 'content' and 'title' keys.
|
||||
chunker: An instance of SemanticChunker to use for chunking the content.
|
||||
if docs_extension == ".md":
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
for doc in docs_path:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append((processed_doc, doc.name))
|
||||
|
||||
Returns:
|
||||
A list of lists of Chunk objects, where each inner list corresponds to the chunks of a
|
||||
single document.
|
||||
"""
|
||||
list_chunks = []
|
||||
for processed_doc, filename in processed_docs:
|
||||
fused_doc = _merge_markdown_document(processed_doc)
|
||||
chunked_docs.extend(fused_doc.chunks)
|
||||
|
||||
for doc in docs:
|
||||
content = doc["content"]
|
||||
chunks = chunker.chunk(content)
|
||||
for chunk in chunks:
|
||||
chunk.context = {"source": doc["title"]}
|
||||
list_chunks.append(chunks)
|
||||
logger.info(f"Finished chunking {doc['title']}")
|
||||
elif docs_extension == ".avap":
|
||||
chef = TextChef()
|
||||
chunker = TokenChunker(tokenizer=custom_tokenizer)
|
||||
for doc in docs_path:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append((processed_doc, doc.name))
|
||||
|
||||
return list_chunks
|
||||
for processed_doc, filename in processed_docs:
|
||||
chunked_doc = chunker.chunk(processed_doc.content)
|
||||
chunked_docs.extend(chunked_doc)
|
||||
|
||||
return chunked_docs
|
||||
|
||||
|
||||
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
||||
"""
|
||||
Convert the chunked content into a list of Document objects.
|
||||
def ingest_documents(
|
||||
chunked_docs: list[Chunk],
|
||||
es_index: str,
|
||||
es_request_timeout: int,
|
||||
es_max_retries: int,
|
||||
es_retry_on_timeout: bool,
|
||||
delete_es_index: bool,
|
||||
) -> None:
|
||||
|
||||
Args:
|
||||
chunks: A list of dictionaries containing 'content' and 'title' keys.
|
||||
logger.info(
|
||||
f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
|
||||
)
|
||||
es = Elasticsearch(
|
||||
hosts=settings.elasticsearch_local_url,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
|
||||
Returns:
|
||||
A list of Document objects created from the chunked content.
|
||||
"""
|
||||
documents = []
|
||||
if delete_es_index and es.indices.exists(index=es_index):
|
||||
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
if isinstance(chunks[0], dict):
|
||||
for chunk in chunks:
|
||||
content = chunk["content"]
|
||||
title = chunk["title"]
|
||||
documents.append(Document(id=str(uuid.uuid4()),
|
||||
page_content=content,
|
||||
metadata={"source": title}))
|
||||
|
||||
else:
|
||||
for chunk_list in chunks:
|
||||
for chunk in chunk_list:
|
||||
content = chunk.text
|
||||
title = chunk.context.get("source", "unknown")
|
||||
documents.append(Document(id=str(uuid.uuid4()),
|
||||
page_content=content,
|
||||
metadata={"source": title}))
|
||||
handshake = ElasticHandshake(
|
||||
client=es,
|
||||
index_name=es_index,
|
||||
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
|
||||
)
|
||||
|
||||
return documents
|
||||
logger.info(
|
||||
f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
|
||||
)
|
||||
handshake.write(chunked_docs)
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,125 @@
|
|||
import requests
|
||||
from typing import Any, Callable
|
||||
|
||||
import numpy as np
|
||||
from chonkie.embeddings import BaseEmbeddings
|
||||
|
||||
from src.config import settings
|
||||
|
||||
|
||||
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model.

    Sends texts to Ollama's ``/api/embed`` HTTP endpoint and returns
    ``numpy`` float32 vectors in the shape chonkie's ``BaseEmbeddings``
    interface expects.
    """

    def __init__(
        self,
        model: str,
        base_url: str | None = None,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        """Configure the adapter.

        Args:
            model: Name of the Ollama embedding model to query.
            base_url: Base URL of the Ollama server. Defaults to
                ``settings.ollama_local_url``, resolved at call time rather
                than frozen into the signature at import time.
            timeout: HTTP timeout (seconds) per embedding request.
            truncate: Ask Ollama to truncate inputs that exceed the model's
                context window instead of erroring.
            keep_alive: How long Ollama keeps the model loaded after a call.
        """
        self.model = model
        # Resolve the default here, not in the signature: a signature default
        # of `settings.ollama_local_url` is evaluated once at import time and
        # would silently ignore any later change to settings.
        if base_url is None:
            base_url = settings.ollama_local_url
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality, discovered lazily from the first response.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors (may cost one API call)."""
        if self._dimension is None:
            # Lazy-load the dimension from a real embedding response.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single text and return a 1-D float32 vector."""
        embeddings = self._embed_api(text)
        vector = np.asarray(embeddings[0], dtype=np.float32)

        # Cache the dimension as a side effect so `dimension` needs no
        # extra request later.
        if self._dimension is None:
            self._dimension = int(vector.shape[0])

        return vector

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several texts in one request; returns one vector per text."""
        if not texts:
            return []

        embeddings = self._embed_api(texts)
        vectors = [np.asarray(vector, dtype=np.float32) for vector in embeddings]

        if vectors and self._dimension is None:
            self._dimension = int(vectors[0].shape[0])

        return vectors

    def count_tokens(self, text: str) -> int:
        """Count tokens by embedding ``text`` and reading ``prompt_eval_count``.

        NOTE(review): this costs a full embedding request per call; Ollama
        exposes no cheaper tokenize endpoint for arbitrary models.

        Raises:
            RuntimeError: If the response lacks ``prompt_eval_count``
                (consistent with the error style of ``_post_embed``).
        """
        payload = self._build_payload(text)
        response = self._post_embed(payload)
        if "prompt_eval_count" not in response:
            raise RuntimeError(
                "Ollama response did not include 'prompt_eval_count'. "
                f"Response keys: {list(response.keys())}"
            )
        return int(response["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Count tokens for each text individually.

        Ollama returns a single prompt_eval_count for the whole request,
        not one count per input item, so we compute them one at a time.
        """
        return [self.count_tokens(text) for text in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a token-counting callable (all chonkie needs here)."""
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True if an Ollama server answers at the configured URL."""
        try:
            response = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            response.raise_for_status()
            return True
        except requests.RequestException:
            return False

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings("
            f"model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r}"
            f")"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Build the JSON body for an ``/api/embed`` request."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST ``payload`` to ``/api/embed`` and return the parsed JSON.

        Raises:
            RuntimeError: On transport failure, a non-2xx status, or a
                response body without an ``embeddings`` field.
        """
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at "
                f"{self.base_url}/api/embed"
            ) from exc

        if "embeddings" not in data:
            raise RuntimeError(
                "Ollama response did not include 'embeddings'. "
                f"Response keys: {list(data.keys())}"
            )

        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Embed one text or a batch; returns a list of float vectors."""
        payload = self._build_payload(text_or_texts)
        data = self._post_embed(payload)
        return data["embeddings"]
29
uv.lock
29
uv.lock
|
|
@@ -250,7 +250,6 @@ name = "assistance-engine"
|
|||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "chonkie", extra = ["semantic"] },
|
||||
{ name = "grpcio" },
|
||||
{ name = "grpcio-reflection" },
|
||||
{ name = "grpcio-tools" },
|
||||
|
|
@@ -273,7 +272,9 @@ dependencies = [
|
|||
dev = [
|
||||
{ name = "beir" },
|
||||
{ name = "boto3" },
|
||||
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||
{ name = "evidently" },
|
||||
{ name = "flatbuffers" },
|
||||
{ name = "jupyter" },
|
||||
{ name = "langfuse" },
|
||||
{ name = "litellm" },
|
||||
|
|
@@ -288,7 +289,6 @@ dev = [
|
|||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||
|
|
@@ -311,7 +311,9 @@ requires-dist = [
|
|||
dev = [
|
||||
{ name = "beir", specifier = ">=2.2.0" },
|
||||
{ name = "boto3", specifier = ">=1.42.58" },
|
||||
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||
{ name = "evidently", specifier = ">=0.7.20" },
|
||||
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||
{ name = "langfuse", specifier = "<3" },
|
||||
{ name = "litellm", specifier = ">=1.82.0" },
|
||||
|
|
@@ -595,7 +597,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "chonkie"
|
||||
version = "1.5.6"
|
||||
version = "1.6.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "chonkie-core" },
|
||||
|
|
@@ -603,12 +605,15 @@ dependencies = [
|
|||
{ name = "tenacity" },
|
||||
{ name = "tqdm" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
elastic = [
|
||||
{ name = "elasticsearch" },
|
||||
]
|
||||
semantic = [
|
||||
{ name = "model2vec" },
|
||||
{ name = "tokenizers" },
|
||||
|
|
@@ -1061,6 +1066,14 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flatbuffers"
|
||||
version = "25.12.19"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fqdn"
|
||||
version = "1.5.1"
|
||||
|
|
@@ -3112,14 +3125,14 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "1.39.1"
|
||||
version = "1.40.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "protobuf" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
Loading…
Reference in New Issue