Refactor Elasticsearch ingestion pipeline and add MBPP generation script
- Updated `elasticsearch_ingestion.py` to streamline document processing and ingestion into Elasticsearch. - Introduced `generate_mbap.py` for generating benchmark problems in AVAP language from a provided LRM. - Created `prompts.py` to define prompts for converting Python problems to AVAP. - Enhanced chunk processing in `chunk.py` to support markdown and AVAP documents. - Added `OllamaEmbeddings` class in `embeddings.py` for handling embeddings with Ollama model. - Updated dependencies in `uv.lock` to include new packages and versions.
This commit is contained in:
parent
3caed4deb6
commit
5f21544e0b
35
README.md
35
README.md
|
|
@ -47,11 +47,12 @@ graph TD
|
||||||
├── changelog # Version tracking and release history
|
├── changelog # Version tracking and release history
|
||||||
├── pyproject.toml # Python project configuration
|
├── pyproject.toml # Python project configuration
|
||||||
├── docs/
|
├── docs/
|
||||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||||
| | └── AVAP.md
|
│ │ └── AVAP.md
|
||||||
│ ├── developer.avapfr... # Documents on developer web page
|
│ ├── developer.avapfr... # Documents on developer web page
|
||||||
| └── LRM/ # AVAP LRM documentation
|
│ ├── LRM/ # AVAP LRM documentation
|
||||||
| └── avap.md
|
│ │ └── avap.md
|
||||||
|
│ └── samples/ # AVAP code samples
|
||||||
├── Docker/
|
├── Docker/
|
||||||
│ ├── protos/
|
│ ├── protos/
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||||
|
|
@ -64,26 +65,17 @@ graph TD
|
||||||
│ ├── Dockerfile # Container definition for the Engine
|
│ ├── Dockerfile # Container definition for the Engine
|
||||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||||
│ ├── requirements.txt # Python dependencies for Docker
|
│ ├── requirements.txt # Python dependencies for Docker
|
||||||
│ ├── protos/
|
│ └── .dockerignore # Docker ignore files
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
|
||||||
│ └── src/
|
|
||||||
│ ├── graph.py # Workflow graph orchestration
|
|
||||||
│ ├── prompts.py # Centralized prompt definitions
|
|
||||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
|
||||||
│ ├── state.py # Shared state management
|
|
||||||
│ └── utils/ # Utility modules
|
|
||||||
├── ingestion/
|
|
||||||
│ └── docs/ # AVAP documentation chunks
|
|
||||||
├── kubernetes/
|
├── kubernetes/
|
||||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
||||||
├── scripts/
|
├── scripts/
|
||||||
│ └── pipelines/
|
│ └── pipelines/
|
||||||
| ├── samples_generator/ # AVAP Sample generator
|
│ ├── flows/ # Processing pipelines
|
||||||
| | └─ generate_mbap.py
|
│ ├── tasks/ # Modules used by the flows
|
||||||
│ └── flows/ # Data processing flows
|
│ └── inputs/ # Inputs used by the flows
|
||||||
| └─ elasticsearch_ingestion.py
|
|
||||||
└── src/
|
└── src/
|
||||||
├── __init__.py
|
├── __init__.py
|
||||||
|
├── config.py # Environment variables configuration file
|
||||||
└── utils/
|
└── utils/
|
||||||
├── emb_factory.py # Embedding model factory
|
├── emb_factory.py # Embedding model factory
|
||||||
├── llm_factory.py # LLM model factory
|
├── llm_factory.py # LLM model factory
|
||||||
|
|
@ -157,6 +149,7 @@ OLLAMA_URL=http://host.docker.internal:11434
|
||||||
OLLAMA_LOCAL_URL=http://localhost:11434
|
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||||
|
HF_TOKEN=hf_...
|
||||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -183,13 +176,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. AI Model Tunnel (Ollama)
|
# 1. AI Model Tunnel (Ollama)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 3. Observability DB Tunnel (PostgreSQL)
|
# 3. Observability DB Tunnel (PostgreSQL)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
```
|
```
|
||||||
|
|
||||||
### 5. Launch the Engine
|
### 5. Launch the Engine
|
||||||
|
|
|
||||||
|
|
@ -15,13 +15,13 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
||||||
- `src/config.py`: environment variables configuration file.
|
- `src/config.py`: environment variables configuration file.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||||
- REMOVED: `Makefile` file.
|
- REMOVED: `Makefile` file.
|
||||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||||
- REMOVED `ingestion` folder.
|
- REMOVED `ingestion` folder.
|
||||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||||
|
|
||||||
|
|
||||||
## [1.4.0] - 2026-03-10
|
## [1.4.0] - 2026-03-10
|
||||||
|
|
|
||||||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chonkie[semantic]>=1.5.6",
|
|
||||||
"grpcio>=1.78.0",
|
"grpcio>=1.78.0",
|
||||||
"grpcio-reflection>=1.78.0",
|
"grpcio-reflection>=1.78.0",
|
||||||
"grpcio-tools>=1.78.0",
|
"grpcio-tools>=1.78.0",
|
||||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
"beir>=2.2.0",
|
"beir>=2.2.0",
|
||||||
"boto3>=1.42.58",
|
"boto3>=1.42.58",
|
||||||
|
"chonkie[elastic,semantic]>=1.6.0",
|
||||||
"evidently>=0.7.20",
|
"evidently>=0.7.20",
|
||||||
|
"flatbuffers>=25.12.19",
|
||||||
"jupyter>=1.1.1",
|
"jupyter>=1.1.1",
|
||||||
"langfuse<3",
|
"langfuse<3",
|
||||||
"litellm>=1.82.0",
|
"litellm>=1.82.0",
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,29 @@
|
||||||
import re
|
|
||||||
import hashlib
|
|
||||||
from typing import Any
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from langchain_core.documents import Document
|
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
from langchain_elasticsearch import ElasticsearchStore
|
||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from chonkie import SemanticChunker, MarkdownChef
|
||||||
from langchain_experimental.text_splitter import SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from src.utils.emb_factory import create_embedding_model
|
||||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
from scripts.pipelines.tasks.chunk import (
|
||||||
|
read_files,
|
||||||
|
get_chunk_docs,
|
||||||
|
convert_chunks_to_document
|
||||||
|
)
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||||
|
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
class DistanceStrategy(str, Enum):
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
euclidean = "EUCLIDEAN_DISTANCE"
|
||||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
||||||
jaccard = "JACCARD"
|
jaccard = "JACCARD"
|
||||||
cosine = "COSINE"
|
cosine = "COSINE"
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
|
||||||
text = text.replace("\u00a0", " ")
|
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
|
||||||
return text
|
|
||||||
|
|
||||||
def build_documents_from_folder(
|
|
||||||
folder_path: str,
|
|
||||||
) -> list[Document]:
|
|
||||||
|
|
||||||
folder = Path(folder_path)
|
|
||||||
|
|
||||||
if not folder.exists() or not folder.is_dir():
|
|
||||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
|
||||||
|
|
||||||
all_documents: list[Document] = []
|
|
||||||
|
|
||||||
for file_path in folder.glob("*.txt"):
|
|
||||||
doc_text = file_path.read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
if not doc_text.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
metadata: dict[str, Any] = {
|
|
||||||
"source": file_path.name,
|
|
||||||
}
|
|
||||||
|
|
||||||
doc_text = clean_text(doc_text)
|
|
||||||
document = Document(
|
|
||||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
|
||||||
page_content=doc_text,
|
|
||||||
metadata={**metadata}
|
|
||||||
)
|
|
||||||
|
|
||||||
all_documents.append(document)
|
|
||||||
|
|
||||||
return all_documents
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "ingestion/docs",
|
docs_folder_path: str = "docs",
|
||||||
|
es_index: str = "avap-docs-test-v2",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||||
|
chunk_size: int = 2048,
|
||||||
|
chunk_threshold: float = 0.5,
|
||||||
|
chunk_similarity_window: int = 3,
|
||||||
|
chunk_skip_window: int = 1,
|
||||||
):
|
):
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||||
|
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||||
|
|
||||||
|
logger.info("Instantiating semantic chunker and chef...")
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||||
|
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
chunker = SemanticChunker(
|
||||||
|
embedding_model=HF_EMB_MODEL_NAME,
|
||||||
|
chunk_size=chunk_size,
|
||||||
|
threshold=chunk_threshold,
|
||||||
|
similarity_window=chunk_similarity_window,
|
||||||
|
skip_window=chunk_skip_window
|
||||||
|
)
|
||||||
|
logger.info("Processing Markdown docs with chef...")
|
||||||
|
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||||
|
|
||||||
|
logger.info("Chunking AVAP Language docs...")
|
||||||
|
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||||
|
|
||||||
|
logger.info("Creating Langchain Document to index...")
|
||||||
|
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||||
|
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||||
|
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
logger.info("Connecting to Elasticsearch...")
|
||||||
try:
|
try:
|
||||||
|
|
@ -106,15 +95,19 @@ def elasticsearch_ingestion(
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
logger.exception("Failed to instantiate embeddings model.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||||
|
if es.indices.exists(index=es_index):
|
||||||
|
es.indices.delete(index=es_index)
|
||||||
|
|
||||||
|
logger.info(f"Uploading documents to index {es_index}...")
|
||||||
ElasticsearchStore.from_documents(
|
ElasticsearchStore.from_documents(
|
||||||
documents,
|
avap_documents,
|
||||||
embeddings,
|
embeddings,
|
||||||
client=es,
|
client=es,
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
index_name=es_index,
|
||||||
distance_strategy=distance_strategy.value,
|
distance_strategy=distance_strategy.value,
|
||||||
)
|
)
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,122 @@
|
||||||
|
import typer
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||||
|
|
||||||
|
app = typer.Typer()
|
||||||
|
|
||||||
|
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||||
|
chunk_threshold: float | None,
|
||||||
|
chunk_similarity_window: int| None,
|
||||||
|
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||||
|
"""
|
||||||
|
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs_extension (str): The file extension of the documents to be ingested.
|
||||||
|
chunk_size (int): The size of the chunks to be created.
|
||||||
|
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||||
|
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||||
|
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||||
|
"""
|
||||||
|
if docs_extension == ".md":
|
||||||
|
process_type = "markdown"
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||||
|
# process_type = "text"
|
||||||
|
# process_kwargs = {}
|
||||||
|
chunk_strat = "semantic"
|
||||||
|
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||||
|
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||||
|
|
||||||
|
elif docs_extension == ".avap":
|
||||||
|
process_type = "text"
|
||||||
|
process_kwargs = {}
|
||||||
|
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||||
|
chunk_kwargs = {"chunk_size": chunk_size}
|
||||||
|
|
||||||
|
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def elasticsearch_ingestion(
|
||||||
|
docs_folder_path: str = "docs/LRM",
|
||||||
|
docs_extension: str = ".md",
|
||||||
|
es_index: str = "avap-docs-test-v3",
|
||||||
|
es_request_timeout: int = 120,
|
||||||
|
es_max_retries: int = 5,
|
||||||
|
es_retry_on_timeout: bool = True,
|
||||||
|
delete_es_index: bool = True,
|
||||||
|
chunk_size: int = 2048,
|
||||||
|
chunk_threshold: float | None = 0.5,
|
||||||
|
chunk_similarity_window: int | None = 3,
|
||||||
|
chunk_skip_window: int | None = 1
|
||||||
|
):
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
processed_docs = []
|
||||||
|
fused_docs = []
|
||||||
|
logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
|
||||||
|
es = Elasticsearch(
|
||||||
|
hosts=settings.elasticsearch_local_url,
|
||||||
|
request_timeout=es_request_timeout,
|
||||||
|
max_retries=es_max_retries,
|
||||||
|
retry_on_timeout=es_retry_on_timeout,
|
||||||
|
)
|
||||||
|
|
||||||
|
if delete_es_index and es.indices.exists(index=es_index):
|
||||||
|
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||||
|
es.indices.delete(index=es_index)
|
||||||
|
|
||||||
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
|
(process_type,
|
||||||
|
process_kwargs,
|
||||||
|
chunk_strat,
|
||||||
|
chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)
|
||||||
|
|
||||||
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
|
fetcher = FileFetcher()
|
||||||
|
docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||||
|
|
||||||
|
logger.info(f"Processing documents with process_type: {process_type}...")
|
||||||
|
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
for doc in docs:
|
||||||
|
processed_doc = chef.process(doc)
|
||||||
|
processed_docs.append(processed_doc)
|
||||||
|
|
||||||
|
logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
|
||||||
|
for processed_doc in processed_docs:
|
||||||
|
fused_doc = merge_markdown_document(processed_doc)
|
||||||
|
fused_docs.append(fused_doc)
|
||||||
|
|
||||||
|
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||||
|
handshake = ElasticHandshake(
|
||||||
|
client=es,
|
||||||
|
index_name=es_index,
|
||||||
|
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
|
||||||
|
)
|
||||||
|
for fused_doc in fused_docs:
|
||||||
|
handshake.write(fused_doc.chunks)
|
||||||
|
|
||||||
|
logger.info(f"Finished ingesting in {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
app()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception(exc)
|
||||||
|
raise
|
||||||
|
|
@ -1,134 +1,38 @@
|
||||||
from enum import Enum
|
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from chonkie import FileFetcher
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
|
||||||
from chonkie import SemanticChunker
|
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from src.config import settings
|
||||||
from scripts.pipelines.tasks.chunk import (
|
from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
|
||||||
read_files,
|
|
||||||
get_chunk_docs,
|
|
||||||
convert_chunks_to_document
|
|
||||||
)
|
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
|
||||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
|
||||||
max_inner_product = "MAX_INNER_PRODUCT"
|
|
||||||
dot_product = "DOT_PRODUCT"
|
|
||||||
jaccard = "JACCARD"
|
|
||||||
cosine = "COSINE"
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "docs",
|
docs_folder_path: str = "docs/LRM",
|
||||||
|
docs_extension: str = ".md",
|
||||||
|
es_index: str = "avap-docs-test-v3",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
delete_es_index: bool = True
|
||||||
chunk_size: int = 2048,
|
|
||||||
chunk_threshold: float = 0.5,
|
|
||||||
chunk_similarity_window: int = 3,
|
|
||||||
chunk_skip_window: int = 1,
|
|
||||||
):
|
):
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
fetcher = FileFetcher()
|
||||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||||
|
|
||||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
logger.info("Processing docs...")
|
||||||
chapters = sorted({
|
chunked_docs = process_documents(docs_path, docs_extension)
|
||||||
p.name.split("_")[0]
|
|
||||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
|
||||||
})
|
|
||||||
|
|
||||||
avap_web_docs_chapters = [
|
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||||
item
|
ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||||
for chapter in chapters
|
es_retry_on_timeout, delete_es_index)
|
||||||
for item in read_files(
|
|
||||||
f"{docs_folder_path}/developer.avapframework.com",
|
|
||||||
f"{chapter}_",
|
|
||||||
concatenate=True
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
logger.info(f"Finished ingesting in {es_index}.")
|
||||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
|
||||||
|
|
||||||
logger.info("Instantiating semantic chunker...")
|
|
||||||
chunker = SemanticChunker(
|
|
||||||
embedding_model=HF_EMB_MODEL_NAME,
|
|
||||||
chunk_size=chunk_size,
|
|
||||||
threshold=chunk_threshold,
|
|
||||||
similarity_window=chunk_similarity_window,
|
|
||||||
skip_window=chunk_skip_window
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP GitHub docs...")
|
|
||||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP web docs chapters...")
|
|
||||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
|
||||||
|
|
||||||
logger.info("Creating Langchain Document to index...")
|
|
||||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
|
||||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
|
||||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
|
||||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
|
||||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
|
||||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
|
||||||
try:
|
|
||||||
es = Elasticsearch(
|
|
||||||
ELASTICSEARCH_LOCAL_URL,
|
|
||||||
request_timeout=es_request_timeout,
|
|
||||||
max_retries=es_max_retries,
|
|
||||||
retry_on_timeout=es_retry_on_timeout,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to connect to Elasticsearch.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info("Instantiating embeddings model...")
|
|
||||||
try:
|
|
||||||
embeddings = create_embedding_model(
|
|
||||||
provider="ollama",
|
|
||||||
model=OLLAMA_EMB_MODEL_NAME,
|
|
||||||
base_url=OLLAMA_LOCAL_URL,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
|
||||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
|
||||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
|
||||||
ElasticsearchStore.from_documents(
|
|
||||||
avap_documents,
|
|
||||||
embeddings,
|
|
||||||
client=es,
|
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
|
||||||
distance_strategy=distance_strategy.value,
|
|
||||||
)
|
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1,136 +1,159 @@
|
||||||
import os
|
from copy import deepcopy
|
||||||
import re
|
from dataclasses import replace
|
||||||
import uuid
|
from pathlib import Path
|
||||||
|
|
||||||
|
from chonkie import (
|
||||||
|
Chunk,
|
||||||
|
ElasticHandshake,
|
||||||
|
MarkdownChef,
|
||||||
|
TextChef,
|
||||||
|
TokenChunker,
|
||||||
|
)
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from chonkie import Chunk, SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
from langchain_core.documents import Document
|
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
def replace_javascript_with_avap(text: str) -> str:
|
def _get_text(element) -> str:
|
||||||
"""
|
for attr in ("text", "content", "markdown"):
|
||||||
Replace mentions of javascript language with avap in the text.
|
value = getattr(element, attr, None)
|
||||||
Handles code blocks, language identifiers, and references.
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
Args:
|
raise AttributeError(
|
||||||
text: The text to process.
|
f"Could not extract text from element of type {type(element).__name__}"
|
||||||
|
|
||||||
Returns:
|
|
||||||
The text with javascript references replaced with avap.
|
|
||||||
"""
|
|
||||||
# Replace ```javascript with ```avap
|
|
||||||
text = text.replace("```javascript", "```avap")
|
|
||||||
|
|
||||||
# Replace ```js with ```avap
|
|
||||||
text = text.replace("```js", "```avap")
|
|
||||||
|
|
||||||
# Replace common phrases (case-insensitive)
|
|
||||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
|
||||||
text = re.sub(
|
|
||||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
|
||||||
)
|
)
|
||||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
def read_files(
|
def _merge_markdown_document(doc):
|
||||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
elements = []
|
||||||
) -> list[dict]:
|
|
||||||
"""
|
|
||||||
Read files in a folder whose names start with a given prefix.
|
|
||||||
Replaces javascript language markers with avap.
|
|
||||||
|
|
||||||
Args:
|
for chunk in doc.chunks:
|
||||||
folder_path: Path to the folder to search in.
|
elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
|
||||||
file_prefix: The prefix that file names must start with.
|
|
||||||
If None, all files in the folder are included.
|
|
||||||
concatenate: Whether to concatenate the contents of the files.
|
|
||||||
|
|
||||||
Returns:
|
for code in doc.code:
|
||||||
A list of dictionaries, each containing 'content' and 'title' keys.
|
elements.append(("code", code.start_index, code.end_index, code))
|
||||||
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
|
||||||
If concatenate is False, returns one dict per file with filename as title.
|
|
||||||
"""
|
|
||||||
contents = []
|
|
||||||
filenames = []
|
|
||||||
|
|
||||||
for filename in sorted(os.listdir(folder_path)):
|
for table in doc.tables:
|
||||||
include_file = file_prefix is None or filename.startswith(file_prefix)
|
elements.append(("table", table.start_index, table.end_index, table))
|
||||||
if include_file:
|
|
||||||
file_path = os.path.join(folder_path, filename)
|
|
||||||
if os.path.isfile(file_path):
|
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
|
||||||
content = f.read()
|
|
||||||
cleaned_content = content.strip()
|
|
||||||
if cleaned_content:
|
|
||||||
contents.append(cleaned_content)
|
|
||||||
filenames.append(filename)
|
|
||||||
|
|
||||||
if concatenate:
|
elements.sort(key=lambda item: (item[1], item[2]))
|
||||||
concatenated = "\n".join(contents)
|
|
||||||
processed_content = replace_javascript_with_avap(concatenated)
|
merged_chunks = []
|
||||||
title = file_prefix if file_prefix is not None else "all_files"
|
current_chunk = None
|
||||||
return [{"content": processed_content, "title": title}]
|
current_parts = []
|
||||||
else:
|
current_end_index = None
|
||||||
return [
|
current_token_count = None
|
||||||
{"content": replace_javascript_with_avap(content), "title": filename}
|
|
||||||
for content, filename in zip(contents, filenames)
|
def flush():
|
||||||
]
|
nonlocal current_chunk, current_parts, current_end_index, current_token_count
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
merged_text = "\n\n".join(part for part in current_parts if part)
|
||||||
|
|
||||||
|
merged_chunks.append(
|
||||||
|
replace(
|
||||||
|
current_chunk,
|
||||||
|
text=merged_text,
|
||||||
|
end_index=current_end_index,
|
||||||
|
token_count=current_token_count,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
current_chunk = None
|
||||||
|
current_parts = []
|
||||||
|
current_end_index = None
|
||||||
|
current_token_count = None
|
||||||
|
|
||||||
|
for kind, _, _, element in elements:
|
||||||
|
if kind == "chunk":
|
||||||
|
flush()
|
||||||
|
current_chunk = element
|
||||||
|
current_parts = [_get_text(element)]
|
||||||
|
current_end_index = element.end_index
|
||||||
|
current_token_count = element.token_count
|
||||||
|
continue
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
current_parts.append(_get_text(element))
|
||||||
|
current_end_index = max(current_end_index, element.end_index)
|
||||||
|
current_token_count += getattr(element, "token_count", 0)
|
||||||
|
|
||||||
|
flush()
|
||||||
|
|
||||||
|
new_doc = deepcopy(doc)
|
||||||
|
new_doc.chunks = merged_chunks
|
||||||
|
new_doc.code = doc.code
|
||||||
|
new_doc.tables = doc.tables
|
||||||
|
|
||||||
|
return new_doc
|
||||||
|
|
||||||
|
|
||||||
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
    """Process and chunk a set of documents according to their file type.

    Markdown documents go through the ``MarkdownChef`` pipeline and have their
    code/table elements merged back into their chunks; AVAP documents are
    processed as plain text and split with a token-based chunker.

    Args:
        docs_path: Paths of the documents to process.
        docs_extension: File extension selecting the pipeline (".md" or ".avap").

    Returns:
        A flat list of Chunk objects gathered from all documents.

    Raises:
        ValueError: If ``docs_extension`` is not a supported extension.
    """
    chunked_docs: list[Chunk] = []
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)

    if docs_extension == ".md":
        chef = MarkdownChef(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            # Fuse code/table elements into their surrounding chunks before
            # collecting, so each chunk carries its full markdown context.
            fused_doc = _merge_markdown_document(processed_doc)
            chunked_docs.extend(fused_doc.chunks)
    elif docs_extension == ".avap":
        chef = TextChef()
        chunker = TokenChunker(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            chunked_docs.extend(chunker.chunk(processed_doc.content))
    else:
        # Fail loudly instead of silently returning an empty list.
        raise ValueError(f"Unsupported document extension: {docs_extension!r}")

    return chunked_docs
|
||||||
|
|
||||||
|
|
||||||
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
def ingest_documents(
    chunked_docs: list[Chunk],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> None:
    """Write pre-chunked documents into an Elasticsearch index.

    Args:
        chunked_docs: Chunks to ingest.
        es_index: Target Elasticsearch index name.
        es_request_timeout: Per-request timeout (seconds) for the client.
        es_max_retries: Maximum retry attempts on failed requests.
        es_retry_on_timeout: Whether timed-out requests are retried.
        delete_es_index: When True, drop a pre-existing index before writing.
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    client = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )

    if delete_es_index:
        # Only attempt the delete when the index actually exists.
        if client.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            client.indices.delete(index=es_index)

    writer = ElasticHandshake(
        client=client,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
    )

    logger.info(
        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
    )
    writer.write(chunked_docs)
|
||||||
title = chunk.context.get("source", "unknown")
|
|
||||||
documents.append(Document(id=str(uuid.uuid4()),
|
|
||||||
page_content=content,
|
|
||||||
metadata={"source": title}))
|
|
||||||
|
|
||||||
return documents
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,125 @@
|
||||||
|
import requests
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from chonkie.embeddings import BaseEmbeddings
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model."""

    def __init__(
        self,
        model: str,
        base_url: str = settings.ollama_local_url,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality; resolved lazily on first use.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors (discovered lazily)."""
        if self._dimension is None:
            # Probe the model once with a minimal input to learn the size.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single string and return a float32 vector."""
        rows = self._embed_api(text)
        embedding = np.asarray(rows[0], dtype=np.float32)
        if self._dimension is None:
            self._dimension = int(embedding.shape[0])
        return embedding

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several strings with a single API call."""
        if not texts:
            return []
        vectors = [
            np.asarray(row, dtype=np.float32) for row in self._embed_api(texts)
        ]
        if self._dimension is None and vectors:
            self._dimension = int(vectors[0].shape[0])
        return vectors

    def count_tokens(self, text: str) -> int:
        """Count prompt tokens by issuing a real embed request."""
        result = self._post_embed(self._build_payload(text))
        return int(result["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Count tokens for each input individually.

        Ollama returns a single prompt_eval_count per request rather than
        one per input item, so each text requires its own call.
        """
        return [self.count_tokens(item) for item in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a token-counting callable, which is all Chonkie needs."""
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True when the local Ollama server responds to /api/tags."""
        try:
            probe = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            probe.raise_for_status()
        except requests.RequestException:
            return False
        return True

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings(model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r})"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Assemble the JSON request body for Ollama's embed endpoint."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST to /api/embed and return the decoded, validated JSON."""
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at {self.base_url}/api/embed"
            ) from exc

        if "embeddings" not in data:
            raise RuntimeError(
                f"Ollama response did not include 'embeddings'. Response keys: {list(data.keys())}"
            )
        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Call the embed endpoint and return the raw embedding rows."""
        return self._post_embed(self._build_payload(text_or_texts))["embeddings"]
|
||||||
29
uv.lock
29
uv.lock
|
|
@ -250,7 +250,6 @@ name = "assistance-engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie", extra = ["semantic"] },
|
|
||||||
{ name = "grpcio" },
|
{ name = "grpcio" },
|
||||||
{ name = "grpcio-reflection" },
|
{ name = "grpcio-reflection" },
|
||||||
{ name = "grpcio-tools" },
|
{ name = "grpcio-tools" },
|
||||||
|
|
@ -273,7 +272,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir" },
|
{ name = "beir" },
|
||||||
{ name = "boto3" },
|
{ name = "boto3" },
|
||||||
|
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||||
{ name = "evidently" },
|
{ name = "evidently" },
|
||||||
|
{ name = "flatbuffers" },
|
||||||
{ name = "jupyter" },
|
{ name = "jupyter" },
|
||||||
{ name = "langfuse" },
|
{ name = "langfuse" },
|
||||||
{ name = "litellm" },
|
{ name = "litellm" },
|
||||||
|
|
@ -288,7 +289,6 @@ dev = [
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
|
||||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||||
|
|
@ -311,7 +311,9 @@ requires-dist = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir", specifier = ">=2.2.0" },
|
{ name = "beir", specifier = ">=2.2.0" },
|
||||||
{ name = "boto3", specifier = ">=1.42.58" },
|
{ name = "boto3", specifier = ">=1.42.58" },
|
||||||
|
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||||
{ name = "evidently", specifier = ">=0.7.20" },
|
{ name = "evidently", specifier = ">=0.7.20" },
|
||||||
|
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||||
{ name = "langfuse", specifier = "<3" },
|
{ name = "langfuse", specifier = "<3" },
|
||||||
{ name = "litellm", specifier = ">=1.82.0" },
|
{ name = "litellm", specifier = ">=1.82.0" },
|
||||||
|
|
@ -595,7 +597,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chonkie"
|
name = "chonkie"
|
||||||
version = "1.5.6"
|
version = "1.6.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie-core" },
|
{ name = "chonkie-core" },
|
||||||
|
|
@ -603,12 +605,15 @@ dependencies = [
|
||||||
{ name = "tenacity" },
|
{ name = "tenacity" },
|
||||||
{ name = "tqdm" },
|
{ name = "tqdm" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
|
elastic = [
|
||||||
|
{ name = "elasticsearch" },
|
||||||
|
]
|
||||||
semantic = [
|
semantic = [
|
||||||
{ name = "model2vec" },
|
{ name = "model2vec" },
|
||||||
{ name = "tokenizers" },
|
{ name = "tokenizers" },
|
||||||
|
|
@ -1061,6 +1066,14 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "flatbuffers"
|
||||||
|
version = "25.12.19"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fqdn"
|
name = "fqdn"
|
||||||
version = "1.5.1"
|
version = "1.5.1"
|
||||||
|
|
@ -3112,14 +3125,14 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-proto"
|
name = "opentelemetry-proto"
|
||||||
version = "1.39.1"
|
version = "1.40.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "protobuf" },
|
{ name = "protobuf" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue