Refactor Elasticsearch ingestion pipeline and add MBPP generation script
- Updated `elasticsearch_ingestion.py` to streamline document processing and ingestion into Elasticsearch. - Introduced `generate_mbap.py` for generating benchmark problems in AVAP language from a provided LRM. - Created `prompts.py` to define prompts for converting Python problems to AVAP. - Enhanced chunk processing in `chunk.py` to support markdown and AVAP documents. - Added `OllamaEmbeddings` class in `embeddings.py` for handling embeddings with Ollama model. - Updated dependencies in `uv.lock` to include new packages and versions.
This commit is contained in:
parent
3caed4deb6
commit
5f21544e0b
35
README.md
35
README.md
|
|
@ -47,11 +47,12 @@ graph TD
|
||||||
├── changelog # Version tracking and release history
|
├── changelog # Version tracking and release history
|
||||||
├── pyproject.toml # Python project configuration
|
├── pyproject.toml # Python project configuration
|
||||||
├── docs/
|
├── docs/
|
||||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||||
| | └── AVAP.md
|
│ │ └── AVAP.md
|
||||||
│ ├── developer.avapfr... # Documents on developer web page
|
│ ├── developer.avapfr... # Documents on developer web page
|
||||||
| └── LRM/ # AVAP LRM documentation
|
│ ├── LRM/ # AVAP LRM documentation
|
||||||
| └── avap.md
|
│ │ └── avap.md
|
||||||
|
│ └── samples/ # AVAP code samples
|
||||||
├── Docker/
|
├── Docker/
|
||||||
│ ├── protos/
|
│ ├── protos/
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||||
|
|
@ -64,26 +65,17 @@ graph TD
|
||||||
│ ├── Dockerfile # Container definition for the Engine
|
│ ├── Dockerfile # Container definition for the Engine
|
||||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||||
│ ├── requirements.txt # Python dependencies for Docker
|
│ ├── requirements.txt # Python dependencies for Docker
|
||||||
│ ├── protos/
|
│ └── .dockerignore # Docker ignore files
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
|
||||||
│ └── src/
|
|
||||||
│ ├── graph.py # Workflow graph orchestration
|
|
||||||
│ ├── prompts.py # Centralized prompt definitions
|
|
||||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
|
||||||
│ ├── state.py # Shared state management
|
|
||||||
│ └── utils/ # Utility modules
|
|
||||||
├── ingestion/
|
|
||||||
│ └── docs/ # AVAP documentation chunks
|
|
||||||
├── kubernetes/
|
├── kubernetes/
|
||||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
||||||
├── scripts/
|
├── scripts/
|
||||||
│ └── pipelines/
|
│ └── pipelines/
|
||||||
| ├── samples_generator/ # AVAP Sample generator
|
│ ├── flows/ # Processing pipelines
|
||||||
| | └─ generate_mbap.py
|
│ ├── tasks/ # Modules used by the flows
|
||||||
│ └── flows/ # Data processing flows
|
│ └── inputs/ # Inputs used by the flows
|
||||||
| └─ elasticsearch_ingestion.py
|
|
||||||
└── src/
|
└── src/
|
||||||
├── __init__.py
|
├── __init__.py
|
||||||
|
├── config.py # Environment variables configuration file
|
||||||
└── utils/
|
└── utils/
|
||||||
├── emb_factory.py # Embedding model factory
|
├── emb_factory.py # Embedding model factory
|
||||||
├── llm_factory.py # LLM model factory
|
├── llm_factory.py # LLM model factory
|
||||||
|
|
@ -157,6 +149,7 @@ OLLAMA_URL=http://host.docker.internal:11434
|
||||||
OLLAMA_LOCAL_URL=http://localhost:11434
|
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||||
|
HF_TOKEN=hf_...
|
||||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -183,13 +176,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. AI Model Tunnel (Ollama)
|
# 1. AI Model Tunnel (Ollama)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 3. Observability DB Tunnel (PostgreSQL)
|
# 3. Observability DB Tunnel (PostgreSQL)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
```
|
```
|
||||||
|
|
||||||
### 5. Launch the Engine
|
### 5. Launch the Engine
|
||||||
|
|
|
||||||
|
|
@ -15,13 +15,13 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
||||||
- `src/config.py`: environment variables configuration file.
|
- `src/config.py`: environment variables configuration file.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||||
- REMOVED: `Makefile` file.
|
- REMOVED: `Makefile` file.
|
||||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||||
- REMOVED `ingestion` folder.
|
- REMOVED `ingestion` folder.
|
||||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||||
|
|
||||||
|
|
||||||
## [1.4.0] - 2026-03-10
|
## [1.4.0] - 2026-03-10
|
||||||
|
|
|
||||||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chonkie[semantic]>=1.5.6",
|
|
||||||
"grpcio>=1.78.0",
|
"grpcio>=1.78.0",
|
||||||
"grpcio-reflection>=1.78.0",
|
"grpcio-reflection>=1.78.0",
|
||||||
"grpcio-tools>=1.78.0",
|
"grpcio-tools>=1.78.0",
|
||||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
"beir>=2.2.0",
|
"beir>=2.2.0",
|
||||||
"boto3>=1.42.58",
|
"boto3>=1.42.58",
|
||||||
|
"chonkie[elastic,semantic]>=1.6.0",
|
||||||
"evidently>=0.7.20",
|
"evidently>=0.7.20",
|
||||||
|
"flatbuffers>=25.12.19",
|
||||||
"jupyter>=1.1.1",
|
"jupyter>=1.1.1",
|
||||||
"langfuse<3",
|
"langfuse<3",
|
||||||
"litellm>=1.82.0",
|
"litellm>=1.82.0",
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,29 @@
|
||||||
import re
|
|
||||||
import hashlib
|
|
||||||
from typing import Any
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from langchain_core.documents import Document
|
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
from langchain_elasticsearch import ElasticsearchStore
|
||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from chonkie import SemanticChunker, MarkdownChef
|
||||||
from langchain_experimental.text_splitter import SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from src.utils.emb_factory import create_embedding_model
|
||||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
from scripts.pipelines.tasks.chunk import (
|
||||||
|
read_files,
|
||||||
|
get_chunk_docs,
|
||||||
|
convert_chunks_to_document
|
||||||
|
)
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||||
|
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
class DistanceStrategy(str, Enum):
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
euclidean = "EUCLIDEAN_DISTANCE"
|
||||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
||||||
jaccard = "JACCARD"
|
jaccard = "JACCARD"
|
||||||
cosine = "COSINE"
|
cosine = "COSINE"
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
|
||||||
text = text.replace("\u00a0", " ")
|
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
|
||||||
return text
|
|
||||||
|
|
||||||
def build_documents_from_folder(
|
|
||||||
folder_path: str,
|
|
||||||
) -> list[Document]:
|
|
||||||
|
|
||||||
folder = Path(folder_path)
|
|
||||||
|
|
||||||
if not folder.exists() or not folder.is_dir():
|
|
||||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
|
||||||
|
|
||||||
all_documents: list[Document] = []
|
|
||||||
|
|
||||||
for file_path in folder.glob("*.txt"):
|
|
||||||
doc_text = file_path.read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
if not doc_text.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
metadata: dict[str, Any] = {
|
|
||||||
"source": file_path.name,
|
|
||||||
}
|
|
||||||
|
|
||||||
doc_text = clean_text(doc_text)
|
|
||||||
document = Document(
|
|
||||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
|
||||||
page_content=doc_text,
|
|
||||||
metadata={**metadata}
|
|
||||||
)
|
|
||||||
|
|
||||||
all_documents.append(document)
|
|
||||||
|
|
||||||
return all_documents
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "ingestion/docs",
|
docs_folder_path: str = "docs",
|
||||||
|
es_index: str = "avap-docs-test-v2",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||||
|
chunk_size: int = 2048,
|
||||||
|
chunk_threshold: float = 0.5,
|
||||||
|
chunk_similarity_window: int = 3,
|
||||||
|
chunk_skip_window: int = 1,
|
||||||
):
|
):
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||||
|
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||||
|
|
||||||
|
logger.info("Instantiating semantic chunker and chef...")
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||||
|
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
chunker = SemanticChunker(
|
||||||
|
embedding_model=HF_EMB_MODEL_NAME,
|
||||||
|
chunk_size=chunk_size,
|
||||||
|
threshold=chunk_threshold,
|
||||||
|
similarity_window=chunk_similarity_window,
|
||||||
|
skip_window=chunk_skip_window
|
||||||
|
)
|
||||||
|
logger.info("Processing Markdown docs with chef...")
|
||||||
|
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||||
|
|
||||||
|
logger.info("Chunking AVAP Language docs...")
|
||||||
|
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||||
|
|
||||||
|
logger.info("Creating Langchain Document to index...")
|
||||||
|
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||||
|
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||||
|
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
logger.info("Connecting to Elasticsearch...")
|
||||||
try:
|
try:
|
||||||
|
|
@ -106,15 +95,19 @@ def elasticsearch_ingestion(
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
logger.exception("Failed to instantiate embeddings model.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||||
|
if es.indices.exists(index=es_index):
|
||||||
|
es.indices.delete(index=es_index)
|
||||||
|
|
||||||
|
logger.info(f"Uploading documents to index {es_index}...")
|
||||||
ElasticsearchStore.from_documents(
|
ElasticsearchStore.from_documents(
|
||||||
documents,
|
avap_documents,
|
||||||
embeddings,
|
embeddings,
|
||||||
client=es,
|
client=es,
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
index_name=es_index,
|
||||||
distance_strategy=distance_strategy.value,
|
distance_strategy=distance_strategy.value,
|
||||||
)
|
)
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,122 @@
|
||||||
|
import typer
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||||
|
|
||||||
|
app = typer.Typer()
|
||||||
|
|
||||||
|
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||||
|
chunk_threshold: float | None,
|
||||||
|
chunk_similarity_window: int| None,
|
||||||
|
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||||
|
"""
|
||||||
|
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs_extension (str): The file extension of the documents to be ingested.
|
||||||
|
chunk_size (int): The size of the chunks to be created.
|
||||||
|
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||||
|
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||||
|
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||||
|
"""
|
||||||
|
if docs_extension == ".md":
|
||||||
|
process_type = "markdown"
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||||
|
# process_type = "text"
|
||||||
|
# process_kwargs = {}
|
||||||
|
chunk_strat = "semantic"
|
||||||
|
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||||
|
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||||
|
|
||||||
|
elif docs_extension == ".avap":
|
||||||
|
process_type = "text"
|
||||||
|
process_kwargs = {}
|
||||||
|
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||||
|
chunk_kwargs = {"chunk_size": chunk_size}
|
||||||
|
|
||||||
|
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def elasticsearch_ingestion(
|
||||||
|
docs_folder_path: str = "docs/LRM",
|
||||||
|
docs_extension: str = ".md",
|
||||||
|
es_index: str = "avap-docs-test-v3",
|
||||||
|
es_request_timeout: int = 120,
|
||||||
|
es_max_retries: int = 5,
|
||||||
|
es_retry_on_timeout: bool = True,
|
||||||
|
delete_es_index: bool = True,
|
||||||
|
chunk_size: int = 2048,
|
||||||
|
chunk_threshold: float | None = 0.5,
|
||||||
|
chunk_similarity_window: int | None = 3,
|
||||||
|
chunk_skip_window: int | None = 1
|
||||||
|
):
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
processed_docs = []
|
||||||
|
fused_docs = []
|
||||||
|
logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
|
||||||
|
es = Elasticsearch(
|
||||||
|
hosts=settings.elasticsearch_local_url,
|
||||||
|
request_timeout=es_request_timeout,
|
||||||
|
max_retries=es_max_retries,
|
||||||
|
retry_on_timeout=es_retry_on_timeout,
|
||||||
|
)
|
||||||
|
|
||||||
|
if delete_es_index and es.indices.exists(index=es_index):
|
||||||
|
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||||
|
es.indices.delete(index=es_index)
|
||||||
|
|
||||||
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
|
(process_type,
|
||||||
|
process_kwargs,
|
||||||
|
chunk_strat,
|
||||||
|
chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)
|
||||||
|
|
||||||
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
|
fetcher = FileFetcher()
|
||||||
|
docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||||
|
|
||||||
|
logger.info(f"Processing documents with process_type: {process_type}...")
|
||||||
|
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
for doc in docs:
|
||||||
|
processed_doc = chef.process(doc)
|
||||||
|
processed_docs.append(processed_doc)
|
||||||
|
|
||||||
|
logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
|
||||||
|
for processed_doc in processed_docs:
|
||||||
|
fused_doc = merge_markdown_document(processed_doc)
|
||||||
|
fused_docs.append(fused_doc)
|
||||||
|
|
||||||
|
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||||
|
handshake = ElasticHandshake(
|
||||||
|
client=es,
|
||||||
|
index_name=es_index,
|
||||||
|
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
|
||||||
|
)
|
||||||
|
for fused_doc in fused_docs:
|
||||||
|
handshake.write(fused_doc.chunks)
|
||||||
|
|
||||||
|
logger.info(f"Finished ingesting in {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
app()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception(exc)
|
||||||
|
raise
|
||||||
|
|
@ -1,134 +1,38 @@
|
||||||
from enum import Enum
|
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from chonkie import FileFetcher
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
|
||||||
from chonkie import SemanticChunker
|
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from src.config import settings
|
||||||
from scripts.pipelines.tasks.chunk import (
|
from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
|
||||||
read_files,
|
|
||||||
get_chunk_docs,
|
|
||||||
convert_chunks_to_document
|
|
||||||
)
|
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
|
||||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
|
||||||
max_inner_product = "MAX_INNER_PRODUCT"
|
|
||||||
dot_product = "DOT_PRODUCT"
|
|
||||||
jaccard = "JACCARD"
|
|
||||||
cosine = "COSINE"
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "docs",
|
docs_folder_path: str = "docs/LRM",
|
||||||
|
docs_extension: str = ".md",
|
||||||
|
es_index: str = "avap-docs-test-v3",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
delete_es_index: bool = True
|
||||||
chunk_size: int = 2048,
|
|
||||||
chunk_threshold: float = 0.5,
|
|
||||||
chunk_similarity_window: int = 3,
|
|
||||||
chunk_skip_window: int = 1,
|
|
||||||
):
|
):
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
fetcher = FileFetcher()
|
||||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||||
|
|
||||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
logger.info("Processing docs...")
|
||||||
chapters = sorted({
|
chunked_docs = process_documents(docs_path, docs_extension)
|
||||||
p.name.split("_")[0]
|
|
||||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
|
||||||
})
|
|
||||||
|
|
||||||
avap_web_docs_chapters = [
|
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||||
item
|
ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||||
for chapter in chapters
|
es_retry_on_timeout, delete_es_index)
|
||||||
for item in read_files(
|
|
||||||
f"{docs_folder_path}/developer.avapframework.com",
|
|
||||||
f"{chapter}_",
|
|
||||||
concatenate=True
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
logger.info(f"Finished ingesting in {es_index}.")
|
||||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
|
||||||
|
|
||||||
logger.info("Instantiating semantic chunker...")
|
|
||||||
chunker = SemanticChunker(
|
|
||||||
embedding_model=HF_EMB_MODEL_NAME,
|
|
||||||
chunk_size=chunk_size,
|
|
||||||
threshold=chunk_threshold,
|
|
||||||
similarity_window=chunk_similarity_window,
|
|
||||||
skip_window=chunk_skip_window
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP GitHub docs...")
|
|
||||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP web docs chapters...")
|
|
||||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
|
||||||
|
|
||||||
logger.info("Creating Langchain Document to index...")
|
|
||||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
|
||||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
|
||||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
|
||||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
|
||||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
|
||||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
|
||||||
try:
|
|
||||||
es = Elasticsearch(
|
|
||||||
ELASTICSEARCH_LOCAL_URL,
|
|
||||||
request_timeout=es_request_timeout,
|
|
||||||
max_retries=es_max_retries,
|
|
||||||
retry_on_timeout=es_retry_on_timeout,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to connect to Elasticsearch.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info("Instantiating embeddings model...")
|
|
||||||
try:
|
|
||||||
embeddings = create_embedding_model(
|
|
||||||
provider="ollama",
|
|
||||||
model=OLLAMA_EMB_MODEL_NAME,
|
|
||||||
base_url=OLLAMA_LOCAL_URL,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
|
||||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
|
||||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
|
||||||
ElasticsearchStore.from_documents(
|
|
||||||
avap_documents,
|
|
||||||
embeddings,
|
|
||||||
client=es,
|
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
|
||||||
distance_strategy=distance_strategy.value,
|
|
||||||
)
|
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1,136 +1,159 @@
|
||||||
import os
|
from copy import deepcopy
|
||||||
import re
|
from dataclasses import replace
|
||||||
import uuid
|
from pathlib import Path
|
||||||
|
|
||||||
|
from chonkie import (
|
||||||
|
Chunk,
|
||||||
|
ElasticHandshake,
|
||||||
|
MarkdownChef,
|
||||||
|
TextChef,
|
||||||
|
TokenChunker,
|
||||||
|
)
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from chonkie import Chunk, SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
from langchain_core.documents import Document
|
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
def replace_javascript_with_avap(text: str) -> str:
|
def _get_text(element) -> str:
|
||||||
"""
|
for attr in ("text", "content", "markdown"):
|
||||||
Replace mentions of javascript language with avap in the text.
|
value = getattr(element, attr, None)
|
||||||
Handles code blocks, language identifiers, and references.
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
Args:
|
raise AttributeError(
|
||||||
text: The text to process.
|
f"Could not extract text from element of type {type(element).__name__}"
|
||||||
|
|
||||||
Returns:
|
|
||||||
The text with javascript references replaced with avap.
|
|
||||||
"""
|
|
||||||
# Replace ```javascript with ```avap
|
|
||||||
text = text.replace("```javascript", "```avap")
|
|
||||||
|
|
||||||
# Replace ```js with ```avap
|
|
||||||
text = text.replace("```js", "```avap")
|
|
||||||
|
|
||||||
# Replace common phrases (case-insensitive)
|
|
||||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
|
||||||
text = re.sub(
|
|
||||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
|
||||||
)
|
)
|
||||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
def read_files(
|
def _merge_markdown_document(doc):
|
||||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
elements = []
|
||||||
) -> list[dict]:
|
|
||||||
"""
|
|
||||||
Read files in a folder whose names start with a given prefix.
|
|
||||||
Replaces javascript language markers with avap.
|
|
||||||
|
|
||||||
Args:
|
for chunk in doc.chunks:
|
||||||
folder_path: Path to the folder to search in.
|
elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
|
||||||
file_prefix: The prefix that file names must start with.
|
|
||||||
If None, all files in the folder are included.
|
|
||||||
concatenate: Whether to concatenate the contents of the files.
|
|
||||||
|
|
||||||
Returns:
|
for code in doc.code:
|
||||||
A list of dictionaries, each containing 'content' and 'title' keys.
|
elements.append(("code", code.start_index, code.end_index, code))
|
||||||
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
|
||||||
If concatenate is False, returns one dict per file with filename as title.
|
|
||||||
"""
|
|
||||||
contents = []
|
|
||||||
filenames = []
|
|
||||||
|
|
||||||
for filename in sorted(os.listdir(folder_path)):
|
for table in doc.tables:
|
||||||
include_file = file_prefix is None or filename.startswith(file_prefix)
|
elements.append(("table", table.start_index, table.end_index, table))
|
||||||
if include_file:
|
|
||||||
file_path = os.path.join(folder_path, filename)
|
|
||||||
if os.path.isfile(file_path):
|
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
|
||||||
content = f.read()
|
|
||||||
cleaned_content = content.strip()
|
|
||||||
if cleaned_content:
|
|
||||||
contents.append(cleaned_content)
|
|
||||||
filenames.append(filename)
|
|
||||||
|
|
||||||
if concatenate:
|
elements.sort(key=lambda item: (item[1], item[2]))
|
||||||
concatenated = "\n".join(contents)
|
|
||||||
processed_content = replace_javascript_with_avap(concatenated)
|
merged_chunks = []
|
||||||
title = file_prefix if file_prefix is not None else "all_files"
|
current_chunk = None
|
||||||
return [{"content": processed_content, "title": title}]
|
current_parts = []
|
||||||
else:
|
current_end_index = None
|
||||||
return [
|
current_token_count = None
|
||||||
{"content": replace_javascript_with_avap(content), "title": filename}
|
|
||||||
for content, filename in zip(contents, filenames)
|
def flush():
|
||||||
]
|
nonlocal current_chunk, current_parts, current_end_index, current_token_count
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
merged_text = "\n\n".join(part for part in current_parts if part)
|
||||||
|
|
||||||
|
merged_chunks.append(
|
||||||
|
replace(
|
||||||
|
current_chunk,
|
||||||
|
text=merged_text,
|
||||||
|
end_index=current_end_index,
|
||||||
|
token_count=current_token_count,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
current_chunk = None
|
||||||
|
current_parts = []
|
||||||
|
current_end_index = None
|
||||||
|
current_token_count = None
|
||||||
|
|
||||||
|
for kind, _, _, element in elements:
|
||||||
|
if kind == "chunk":
|
||||||
|
flush()
|
||||||
|
current_chunk = element
|
||||||
|
current_parts = [_get_text(element)]
|
||||||
|
current_end_index = element.end_index
|
||||||
|
current_token_count = element.token_count
|
||||||
|
continue
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
current_parts.append(_get_text(element))
|
||||||
|
current_end_index = max(current_end_index, element.end_index)
|
||||||
|
current_token_count += getattr(element, "token_count", 0)
|
||||||
|
|
||||||
|
flush()
|
||||||
|
|
||||||
|
new_doc = deepcopy(doc)
|
||||||
|
new_doc.chunks = merged_chunks
|
||||||
|
new_doc.code = doc.code
|
||||||
|
new_doc.tables = doc.tables
|
||||||
|
|
||||||
|
return new_doc
|
||||||
|
|
||||||
|
|
||||||
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
    """Process and chunk a set of documents according to their file type.

    Markdown documents go through the ``MarkdownChef`` pipeline and have their
    code/table elements merged back into their chunks; AVAP documents are
    processed as plain text and split with a token-based chunker.

    Args:
        docs_path: Paths of the documents to process.
        docs_extension: File extension selecting the pipeline (".md" or ".avap").

    Returns:
        A flat list of Chunk objects gathered from all documents.

    Raises:
        ValueError: If ``docs_extension`` is not a supported extension.
    """
    chunked_docs: list[Chunk] = []
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)

    if docs_extension == ".md":
        chef = MarkdownChef(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            # Fuse code/table elements into their surrounding chunks before
            # collecting, so each chunk carries its full markdown context.
            fused_doc = _merge_markdown_document(processed_doc)
            chunked_docs.extend(fused_doc.chunks)
    elif docs_extension == ".avap":
        chef = TextChef()
        chunker = TokenChunker(tokenizer=custom_tokenizer)
        for doc in docs_path:
            processed_doc = chef.process(doc)
            chunked_docs.extend(chunker.chunk(processed_doc.content))
    else:
        # Fail loudly instead of silently returning an empty list.
        raise ValueError(f"Unsupported document extension: {docs_extension!r}")

    return chunked_docs
|
||||||
|
|
||||||
|
|
||||||
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
def ingest_documents(
    chunked_docs: list[Chunk],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> None:
    """Write pre-chunked documents into an Elasticsearch index.

    Args:
        chunked_docs: Chunks to ingest.
        es_index: Target Elasticsearch index name.
        es_request_timeout: Per-request timeout (seconds) for the client.
        es_max_retries: Maximum retry attempts on failed requests.
        es_retry_on_timeout: Whether timed-out requests are retried.
        delete_es_index: When True, drop a pre-existing index before writing.
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    client = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )

    if delete_es_index:
        # Only attempt the delete when the index actually exists.
        if client.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            client.indices.delete(index=es_index)

    writer = ElasticHandshake(
        client=client,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
    )

    logger.info(
        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
    )
    writer.write(chunked_docs)
|
||||||
title = chunk.context.get("source", "unknown")
|
|
||||||
documents.append(Document(id=str(uuid.uuid4()),
|
|
||||||
page_content=content,
|
|
||||||
metadata={"source": title}))
|
|
||||||
|
|
||||||
return documents
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,125 @@
|
||||||
|
import requests
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from chonkie.embeddings import BaseEmbeddings
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model."""

    def __init__(
        self,
        model: str,
        base_url: str = settings.ollama_local_url,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality; resolved lazily on first use.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors (discovered lazily)."""
        if self._dimension is None:
            # Probe the model once with a minimal input to learn the size.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single string and return a float32 vector."""
        rows = self._embed_api(text)
        embedding = np.asarray(rows[0], dtype=np.float32)
        if self._dimension is None:
            self._dimension = int(embedding.shape[0])
        return embedding

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several strings with a single API call."""
        if not texts:
            return []
        vectors = [
            np.asarray(row, dtype=np.float32) for row in self._embed_api(texts)
        ]
        if self._dimension is None and vectors:
            self._dimension = int(vectors[0].shape[0])
        return vectors

    def count_tokens(self, text: str) -> int:
        """Count prompt tokens by issuing a real embed request."""
        result = self._post_embed(self._build_payload(text))
        return int(result["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Count tokens for each input individually.

        Ollama returns a single prompt_eval_count per request rather than
        one per input item, so each text requires its own call.
        """
        return [self.count_tokens(item) for item in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a token-counting callable, which is all Chonkie needs."""
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True when the local Ollama server responds to /api/tags."""
        try:
            probe = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            probe.raise_for_status()
        except requests.RequestException:
            return False
        return True

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings(model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r})"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Assemble the JSON request body for Ollama's embed endpoint."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST to /api/embed and return the decoded, validated JSON."""
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at {self.base_url}/api/embed"
            ) from exc

        if "embeddings" not in data:
            raise RuntimeError(
                f"Ollama response did not include 'embeddings'. Response keys: {list(data.keys())}"
            )
        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Call the embed endpoint and return the raw embedding rows."""
        return self._post_embed(self._build_payload(text_or_texts))["embeddings"]
|
||||||
29
uv.lock
29
uv.lock
|
|
@ -250,7 +250,6 @@ name = "assistance-engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie", extra = ["semantic"] },
|
|
||||||
{ name = "grpcio" },
|
{ name = "grpcio" },
|
||||||
{ name = "grpcio-reflection" },
|
{ name = "grpcio-reflection" },
|
||||||
{ name = "grpcio-tools" },
|
{ name = "grpcio-tools" },
|
||||||
|
|
@ -273,7 +272,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir" },
|
{ name = "beir" },
|
||||||
{ name = "boto3" },
|
{ name = "boto3" },
|
||||||
|
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||||
{ name = "evidently" },
|
{ name = "evidently" },
|
||||||
|
{ name = "flatbuffers" },
|
||||||
{ name = "jupyter" },
|
{ name = "jupyter" },
|
||||||
{ name = "langfuse" },
|
{ name = "langfuse" },
|
||||||
{ name = "litellm" },
|
{ name = "litellm" },
|
||||||
|
|
@ -288,7 +289,6 @@ dev = [
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
|
||||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||||
|
|
@ -311,7 +311,9 @@ requires-dist = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir", specifier = ">=2.2.0" },
|
{ name = "beir", specifier = ">=2.2.0" },
|
||||||
{ name = "boto3", specifier = ">=1.42.58" },
|
{ name = "boto3", specifier = ">=1.42.58" },
|
||||||
|
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||||
{ name = "evidently", specifier = ">=0.7.20" },
|
{ name = "evidently", specifier = ">=0.7.20" },
|
||||||
|
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||||
{ name = "langfuse", specifier = "<3" },
|
{ name = "langfuse", specifier = "<3" },
|
||||||
{ name = "litellm", specifier = ">=1.82.0" },
|
{ name = "litellm", specifier = ">=1.82.0" },
|
||||||
|
|
@ -595,7 +597,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chonkie"
|
name = "chonkie"
|
||||||
version = "1.5.6"
|
version = "1.6.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie-core" },
|
{ name = "chonkie-core" },
|
||||||
|
|
@ -603,12 +605,15 @@ dependencies = [
|
||||||
{ name = "tenacity" },
|
{ name = "tenacity" },
|
||||||
{ name = "tqdm" },
|
{ name = "tqdm" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
|
elastic = [
|
||||||
|
{ name = "elasticsearch" },
|
||||||
|
]
|
||||||
semantic = [
|
semantic = [
|
||||||
{ name = "model2vec" },
|
{ name = "model2vec" },
|
||||||
{ name = "tokenizers" },
|
{ name = "tokenizers" },
|
||||||
|
|
@ -1061,6 +1066,14 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "flatbuffers"
|
||||||
|
version = "25.12.19"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fqdn"
|
name = "fqdn"
|
||||||
version = "1.5.1"
|
version = "1.5.1"
|
||||||
|
|
@ -3112,14 +3125,14 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-proto"
|
name = "opentelemetry-proto"
|
||||||
version = "1.39.1"
|
version = "1.40.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "protobuf" },
|
{ name = "protobuf" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue