Refactor Elasticsearch ingestion pipeline and add MBPP generation script
- Updated `elasticsearch_ingestion.py` to streamline document processing and ingestion into Elasticsearch. - Introduced `generate_mbap.py` for generating benchmark problems in AVAP language from a provided LRM. - Created `prompts.py` to define prompts for converting Python problems to AVAP. - Enhanced chunk processing in `chunk.py` to support markdown and AVAP documents. - Added `OllamaEmbeddings` class in `embeddings.py` for handling embeddings with Ollama model. - Updated dependencies in `uv.lock` to include new packages and versions.
This commit is contained in:
parent
3caed4deb6
commit
5f21544e0b
35
README.md
35
README.md
|
|
@ -47,11 +47,12 @@ graph TD
|
|||
├── changelog # Version tracking and release history
|
||||
├── pyproject.toml # Python project configuration
|
||||
├── docs/
|
||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
| | └── AVAP.md
|
||||
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
│ │ └── AVAP.md
|
||||
│ ├── developer.avapfr... # Documents on developer web page
|
||||
| └── LRM/ # AVAP LRM documentation
|
||||
| └── avap.md
|
||||
│ ├── LRM/ # AVAP LRM documentation
|
||||
│ │ └── avap.md
|
||||
│ └── samples/ # AVAP code samples
|
||||
├── Docker/
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
|
|
@ -64,26 +65,17 @@ graph TD
|
|||
│ ├── Dockerfile # Container definition for the Engine
|
||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||
│ ├── requirements.txt # Python dependencies for Docker
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
│ └── src/
|
||||
│ ├── graph.py # Workflow graph orchestration
|
||||
│ ├── prompts.py # Centralized prompt definitions
|
||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
||||
│ ├── state.py # Shared state management
|
||||
│ └── utils/ # Utility modules
|
||||
├── ingestion/
|
||||
│ └── docs/ # AVAP documentation chunks
|
||||
│ └── .dockerignore # Docker ignore files
|
||||
├── kubernetes/
|
||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
||||
├── scripts/
|
||||
│ └── pipelines/
|
||||
| ├── samples_generator/ # AVAP Sample generator
|
||||
| | └─ generate_mbap.py
|
||||
│ └── flows/ # Data processing flows
|
||||
| └─ elasticsearch_ingestion.py
|
||||
│ ├── flows/ # Processing pipelines
|
||||
│ ├── tasks/ # Modules used by the flows
|
||||
│ └── inputs/ # Inputs used by the flows
|
||||
└── src/
|
||||
├── __init__.py
|
||||
├── config.py # Environment variables configuration file
|
||||
└── utils/
|
||||
├── emb_factory.py # Embedding model factory
|
||||
├── llm_factory.py # LLM model factory
|
||||
|
|
@ -157,6 +149,7 @@ OLLAMA_URL=http://host.docker.internal:11434
|
|||
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||
HF_TOKEN=hf_...
|
||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||
```
|
||||
|
||||
|
|
@ -183,13 +176,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
|||
|
||||
```bash
|
||||
# 1. AI Model Tunnel (Ollama)
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 3. Observability DB Tunnel (PostgreSQL)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
```
|
||||
|
||||
### 5. Launch the Engine
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
|||
- `src/config.py`: environment variables configuration file.
|
||||
|
||||
### Changed
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||
- REMOVED: `Makefile` file.
|
||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||
- REMOVED `ingestion` folder.
|
||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
||||
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||
|
||||
|
||||
## [1.4.0] - 2026-03-10
|
||||
|
|
|
|||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
|||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"chonkie[semantic]>=1.5.6",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
"grpcio-tools>=1.78.0",
|
||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
|||
dev = [
|
||||
"beir>=2.2.0",
|
||||
"boto3>=1.42.58",
|
||||
"chonkie[elastic,semantic]>=1.6.0",
|
||||
"evidently>=0.7.20",
|
||||
"flatbuffers>=25.12.19",
|
||||
"jupyter>=1.1.1",
|
||||
"langfuse<3",
|
||||
"litellm>=1.82.0",
|
||||
|
|
|
|||
|
|
@ -1,30 +1,29 @@
|
|||
import re
|
||||
import hashlib
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_core.documents import Document
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||
from langchain_experimental.text_splitter import SemanticChunker
|
||||
from chonkie import SemanticChunker, MarkdownChef
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
|||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
text = text.replace("\u00a0", " ")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
def build_documents_from_folder(
|
||||
folder_path: str,
|
||||
) -> list[Document]:
|
||||
|
||||
folder = Path(folder_path)
|
||||
|
||||
if not folder.exists() or not folder.is_dir():
|
||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
||||
|
||||
all_documents: list[Document] = []
|
||||
|
||||
for file_path in folder.glob("*.txt"):
|
||||
doc_text = file_path.read_text(encoding="utf-8")
|
||||
|
||||
if not doc_text.strip():
|
||||
continue
|
||||
|
||||
metadata: dict[str, Any] = {
|
||||
"source": file_path.name,
|
||||
}
|
||||
|
||||
doc_text = clean_text(doc_text)
|
||||
document = Document(
|
||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
||||
page_content=doc_text,
|
||||
metadata={**metadata}
|
||||
)
|
||||
|
||||
all_documents.append(document)
|
||||
|
||||
return all_documents
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "ingestion/docs",
|
||||
docs_folder_path: str = "docs",
|
||||
es_index: str = "avap-docs-test-v2",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
):
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
):
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
||||
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||
|
||||
logger.info("Instantiating semantic chunker and chef...")
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
logger.info("Processing Markdown docs with chef...")
|
||||
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||
|
||||
logger.info("Chunking AVAP Language docs...")
|
||||
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
|
|
@ -105,16 +94,20 @@ def elasticsearch_ingestion(
|
|||
except:
|
||||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||
if es.indices.exists(index=es_index):
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
logger.info(f"Uploading documents to index {es_index}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
documents,
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
index_name=es_index,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -0,0 +1,122 @@
|
|||
import typer
|
||||
import logging
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.config import settings
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||
chunk_threshold: float | None,
|
||||
chunk_similarity_window: int| None,
|
||||
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||
"""
|
||||
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||
|
||||
Args:
|
||||
docs_extension (str): The file extension of the documents to be ingested.
|
||||
chunk_size (int): The size of the chunks to be created.
|
||||
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||
|
||||
Returns:
|
||||
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||
"""
|
||||
if docs_extension == ".md":
|
||||
process_type = "markdown"
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||
# process_type = "text"
|
||||
# process_kwargs = {}
|
||||
chunk_strat = "semantic"
|
||||
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||
|
||||
elif docs_extension == ".avap":
|
||||
process_type = "text"
|
||||
process_kwargs = {}
|
||||
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||
chunk_kwargs = {"chunk_size": chunk_size}
|
||||
|
||||
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "docs/LRM",
|
||||
docs_extension: str = ".md",
|
||||
es_index: str = "avap-docs-test-v3",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
delete_es_index: bool = True,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float | None = 0.5,
|
||||
chunk_similarity_window: int | None = 3,
|
||||
chunk_skip_window: int | None = 1
|
||||
):
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
processed_docs = []
|
||||
fused_docs = []
|
||||
logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
|
||||
es = Elasticsearch(
|
||||
hosts=settings.elasticsearch_local_url,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
|
||||
if delete_es_index and es.indices.exists(index=es_index):
|
||||
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
(process_type,
|
||||
process_kwargs,
|
||||
chunk_strat,
|
||||
chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)
|
||||
|
||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||
fetcher = FileFetcher()
|
||||
docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||
|
||||
logger.info(f"Processing documents with process_type: {process_type}...")
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
for doc in docs:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append(processed_doc)
|
||||
|
||||
logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
|
||||
for processed_doc in processed_docs:
|
||||
fused_doc = merge_markdown_document(processed_doc)
|
||||
fused_docs.append(fused_doc)
|
||||
|
||||
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||
handshake = ElasticHandshake(
|
||||
client=es,
|
||||
index_name=es_index,
|
||||
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
|
||||
)
|
||||
for fused_doc in fused_docs:
|
||||
handshake.write(fused_doc.chunks)
|
||||
|
||||
logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||
)
|
||||
try:
|
||||
app()
|
||||
except Exception as exc:
|
||||
logger.exception(exc)
|
||||
raise
|
||||
|
|
@ -1,134 +1,38 @@
|
|||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from chonkie import SemanticChunker
|
||||
from chonkie import FileFetcher
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
)
|
||||
from src.config import settings
|
||||
from scripts.pipelines.tasks.chunk import process_documents, ingest_documents
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
max_inner_product = "MAX_INNER_PRODUCT"
|
||||
dot_product = "DOT_PRODUCT"
|
||||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "docs",
|
||||
docs_folder_path: str = "docs/LRM",
|
||||
docs_extension: str = ".md",
|
||||
es_index: str = "avap-docs-test-v3",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
delete_es_index: bool = True
|
||||
):
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
||||
|
||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
||||
chapters = sorted({
|
||||
p.name.split("_")[0]
|
||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
||||
})
|
||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||
fetcher = FileFetcher()
|
||||
docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")
|
||||
|
||||
avap_web_docs_chapters = [
|
||||
item
|
||||
for chapter in chapters
|
||||
for item in read_files(
|
||||
f"{docs_folder_path}/developer.avapframework.com",
|
||||
f"{chapter}_",
|
||||
concatenate=True
|
||||
)
|
||||
]
|
||||
logger.info("Processing docs...")
|
||||
chunked_docs = process_documents(docs_path, docs_extension)
|
||||
|
||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
||||
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||
ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||
es_retry_on_timeout, delete_es_index)
|
||||
|
||||
logger.info("Instantiating semantic chunker...")
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
|
||||
logger.info("Chunking AVAP GitHub docs...")
|
||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
||||
|
||||
logger.info("Chunking AVAP web docs chapters...")
|
||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
es = Elasticsearch(
|
||||
ELASTICSEARCH_LOCAL_URL,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to connect to Elasticsearch.")
|
||||
raise
|
||||
|
||||
logger.info("Instantiating embeddings model...")
|
||||
try:
|
||||
embeddings = create_embedding_model(
|
||||
provider="ollama",
|
||||
model=OLLAMA_EMB_MODEL_NAME,
|
||||
base_url=OLLAMA_LOCAL_URL,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -140,4 +44,4 @@ if __name__ == "__main__":
|
|||
app()
|
||||
except Exception as exc:
|
||||
logger.exception(exc)
|
||||
raise
|
||||
raise
|
||||
|
|
@ -1,136 +1,159 @@
|
|||
import os
|
||||
import re
|
||||
import uuid
|
||||
from copy import deepcopy
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
|
||||
from chonkie import (
|
||||
Chunk,
|
||||
ElasticHandshake,
|
||||
MarkdownChef,
|
||||
TextChef,
|
||||
TokenChunker,
|
||||
)
|
||||
from elasticsearch import Elasticsearch
|
||||
from loguru import logger
|
||||
from chonkie import Chunk, SemanticChunker
|
||||
from langchain_core.documents import Document
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from src.config import settings
|
||||
|
||||
|
||||
def replace_javascript_with_avap(text: str) -> str:
|
||||
"""
|
||||
Replace mentions of javascript language with avap in the text.
|
||||
Handles code blocks, language identifiers, and references.
|
||||
|
||||
Args:
|
||||
text: The text to process.
|
||||
|
||||
Returns:
|
||||
The text with javascript references replaced with avap.
|
||||
"""
|
||||
# Replace ```javascript with ```avap
|
||||
text = text.replace("```javascript", "```avap")
|
||||
|
||||
# Replace ```js with ```avap
|
||||
text = text.replace("```js", "```avap")
|
||||
|
||||
# Replace common phrases (case-insensitive)
|
||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
||||
text = re.sub(
|
||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
||||
def _get_text(element) -> str:
|
||||
for attr in ("text", "content", "markdown"):
|
||||
value = getattr(element, attr, None)
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
raise AttributeError(
|
||||
f"Could not extract text from element of type {type(element).__name__}"
|
||||
)
|
||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def read_files(
|
||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Read files in a folder whose names start with a given prefix.
|
||||
Replaces javascript language markers with avap.
|
||||
def _merge_markdown_document(doc):
|
||||
elements = []
|
||||
|
||||
Args:
|
||||
folder_path: Path to the folder to search in.
|
||||
file_prefix: The prefix that file names must start with.
|
||||
If None, all files in the folder are included.
|
||||
concatenate: Whether to concatenate the contents of the files.
|
||||
for chunk in doc.chunks:
|
||||
elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
|
||||
|
||||
Returns:
|
||||
A list of dictionaries, each containing 'content' and 'title' keys.
|
||||
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
||||
If concatenate is False, returns one dict per file with filename as title.
|
||||
"""
|
||||
contents = []
|
||||
filenames = []
|
||||
for code in doc.code:
|
||||
elements.append(("code", code.start_index, code.end_index, code))
|
||||
|
||||
for filename in sorted(os.listdir(folder_path)):
|
||||
include_file = file_prefix is None or filename.startswith(file_prefix)
|
||||
if include_file:
|
||||
file_path = os.path.join(folder_path, filename)
|
||||
if os.path.isfile(file_path):
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
cleaned_content = content.strip()
|
||||
if cleaned_content:
|
||||
contents.append(cleaned_content)
|
||||
filenames.append(filename)
|
||||
for table in doc.tables:
|
||||
elements.append(("table", table.start_index, table.end_index, table))
|
||||
|
||||
if concatenate:
|
||||
concatenated = "\n".join(contents)
|
||||
processed_content = replace_javascript_with_avap(concatenated)
|
||||
title = file_prefix if file_prefix is not None else "all_files"
|
||||
return [{"content": processed_content, "title": title}]
|
||||
else:
|
||||
return [
|
||||
{"content": replace_javascript_with_avap(content), "title": filename}
|
||||
for content, filename in zip(contents, filenames)
|
||||
]
|
||||
elements.sort(key=lambda item: (item[1], item[2]))
|
||||
|
||||
merged_chunks = []
|
||||
current_chunk = None
|
||||
current_parts = []
|
||||
current_end_index = None
|
||||
current_token_count = None
|
||||
|
||||
def flush():
|
||||
nonlocal current_chunk, current_parts, current_end_index, current_token_count
|
||||
|
||||
if current_chunk is None:
|
||||
return
|
||||
|
||||
merged_text = "\n\n".join(part for part in current_parts if part)
|
||||
|
||||
merged_chunks.append(
|
||||
replace(
|
||||
current_chunk,
|
||||
text=merged_text,
|
||||
end_index=current_end_index,
|
||||
token_count=current_token_count,
|
||||
)
|
||||
)
|
||||
|
||||
current_chunk = None
|
||||
current_parts = []
|
||||
current_end_index = None
|
||||
current_token_count = None
|
||||
|
||||
for kind, _, _, element in elements:
|
||||
if kind == "chunk":
|
||||
flush()
|
||||
current_chunk = element
|
||||
current_parts = [_get_text(element)]
|
||||
current_end_index = element.end_index
|
||||
current_token_count = element.token_count
|
||||
continue
|
||||
|
||||
if current_chunk is None:
|
||||
continue
|
||||
|
||||
current_parts.append(_get_text(element))
|
||||
current_end_index = max(current_end_index, element.end_index)
|
||||
current_token_count += getattr(element, "token_count", 0)
|
||||
|
||||
flush()
|
||||
|
||||
new_doc = deepcopy(doc)
|
||||
new_doc.chunks = merged_chunks
|
||||
new_doc.code = doc.code
|
||||
new_doc.tables = doc.tables
|
||||
|
||||
return new_doc
|
||||
|
||||
|
||||
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
||||
"""
|
||||
Chunk the content of the documents using the provided chunker.
|
||||
def process_documents(docs_path: list[Path], docs_extension: str) -> list[Chunk]:
|
||||
processed_docs = []
|
||||
chunked_docs = []
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
|
||||
Args:
|
||||
docs: A list of dictionaries, each containing 'content' and 'title' keys.
|
||||
chunker: An instance of SemanticChunker to use for chunking the content.
|
||||
if docs_extension == ".md":
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
for doc in docs_path:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append((processed_doc, doc.name))
|
||||
|
||||
Returns:
|
||||
A list of lists of Chunk objects, where each inner list corresponds to the chunks of a
|
||||
single document.
|
||||
"""
|
||||
list_chunks = []
|
||||
for processed_doc, filename in processed_docs:
|
||||
fused_doc = _merge_markdown_document(processed_doc)
|
||||
chunked_docs.extend(fused_doc.chunks)
|
||||
|
||||
for doc in docs:
|
||||
content = doc["content"]
|
||||
chunks = chunker.chunk(content)
|
||||
for chunk in chunks:
|
||||
chunk.context = {"source": doc["title"]}
|
||||
list_chunks.append(chunks)
|
||||
logger.info(f"Finished chunking {doc['title']}")
|
||||
elif docs_extension == ".avap":
|
||||
chef = TextChef()
|
||||
chunker = TokenChunker(tokenizer=custom_tokenizer)
|
||||
for doc in docs_path:
|
||||
processed_doc = chef.process(doc)
|
||||
processed_docs.append((processed_doc, doc.name))
|
||||
|
||||
return list_chunks
|
||||
for processed_doc, filename in processed_docs:
|
||||
chunked_doc = chunker.chunk(processed_doc.content)
|
||||
chunked_docs.extend(chunked_doc)
|
||||
|
||||
return chunked_docs
|
||||
|
||||
|
||||
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
||||
"""
|
||||
Convert the chunked content into a list of Document objects.
|
||||
def ingest_documents(
|
||||
chunked_docs: list[Chunk],
|
||||
es_index: str,
|
||||
es_request_timeout: int,
|
||||
es_max_retries: int,
|
||||
es_retry_on_timeout: bool,
|
||||
delete_es_index: bool,
|
||||
) -> None:
|
||||
|
||||
Args:
|
||||
chunks: A list of dictionaries containing 'content' and 'title' keys.
|
||||
logger.info(
|
||||
f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
|
||||
)
|
||||
es = Elasticsearch(
|
||||
hosts=settings.elasticsearch_local_url,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
|
||||
Returns:
|
||||
A list of Document objects created from the chunked content.
|
||||
"""
|
||||
documents = []
|
||||
if delete_es_index and es.indices.exists(index=es_index):
|
||||
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
if isinstance(chunks[0], dict):
|
||||
for chunk in chunks:
|
||||
content = chunk["content"]
|
||||
title = chunk["title"]
|
||||
documents.append(Document(id=str(uuid.uuid4()),
|
||||
page_content=content,
|
||||
metadata={"source": title}))
|
||||
|
||||
else:
|
||||
for chunk_list in chunks:
|
||||
for chunk in chunk_list:
|
||||
content = chunk.text
|
||||
title = chunk.context.get("source", "unknown")
|
||||
documents.append(Document(id=str(uuid.uuid4()),
|
||||
page_content=content,
|
||||
metadata={"source": title}))
|
||||
handshake = ElasticHandshake(
|
||||
client=es,
|
||||
index_name=es_index,
|
||||
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
|
||||
)
|
||||
|
||||
return documents
|
||||
logger.info(
|
||||
f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
|
||||
)
|
||||
handshake.write(chunked_docs)
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,125 @@
|
|||
import requests
|
||||
from typing import Any, Callable
|
||||
|
||||
import numpy as np
|
||||
from chonkie.embeddings import BaseEmbeddings
|
||||
|
||||
from src.config import settings
|
||||
|
||||
|
||||
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model.

    Sends texts to Ollama's ``/api/embed`` HTTP endpoint and returns
    ``numpy`` float32 vectors in the shape chonkie's ``BaseEmbeddings``
    interface expects.
    """

    def __init__(
        self,
        model: str,
        base_url: str | None = None,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        """Configure the adapter.

        Args:
            model: Name of the Ollama embedding model to query.
            base_url: Base URL of the Ollama server. Defaults to
                ``settings.ollama_local_url``, resolved at call time rather
                than frozen into the signature at import time.
            timeout: HTTP timeout (seconds) per embedding request.
            truncate: Ask Ollama to truncate inputs that exceed the model's
                context window instead of erroring.
            keep_alive: How long Ollama keeps the model loaded after a call.
        """
        self.model = model
        # Resolve the default here, not in the signature: a signature default
        # of `settings.ollama_local_url` is evaluated once at import time and
        # would silently ignore any later change to settings.
        if base_url is None:
            base_url = settings.ollama_local_url
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality, discovered lazily from the first response.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors (may cost one API call)."""
        if self._dimension is None:
            # Lazy-load the dimension from a real embedding response.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single text and return a 1-D float32 vector."""
        embeddings = self._embed_api(text)
        vector = np.asarray(embeddings[0], dtype=np.float32)

        # Cache the dimension as a side effect so `dimension` needs no
        # extra request later.
        if self._dimension is None:
            self._dimension = int(vector.shape[0])

        return vector

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several texts in one request; returns one vector per text."""
        if not texts:
            return []

        embeddings = self._embed_api(texts)
        vectors = [np.asarray(vector, dtype=np.float32) for vector in embeddings]

        if vectors and self._dimension is None:
            self._dimension = int(vectors[0].shape[0])

        return vectors

    def count_tokens(self, text: str) -> int:
        """Count tokens by embedding ``text`` and reading ``prompt_eval_count``.

        NOTE(review): this costs a full embedding request per call; Ollama
        exposes no cheaper tokenize endpoint for arbitrary models.

        Raises:
            RuntimeError: If the response lacks ``prompt_eval_count``
                (consistent with the error style of ``_post_embed``).
        """
        payload = self._build_payload(text)
        response = self._post_embed(payload)
        if "prompt_eval_count" not in response:
            raise RuntimeError(
                "Ollama response did not include 'prompt_eval_count'. "
                f"Response keys: {list(response.keys())}"
            )
        return int(response["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Count tokens for each text individually.

        Ollama returns a single prompt_eval_count for the whole request,
        not one count per input item, so we compute them one at a time.
        """
        return [self.count_tokens(text) for text in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a token-counting callable (all chonkie needs here)."""
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True if an Ollama server answers at the configured URL."""
        try:
            response = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            response.raise_for_status()
            return True
        except requests.RequestException:
            return False

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings("
            f"model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r}"
            f")"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Build the JSON body for an ``/api/embed`` request."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST ``payload`` to ``/api/embed`` and return the parsed JSON.

        Raises:
            RuntimeError: On transport failure, a non-2xx status, or a
                response body without an ``embeddings`` field.
        """
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at "
                f"{self.base_url}/api/embed"
            ) from exc

        if "embeddings" not in data:
            raise RuntimeError(
                "Ollama response did not include 'embeddings'. "
                f"Response keys: {list(data.keys())}"
            )

        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Embed one text or a batch; returns a list of float vectors."""
        payload = self._build_payload(text_or_texts)
        data = self._post_embed(payload)
        return data["embeddings"]
29
uv.lock
29
uv.lock
|
|
@@ -250,7 +250,6 @@ name = "assistance-engine"
|
|||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "chonkie", extra = ["semantic"] },
|
||||
{ name = "grpcio" },
|
||||
{ name = "grpcio-reflection" },
|
||||
{ name = "grpcio-tools" },
|
||||
|
|
@@ -273,7 +272,9 @@ dependencies = [
|
|||
dev = [
|
||||
{ name = "beir" },
|
||||
{ name = "boto3" },
|
||||
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||
{ name = "evidently" },
|
||||
{ name = "flatbuffers" },
|
||||
{ name = "jupyter" },
|
||||
{ name = "langfuse" },
|
||||
{ name = "litellm" },
|
||||
|
|
@@ -288,7 +289,6 @@ dev = [
|
|||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||
|
|
@@ -311,7 +311,9 @@ requires-dist = [
|
|||
dev = [
|
||||
{ name = "beir", specifier = ">=2.2.0" },
|
||||
{ name = "boto3", specifier = ">=1.42.58" },
|
||||
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||
{ name = "evidently", specifier = ">=0.7.20" },
|
||||
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||
{ name = "langfuse", specifier = "<3" },
|
||||
{ name = "litellm", specifier = ">=1.82.0" },
|
||||
|
|
@@ -595,7 +597,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "chonkie"
|
||||
version = "1.5.6"
|
||||
version = "1.6.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "chonkie-core" },
|
||||
|
|
@@ -603,12 +605,15 @@ dependencies = [
|
|||
{ name = "tenacity" },
|
||||
{ name = "tqdm" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
elastic = [
|
||||
{ name = "elasticsearch" },
|
||||
]
|
||||
semantic = [
|
||||
{ name = "model2vec" },
|
||||
{ name = "tokenizers" },
|
||||
|
|
@@ -1061,6 +1066,14 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flatbuffers"
|
||||
version = "25.12.19"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fqdn"
|
||||
version = "1.5.1"
|
||||
|
|
@@ -3112,14 +3125,14 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "1.39.1"
|
||||
version = "1.40.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "protobuf" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
Loading…
Reference in New Issue