"""avap_ingest.py v2.0

Usage:

    # Ingest
    python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1

    # Delete the index and re-ingest from scratch
    python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1 --delete

    # Reprocess only the failed chunks (DLQ)
    python avap_ingest.py --chunks ingestion/failed_chunks.jsonl --index avap-knowledge-v1
"""
|
|
|
|
import os
|
|
import json
|
|
import time
|
|
import asyncio
|
|
import argparse
|
|
import traceback
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import AsyncGenerator
|
|
from elasticsearch import AsyncElasticsearch
|
|
import httpx
|
|
from tqdm import tqdm
|
|
from elasticsearch import helpers as es_helpers
|
|
|
|
|
|
# --- Defaults (overridable via CLI flags / environment variables) ----------
DEFAULT_CHUNKS_PATH = "ingestion/chunks.jsonl"
DEFAULT_INDEX = "avap-knowledge-v1"
DEFAULT_OLLAMA_URL = "http://localhost:11434"
DEFAULT_OLLAMA_MODEL = "qwen3-0.6B-emb:latest"
DEFAULT_EMBEDDING_DIM = 1024

# --- Pipeline tuning -------------------------------------------------------
BATCH_SIZE_EMBED = 8     # chunks per Ollama embedding request
BATCH_SIZE_ES = 50       # documents per Elasticsearch bulk request
QUEUE_MAXSIZE = 5        # bounded producer/consumer queue (backpressure)
MAX_RETRIES = 3          # attempts per embedding request
RETRY_DELAY = 2.0        # base seconds between retries (linear backoff)
OLLAMA_TIMEOUT = 120     # HTTP timeout for Ollama calls, in seconds
|
|
|
|
|
|
def iter_chunks_jsonl(path, batch_size):
    """Yield lists of up to *batch_size* chunk dicts read from a JSONL file.

    Blank lines are skipped. Lines that are not valid JSON are reported
    (with file and line number) and skipped, so one corrupt record does
    not abort the whole ingestion. A final partial batch is yielded last.
    """
    batch = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            # Keep the try body minimal: only the parse can raise here
            # (the original also wrapped append/yield in the same try).
            try:
                chunk = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON at {path}:{lineno}: {e}")
                continue
            batch.append(chunk)
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch
|
|
|
|
|
|
def count_lines(path):
    """Return the number of non-blank lines in the file at *path*."""
    with open(path, encoding="utf-8") as f:
        return sum(1 for row in f if row.strip())
|
|
|
|
|
|
def build_index_mapping(embedding_dim):
    """Return the index body (settings + mappings) for the AVAP knowledge index.

    Single shard with no replicas (single-node setup); a custom analyzer for
    BM25 text search on `content`; and a `dense_vector` field for kNN search
    with int8-HNSW quantization and cosine similarity.

    Args:
        embedding_dim: dimensionality of the embedding vectors; must match
            what the embedding model actually produces.
    """
    return {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "avap_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "stop"]
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "chunk_id": {"type": "keyword"},
                "content": {
                    "type": "text",
                    "analyzer": "avap_analyzer"
                },
                # kNN vector field; `dims` must equal the model's output size.
                "embedding": {
                    "type": "dense_vector",
                    "dims": embedding_dim,
                    "index": True,
                    "similarity": "cosine",
                    "index_options": {
                        "type": "int8_hnsw",
                        "m": 16,
                        "ef_construction": 100
                    }
                },
                "doc_type": {"type": "keyword"},
                "block_type": {"type": "keyword"},
                # Searchable as text, exact-filterable via `section.keyword`.
                "section": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword"}}
                },
                "source_file": {"type": "keyword"},
                "start_line": {"type": "integer"},
                "end_line": {"type": "integer"},
                "token_estimate": {"type": "integer"},
                # Per-chunk code-analysis flags (presumably produced by the
                # chunker that writes chunks.jsonl — not visible here).
                "metadata": {
                    "properties": {
                        "uses_orm": {"type": "boolean"},
                        "uses_http": {"type": "boolean"},
                        "uses_connector": {"type": "boolean"},
                        "uses_async": {"type": "boolean"},
                        "uses_crypto": {"type": "boolean"},
                        "uses_auth": {"type": "boolean"},
                        "uses_error_handling": {"type": "boolean"},
                        "uses_loop": {"type": "boolean"},
                        "uses_json": {"type": "boolean"},
                        "uses_list": {"type": "boolean"},
                        "uses_regex": {"type": "boolean"},
                        "uses_datetime": {"type": "boolean"},
                        "returns_result": {"type": "boolean"},
                        "registers_endpoint": {"type": "boolean"},
                        "has_overlap": {"type": "boolean"},
                        "complexity": {"type": "integer"},
                        "full_block_start": {"type": "integer"},
                        "full_block_end": {"type": "integer"},
                    }
                }
            }
        }
    }
|
|
|
|
|
|
class DeadLetterQueue:
    """Append-only JSONL sink for chunks that failed embedding or indexing.

    The output file (timestamped, under *base_path*) is created lazily on
    the first write, so a clean run leaves no empty failure file behind.
    """

    def __init__(self, base_path="ingestion"):
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.path = Path(base_path) / f"failed_chunks_{stamp}.jsonl"
        self._handle = None
        self.count = 0

    def _open(self):
        # Lazily create the parent directory and the file on first use.
        if self._handle is not None:
            return
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self._handle = open(self.path, "w", encoding="utf-8")

    def write(self, chunk, reason) -> None:
        # Store the failure reason alongside the original chunk so failed
        # records can be re-ingested later by re-running this script.
        self._open()
        line = json.dumps({"reason": reason, "chunk": chunk}, ensure_ascii=False)
        self._handle.write(line + "\n")
        self._handle.flush()  # flush per record: survive a mid-run crash
        self.count += 1

    def close(self):
        if self._handle is None:
            return
        self._handle.close()
        self._handle = None

    def report(self):
        # Summarize failures at the end of the run.
        if self.count:
            print(f"{self.count} Failed: {self.path}")
        else:
            print(" No failed chunks")
|
|
|
|
class OllamaAsyncEmbedder:
    """Async client for the Ollama /api/embed endpoint with retry + fallback.

    Batch requests that fail after MAX_RETRIES attempts are retried one
    chunk at a time, so a single bad chunk cannot sink the whole batch;
    chunks that still fail are written to the dead-letter queue.
    """

    def __init__(self, base_url, model, timeout=OLLAMA_TIMEOUT):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self._client = httpx.AsyncClient(timeout=timeout)

    async def probe_dimension(self):
        """Embed a tiny probe text and return the vector dimensionality."""
        vecs = await self._embed(["dimension probe"])
        return len(vecs[0])

    async def _embed(self, texts):
        """POST *texts* to /api/embed, retrying with linear backoff.

        Returns the list of embedding vectors (one per input text).
        Raises RuntimeError (with the last error chained as the cause)
        once MAX_RETRIES attempts are exhausted.
        """
        payload = {"model": self.model, "input": texts}
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = await self._client.post(
                    f"{self.base_url}/api/embed",
                    json=payload
                )
                resp.raise_for_status()
                return resp.json()["embeddings"]
            except Exception as exc:
                if attempt >= MAX_RETRIES:
                    raise RuntimeError(f"Embeddings fail {MAX_RETRIES}: {exc}") from exc
                # Linear backoff: RETRY_DELAY, 2*RETRY_DELAY, ...
                await asyncio.sleep(RETRY_DELAY * attempt)
        # Every loop iteration either returns or raises, so there is no
        # fall-through here (the original had an unreachable `return []`).

    async def embed_batch(self, chunks, dlq):
        """Embed a batch of chunks; return a list of (chunk, vector) pairs.

        On whole-batch failure, falls back to embedding chunk-by-chunk and
        writes the chunks that still fail to *dlq*. Never raises: partial
        results are returned instead.
        """
        texts = [c["content"] for c in chunks]
        try:
            vectors = await self._embed(texts)
            return list(zip(chunks, vectors))
        except Exception as exc:
            print(exc)
            results = []
            for chunk in chunks:
                try:
                    vecs = await self._embed([chunk["content"]])
                    results.append((chunk, vecs[0]))
                except Exception as single_exc:
                    dlq.write(chunk, f"Ollama embed failed: {single_exc}")
            return results

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self._client.aclose()
|
|
|
|
|
|
async def producer(chunks_path, embedder, queue, dlq, batch_size, pbar):
    """Read chunk batches from disk, embed them, and feed the ES queue.

    Pushes a `None` sentinel onto *queue* once the input is exhausted so
    the consumer knows to flush its buffer and stop.
    """
    for chunk_batch in iter_chunks_jsonl(chunks_path, batch_size):
        pairs = await embedder.embed_batch(chunk_batch, dlq)
        if pairs:
            await queue.put(pairs)
        # Progress advances by chunks read, even if some failed to embed.
        pbar.update(len(chunk_batch))
    await queue.put(None)
|
|
|
|
|
|
async def consumer(queue, es_client, index, dlq, batch_size_es, stats):
    """Drain (chunk, vector) pairs from *queue* and bulk-index them into ES.

    Pairs accumulate in a local buffer that is flushed to Elasticsearch
    once it reaches *batch_size_es* docs, and once more when the `None`
    sentinel arrives. Per-document failures are written to *dlq*; the
    "ok"/"errors" counters in *stats* are updated in place.
    """
    buffer: list[tuple[dict, list[float]]] = []

    async def flush_buffer():
        # Bulk-index the current buffer. Never propagates an exception:
        # any failure is recorded in dlq/stats and the buffer is cleared.
        if not buffer:
            return
        actions = [
            {
                "_index": index,
                # Using chunk_id as _id makes re-ingestion idempotent
                # (re-running overwrites instead of duplicating docs).
                "_id": chunk["chunk_id"],
                "_source": {
                    "chunk_id": chunk["chunk_id"],
                    "content": chunk["content"],
                    "embedding": vector,
                    "doc_type": chunk.get("doc_type", "unknown"),
                    "block_type": chunk.get("block_type", ""),
                    "section": chunk.get("section", ""),
                    "source_file": chunk.get("source_file", ""),
                    "start_line": chunk.get("start_line", 0),
                    "end_line": chunk.get("end_line", 0),
                    "token_estimate": chunk.get("token_estimate", 0),
                    "metadata": chunk.get("metadata", {}),
                }
            }
            for chunk, vector in buffer
        ]
        try:
            # raise_on_error=False + stats_only=False: get back the count of
            # successes and the list of per-document errors instead of raising.
            ok, errors = await es_helpers.async_bulk(
                es_client, actions,
                raise_on_error=False,
                stats_only=False
            )
            stats["ok"] += ok
            stats["errors"] += len(errors)
            for err in errors:
                failed_id = err.get("index", {}).get("_id", "unknown")
                reason = str(err.get("index", {}).get("error", "unknown ES error"))
                # Linear scan to map the failed _id back to its chunk
                # (buffers are small, bounded by batch_size_es).
                for chunk, _ in buffer:
                    if chunk["chunk_id"] == failed_id:
                        dlq.write(chunk, f"ES bulk error: {reason}")
                        break
        except Exception as exc:
            # Whole-bulk failure (e.g. connection dropped): every buffered
            # chunk goes to the DLQ for later reprocessing.
            for chunk, _ in buffer:
                dlq.write(chunk, f"ES bulk exception: {exc}")
            stats["errors"] += len(buffer)

        buffer.clear()

    while True:
        item = await queue.get()
        if item is None:
            # Producer sentinel: flush whatever is left and stop.
            await flush_buffer()
            break
        buffer.extend(item)
        if len(buffer) >= batch_size_es:
            await flush_buffer()
|
|
|
|
|
|
async def build_es_client():
    """Create an AsyncElasticsearch client and verify connectivity.

    Returns the connected client. Raises ConnectionError (with the
    original failure chained as the cause) if the cluster does not
    answer; in that case the client is closed first so the underlying
    HTTP session is not leaked.
    """
    url = "http://127.0.0.1:9200"

    client = AsyncElasticsearch(
        url,
        verify_certs=False,
        request_timeout=60
    )

    try:
        info = await client.info()
        print(f" Elasticsearch {info['version']['number']} en {url}")
    except Exception as e:
        # Close before bailing out — otherwise the aiohttp session leaks.
        await client.close()
        raise ConnectionError(f"Cannot connect to {url}. Error: {e}") from e

    return client
|
|
|
|
|
|
async def create_index(client: AsyncElasticsearch, index: str,
                       embedding_dim: int,
                       delete_if_exists: bool = False) -> None:
    """Ensure *index* exists, optionally recreating it from scratch.

    Args:
        client: connected async Elasticsearch client.
        index: target index name.
        embedding_dim: dims for the dense_vector mapping.
        delete_if_exists: when True, drop any existing index first
            (full re-ingest); otherwise an existing index is reused.
    """
    exists = await client.indices.exists(index=index)
    if exists and delete_if_exists:
        await client.indices.delete(index=index)
        exists = False
    if not exists:
        await client.indices.create(index=index, body=build_index_mapping(embedding_dim))
        print(f" · Index '{index}' created (dim={embedding_dim}, int8_hnsw, cosine).")
    else:
        # Fixed typo in the user-facing message ("Inex" -> "Index").
        print(f" · Index '{index}' reused.")
|
|
|
|
|
|
"""
|
|
async def build_es_client():
|
|
url = "http://127.0.0.1:9200"
|
|
|
|
client = AsyncElasticsearch(
|
|
url,
|
|
verify_certs=False,
|
|
request_timeout=60,
|
|
headers={
|
|
"Accept": "application/vnd.elasticsearch+json; compatible-with=8",
|
|
"Content-Type": "application/json"
|
|
}
|
|
)
|
|
client.options(headers={"Accept": "application/vnd.elasticsearch+json; compatible-with=8"})
|
|
|
|
try:
|
|
await client.info()
|
|
except Exception as e:
|
|
raise ConnectionError(f"Error de versión/compatibilidad: {e}")
|
|
return client
|
|
"""
|
|
|
|
async def run(args):
    """Top-level async pipeline: validate setup, create the index, ingest.

    Steps: read env config -> (optionally just probe the model dimension
    and exit) -> check the input file -> connect to Elasticsearch ->
    create/reuse the index -> verify the model's real embedding dimension
    matches the configured one -> run producer/consumer concurrently ->
    print a summary and the DLQ report.
    """
    # Runtime configuration from the environment, falling back to defaults.
    ollama_url = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL)
    ollama_model = os.environ.get("OLLAMA_MODEL", DEFAULT_OLLAMA_MODEL)
    embed_dim = int(os.environ.get("OLLAMA_EMBEDDING_DIM", DEFAULT_EMBEDDING_DIM))

    embedder = OllamaAsyncEmbedder(ollama_url, ollama_model)

    # --probe-dim: only report the model's vector size, then exit.
    if args.probe_dim:
        dim = await embedder.probe_dimension()
        print(f" Model dimensions: {dim}")
        await embedder.close()
        return

    if not Path(args.chunks).exists():
        print(f"File Not Found: {args.chunks}")
        await embedder.close()
        return

    total = count_lines(args.chunks)
    print(f" Total Chunks: {total}")

    print("\nConnecting to VectorDB...")
    es_client = await build_es_client()

    print(f"\nGenerating index '{args.index}'...")
    await create_index(es_client, args.index, embed_dim,
                       delete_if_exists=args.delete)

    # Abort early if the model's real output dimension disagrees with the
    # dense_vector mapping — indexing would fail on every document.
    print("\n Checking Model dimmensions...")
    actual_dim = await embedder.probe_dimension()
    if actual_dim != embed_dim:
        print(f" Real dimmension ({actual_dim}) != OLLAMA_EMBEDDING_DIM ({embed_dim})")
        await embedder.close()
        await es_client.close()
        return
    print(f" Dimmension: {actual_dim}")

    # DLQ lives next to the input file; the bounded queue applies
    # backpressure between embedding (producer) and indexing (consumer).
    dlq = DeadLetterQueue(base_path=str(Path(args.chunks).parent))
    stats = {"ok": 0, "errors": 0}
    queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)

    print(f"\nAsync pipeline (Ollama <-> Elasticsearch)...\n")
    t0 = time.time()
    pbar = tqdm(total=total, desc=" Processing", unit="chunks")

    # Producer embeds while consumer bulk-indexes, concurrently.
    await asyncio.gather(
        producer(args.chunks, embedder, queue, dlq,
                 args.batch_embed, pbar),
        consumer(queue, es_client, args.index, dlq,
                 args.batch_es, stats),
    )

    pbar.close()
    # NOTE(review): `elapsed` is computed but never shown in the summary.
    elapsed = time.time() - t0

    await embedder.close()
    await es_client.close()
    dlq.close()

    print("RESULT")
    print("----------------")
    print(f"Chunks : {total}")
    print(f" -OK : {stats['ok']}")
    print(f" -Errors : {stats['errors']}")
    print(f" -Index Name: {args.index}")
    print()
    dlq.report()
    print("----------------")
|
|
|
|
def main():
    """Parse CLI arguments, print the run banner, and launch the pipeline."""
    parser = argparse.ArgumentParser(
        description="AVAP Ingestor"
    )
    parser.add_argument("--chunks", default=DEFAULT_CHUNKS_PATH,
                        help=f"JSONL Chunk File (default: {DEFAULT_CHUNKS_PATH})")
    parser.add_argument("--index", default=DEFAULT_INDEX,
                        help=f"Index Name (default: {DEFAULT_INDEX})")
    parser.add_argument("--delete", action="store_true",
                        help="Delete index before send")
    # Help-text typos fixed: "dimmension" -> "dimension", "call(default" spacing.
    parser.add_argument("--probe-dim", action="store_true",
                        help="Check Model dimension")
    parser.add_argument("--batch-embed", type=int, default=BATCH_SIZE_EMBED,
                        help=f"Chunks by Ollama call (default: {BATCH_SIZE_EMBED})")
    parser.add_argument("--batch-es", type=int, default=BATCH_SIZE_ES,
                        help=f"Docs by bulk ES (default: {BATCH_SIZE_ES})")
    args = parser.parse_args()

    print("----------------")
    print("AVAP INGESTOR")
    print("----------------")
    # Full banner is irrelevant when only probing the model dimension.
    if not args.probe_dim:
        print(f" Chunks : {args.chunks}")
        print(f" INDEX ES : {args.index}")
        print(f" Ollama URL : {os.environ.get('OLLAMA_URL', DEFAULT_OLLAMA_URL)}")
        print(f" MODEL : {os.environ.get('OLLAMA_MODEL', DEFAULT_OLLAMA_MODEL)}")
        print(f" MODEL DIM : {os.environ.get('OLLAMA_EMBEDDING_DIM', DEFAULT_EMBEDDING_DIM)}")
        print()

    asyncio.run(run(args))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main() |