"""
avap_ingest.py v2.0

Stream chunks from a JSONL file, embed them with Ollama and bulk-index them
into Elasticsearch. Chunks that cannot be embedded or indexed are written to
a timestamped dead-letter file next to the input file.

Usage:

    # Ingest
    python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1

    # Delete the index and re-ingest from scratch
    python avap_ingest.py --chunks ingestion/chunks.jsonl --index avap-knowledge-v1 --delete

    # Reprocess only the failed chunks (DLQ)
    python avap_ingest.py --chunks ingestion/failed_chunks_<timestamp>.jsonl --index avap-knowledge-v1
"""

import os
import json
import time
import asyncio
import argparse
import traceback
from pathlib import Path
from datetime import datetime
from typing import AsyncGenerator

from elasticsearch import AsyncElasticsearch
import httpx
from tqdm import tqdm
from elasticsearch import helpers as es_helpers

DEFAULT_CHUNKS_PATH = "ingestion/chunks.jsonl"
DEFAULT_INDEX = "avap-knowledge-v1"
DEFAULT_ES_URL = "http://127.0.0.1:9200"
DEFAULT_OLLAMA_URL = "http://localhost:11434"
DEFAULT_OLLAMA_MODEL = "qwen3-0.6B-emb:latest"
DEFAULT_EMBEDDING_DIM = 1024
BATCH_SIZE_EMBED = 8
BATCH_SIZE_ES = 50
QUEUE_MAXSIZE = 5
MAX_RETRIES = 3
RETRY_DELAY = 2.0
OLLAMA_TIMEOUT = 120


def _unwrap_dlq_record(record):
    """Return the chunk carried by a dead-letter record, or *record* itself.

    ``DeadLetterQueue.write`` stores ``{"reason": ..., "chunk": {...}}``.
    Unwrapping here lets a dead-letter file be fed back through ``--chunks``.
    """
    if (
        isinstance(record, dict)
        and "reason" in record
        and "content" not in record
        and isinstance(record.get("chunk"), dict)
    ):
        return record["chunk"]
    return record


def iter_chunks_jsonl(path, batch_size):
    """Yield lists of up to *batch_size* chunk dicts read from a JSONL file.

    Blank lines are ignored. Lines that are not valid JSON, or whose JSON
    value is not an object, are reported on stdout and skipped. Dead-letter
    records are unwrapped to the chunk they carry.
    """
    batch = []
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Line {line_no}: {e}")
                continue
            chunk = _unwrap_dlq_record(record)
            if not isinstance(chunk, dict):
                print(f"Line {line_no}: expected a JSON object, got {type(chunk).__name__}")
                continue
            batch.append(chunk)
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch


def count_lines(path):
    """Return the number of non-blank lines in the file at *path*."""
    with open(path, encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())


def build_index_mapping(embedding_dim):
    """Return the index creation body (settings + mappings).

    *embedding_dim* is used as the ``dims`` of the ``embedding`` dense_vector
    field.
    """
    return {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "avap_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "stop"],
                    }
                }
            },
        },
        "mappings": {
            "properties": {
                "chunk_id": {"type": "keyword"},
                "content": {
                    "type": "text",
                    "analyzer": "avap_analyzer",
                },
                "embedding": {
                    "type": "dense_vector",
                    "dims": embedding_dim,
                    "index": True,
                    "similarity": "cosine",
                    "index_options": {
                        "type": "int8_hnsw",
                        "m": 16,
                        "ef_construction": 100,
                    },
                },
                "doc_type": {"type": "keyword"},
                "block_type": {"type": "keyword"},
                "section": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword"}},
                },
                "source_file": {"type": "keyword"},
                "start_line": {"type": "integer"},
                "end_line": {"type": "integer"},
                "token_estimate": {"type": "integer"},
                "metadata": {
                    "properties": {
                        "uses_orm": {"type": "boolean"},
                        "uses_http": {"type": "boolean"},
                        "uses_connector": {"type": "boolean"},
                        "uses_async": {"type": "boolean"},
                        "uses_crypto": {"type": "boolean"},
                        "uses_auth": {"type": "boolean"},
                        "uses_error_handling": {"type": "boolean"},
                        "uses_loop": {"type": "boolean"},
                        "uses_json": {"type": "boolean"},
                        "uses_list": {"type": "boolean"},
                        "uses_regex": {"type": "boolean"},
                        "uses_datetime": {"type": "boolean"},
                        "returns_result": {"type": "boolean"},
                        "registers_endpoint": {"type": "boolean"},
                        "has_overlap": {"type": "boolean"},
                        "complexity": {"type": "integer"},
                        "full_block_start": {"type": "integer"},
                        "full_block_end": {"type": "integer"},
                    }
                },
            }
        },
    }


class DeadLetterQueue:
    """Append-only JSONL file collecting the chunks that failed to ingest.

    The file is created lazily on the first ``write`` so that a clean run
    leaves nothing behind. Each line is ``{"reason": ..., "chunk": ...}``.
    """

    def __init__(self, base_path="ingestion"):
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.path = Path(base_path) / f"failed_chunks_{ts}.jsonl"
        self._handle = None
        self.count = 0

    def _open(self):
        if self._handle is None:
            self.path.parent.mkdir(parents=True, exist_ok=True)
            # Append rather than truncate: if the queue is written to again
            # after close(), records from earlier in the run must survive.
            self._handle = open(self.path, "a", encoding="utf-8")

    def write(self, chunk, reason) -> None:
        """Record *chunk* together with the *reason* it failed."""
        self._open()
        record = {"reason": reason, "chunk": chunk}
        self._handle.write(json.dumps(record, ensure_ascii=False) + "\n")
        # Flush per record so the file is usable even if the process dies.
        self._handle.flush()
        self.count += 1

    def close(self):
        """Close the underlying file if it was opened. Safe to call twice."""
        if self._handle:
            self._handle.close()
            self._handle = None

    def report(self):
        """Print how many chunks failed and where they were written."""
        if self.count:
            print(f"{self.count} Failed: {self.path}")
        else:
            print(" No failed chunks")


class OllamaAsyncEmbedder:
    """Async client for the Ollama ``/api/embed`` endpoint with retries."""

    def __init__(self, base_url, model, timeout=OLLAMA_TIMEOUT):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self._client = httpx.AsyncClient(timeout=timeout)

    async def probe_dimension(self):
        """Embed a fixed probe string and return the length of its vector."""
        vecs = await self._embed(["dimension probe"])
        return len(vecs[0])

    async def _embed(self, texts):
        """Return the embeddings for *texts*, retrying on any failure.

        Makes up to ``MAX_RETRIES`` attempts, sleeping ``RETRY_DELAY * attempt``
        seconds between them.

        Raises:
            RuntimeError: when every attempt failed (chained to the last
                error), or when ``MAX_RETRIES`` allows no attempt at all.
        """
        payload = {"model": self.model, "input": texts}
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = await self._client.post(
                    f"{self.base_url}/api/embed",
                    json=payload,
                )
                resp.raise_for_status()
                return resp.json()["embeddings"]
            except Exception as exc:
                if attempt >= MAX_RETRIES:
                    raise RuntimeError(f"Embeddings fail {MAX_RETRIES}: {exc}") from exc
                await asyncio.sleep(RETRY_DELAY * attempt)
        # Only reachable when MAX_RETRIES < 1: fail loudly instead of
        # returning an empty result that callers would index into.
        raise RuntimeError(f"Embeddings fail {MAX_RETRIES}: no attempt was made")

    async def embed_batch(self, chunks, dlq):
        """Embed *chunks* and return a list of ``(chunk, vector)`` pairs.

        The whole batch is tried in one request first. If that fails, or the
        number of vectors returned does not match the number of texts sent,
        each chunk is retried individually and the ones that still fail are
        written to *dlq*. Chunks without a ``content`` key go straight to
        *dlq*.
        """
        embeddable = []
        for chunk in chunks:
            if "content" in chunk:
                embeddable.append(chunk)
            else:
                dlq.write(chunk, "Missing 'content' field")
        if not embeddable:
            return []

        texts = [c["content"] for c in embeddable]
        try:
            vectors = await self._embed(texts)
            if len(vectors) != len(embeddable):
                # zip() would silently drop the unmatched chunks.
                raise ValueError(
                    f"expected {len(embeddable)} embeddings, got {len(vectors)}"
                )
            return list(zip(embeddable, vectors))
        except Exception as exc:
            print(exc)

        # Batch failed: isolate the offending chunk(s) one by one.
        results = []
        for chunk in embeddable:
            try:
                vecs = await self._embed([chunk["content"]])
                results.append((chunk, vecs[0]))
            except Exception as single_exc:
                dlq.write(chunk, f"Ollama embed failed: {single_exc}")
        return results

    async def close(self):
        """Close the underlying HTTP client."""
        await self._client.aclose()


async def producer(chunks_path, embedder, queue, dlq, batch_size, pbar):
    """Read, embed and enqueue chunk batches, then enqueue ``None``.

    ``None`` is the end-of-stream marker the consumer waits for. The progress
    bar advances by the size of every batch read, whether or not all of its
    chunks could be embedded.
    """
    for batch in iter_chunks_jsonl(chunks_path, batch_size):
        embedded = await embedder.embed_batch(batch, dlq)
        if embedded:
            await queue.put(embedded)
        pbar.update(len(batch))
    await queue.put(None)


def _build_action(index, chunk, vector):
    """Build the bulk ``index`` action for one embedded chunk."""
    return {
        "_index": index,
        "_id": chunk["chunk_id"],
        "_source": {
            "chunk_id": chunk["chunk_id"],
            "content": chunk["content"],
            "embedding": vector,
            "doc_type": chunk.get("doc_type", "unknown"),
            "block_type": chunk.get("block_type", ""),
            "section": chunk.get("section", ""),
            "source_file": chunk.get("source_file", ""),
            "start_line": chunk.get("start_line", 0),
            "end_line": chunk.get("end_line", 0),
            "token_estimate": chunk.get("token_estimate", 0),
            "metadata": chunk.get("metadata", {}),
        },
    }


async def consumer(queue, es_client, index, dlq, batch_size_es, stats):
    """Drain *queue* and bulk-index its items until ``None`` is received.

    Items are lists of ``(chunk, vector)`` pairs. They are buffered and sent
    once at least *batch_size_es* pairs are pending, and once more at end of
    stream. ``stats["ok"]`` and ``stats["errors"]`` are updated in place;
    chunks that fail are written to *dlq*.
    """
    buffer: list[tuple[dict, list[float]]] = []

    async def flush_buffer():
        if not buffer:
            return

        actions = []
        sent_chunks = []
        # Keyed by str(): the ids reported back by Elasticsearch are strings.
        # setdefault keeps the first chunk when an id repeats in one flush.
        chunk_by_id = {}
        for chunk, vector in buffer:
            if "chunk_id" not in chunk:
                dlq.write(chunk, "Missing 'chunk_id' field")
                stats["errors"] += 1
                continue
            actions.append(_build_action(index, chunk, vector))
            sent_chunks.append(chunk)
            chunk_by_id.setdefault(str(chunk["chunk_id"]), chunk)
        buffer.clear()

        if not actions:
            return

        try:
            ok, errors = await es_helpers.async_bulk(
                es_client,
                actions,
                raise_on_error=False,
                stats_only=False,
            )
        except Exception as exc:
            # The request as a whole failed: every chunk in it is suspect.
            for chunk in sent_chunks:
                dlq.write(chunk, f"ES bulk exception: {exc}")
            stats["errors"] += len(sent_chunks)
            return

        stats["ok"] += ok
        stats["errors"] += len(errors)
        for err in errors:
            detail = err.get("index", {})
            failed_id = detail.get("_id", "unknown")
            reason = str(detail.get("error", "unknown ES error"))
            chunk = chunk_by_id.get(str(failed_id))
            if chunk is not None:
                dlq.write(chunk, f"ES bulk error: {reason}")

    while True:
        item = await queue.get()
        if item is None:
            await flush_buffer()
            break
        buffer.extend(item)
        if len(buffer) >= batch_size_es:
            await flush_buffer()


async def build_es_client(url=DEFAULT_ES_URL):
    """Create an ``AsyncElasticsearch`` client for *url* and verify it responds.

    Raises:
        ConnectionError: if the cluster info request fails. The client is
            closed before raising so its connections are not leaked.
    """
    client = AsyncElasticsearch(
        url,
        verify_certs=False,
        request_timeout=60,
    )
    try:
        info = await client.info()
        print(f" Elasticsearch {info['version']['number']} en {url}")
    except Exception as e:
        await client.close()
        raise ConnectionError(f"Cant connet {url}. Error: {e}") from e
    return client


async def create_index(client: AsyncElasticsearch, index: str, embedding_dim: int,
                       delete_if_exists: bool = False) -> None:
    """Create *index* with the project mapping unless it already exists.

    With *delete_if_exists* an existing index is deleted first and then
    recreated; otherwise an existing index is reused untouched.
    """
    exists = await client.indices.exists(index=index)
    if exists and delete_if_exists:
        await client.indices.delete(index=index)
        exists = False
    if not exists:
        await client.indices.create(index=index, body=build_index_mapping(embedding_dim))
        print(f" · Index '{index}' created (dim={embedding_dim}, int8_hnsw, cosine).")
    else:
        print(f" · Inex '{index}' reused.")


# NOTE: an earlier variant of build_es_client() additionally sent the headers
# "Accept: application/vnd.elasticsearch+json; compatible-with=8" and
# "Content-Type: application/json" on the client, and reported connection
# failures as a version/compatibility error. Reinstate those headers if that
# compatibility mode is needed again.


async def run(args):
    """Run the action selected by *args*: probe the model or ingest chunks.

    For ingestion the embedding dimension reported by the model is checked
    against ``OLLAMA_EMBEDDING_DIM`` before the index is created or deleted,
    so a misconfiguration never touches an existing index. All clients, the
    dead-letter file and the progress bar are released on every exit path.
    """
    ollama_url = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL)
    ollama_model = os.environ.get("OLLAMA_MODEL", DEFAULT_OLLAMA_MODEL)
    embed_dim = int(os.environ.get("OLLAMA_EMBEDDING_DIM", DEFAULT_EMBEDDING_DIM))

    embedder = OllamaAsyncEmbedder(ollama_url, ollama_model)
    es_client = None
    dlq = None
    pbar = None
    try:
        if args.probe_dim:
            dim = await embedder.probe_dimension()
            print(f" Model dimensions: {dim}")
            return

        if not Path(args.chunks).exists():
            print(f"File Not Found: {args.chunks}")
            return

        total = count_lines(args.chunks)
        print(f" Total Chunks: {total}")

        print("\nConnecting to VectorDB...")
        es_client = await build_es_client()

        # Validate before create_index(): with --delete that call is
        # destructive and must not run on a dimension mismatch.
        print("\n Checking Model dimmensions...")
        actual_dim = await embedder.probe_dimension()
        if actual_dim != embed_dim:
            print(f" Real dimmension ({actual_dim}) != OLLAMA_EMBEDDING_DIM ({embed_dim})")
            return
        print(f" Dimmension: {actual_dim}")

        print(f"\nGenerating index '{args.index}'...")
        await create_index(es_client, args.index, embed_dim, delete_if_exists=args.delete)

        dlq = DeadLetterQueue(base_path=str(Path(args.chunks).parent))
        stats = {"ok": 0, "errors": 0}
        queue = asyncio.Queue(maxsize=QUEUE_MAXSIZE)

        print("\nAsync pipeline (Ollama <-> Elasticsearch)...\n")
        t0 = time.time()
        pbar = tqdm(total=total, desc=" Processing", unit="chunks")

        tasks = [
            asyncio.create_task(
                producer(args.chunks, embedder, queue, dlq, args.batch_embed, pbar)
            ),
            asyncio.create_task(
                consumer(queue, es_client, args.index, dlq, args.batch_es, stats)
            ),
        ]
        try:
            await asyncio.gather(*tasks)
        finally:
            # If one stage failed, stop the other before the clients it
            # uses are closed below. No-op when both finished normally.
            for task in tasks:
                if not task.done():
                    task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)

        pbar.close()
        elapsed = time.time() - t0
        dlq.close()

        print("RESULT")
        print("----------------")
        print(f"Chunks : {total}")
        print(f" -OK : {stats['ok']}")
        print(f" -Errors : {stats['errors']}")
        print(f" -Index Name: {args.index}")
        print(f" -Elapsed : {elapsed:.1f}s")
        print()
        dlq.report()
        print("----------------")
    finally:
        if pbar is not None:
            pbar.close()
        if dlq is not None:
            dlq.close()
        await embedder.close()
        if es_client is not None:
            await es_client.close()


def main():
    """Parse the command line, print the effective configuration and run."""
    parser = argparse.ArgumentParser(
        description="AVAP Ingestor"
    )
    parser.add_argument("--chunks", default=DEFAULT_CHUNKS_PATH,
                        help=f"JSONL Chunk File (default: {DEFAULT_CHUNKS_PATH})")
    parser.add_argument("--index", default=DEFAULT_INDEX,
                        help=f"Index Name (default: {DEFAULT_INDEX})")
    parser.add_argument("--delete", action="store_true",
                        help="Delete index before send")
    parser.add_argument("--probe-dim", action="store_true",
                        help="Check Model dimmension")
    parser.add_argument("--batch-embed", type=int, default=BATCH_SIZE_EMBED,
                        help=f"Chunks by Ollama call(default: {BATCH_SIZE_EMBED})")
    parser.add_argument("--batch-es", type=int, default=BATCH_SIZE_ES,
                        help=f"Docs by bulk ES (default: {BATCH_SIZE_ES})")
    args = parser.parse_args()

    print("----------------")
    print("AVAP INGESTOR")
    print("----------------")
    if not args.probe_dim:
        print(f" Chunks : {args.chunks}")
        print(f" INDEX ES : {args.index}")
    print(f" Ollama URL : {os.environ.get('OLLAMA_URL', DEFAULT_OLLAMA_URL)}")
    print(f" MODEL : {os.environ.get('OLLAMA_MODEL', DEFAULT_OLLAMA_MODEL)}")
    print(f" MODEL DIM : {os.environ.get('OLLAMA_EMBEDDING_DIM', DEFAULT_EMBEDDING_DIM)}")
    print()

    asyncio.run(run(args))


if __name__ == "__main__":
    main()