import os import time import json import logging from collections import defaultdict from pathlib import Path from typing import Optional from ragas import evaluate as ragas_evaluate from ragas.metrics import ( faithfulness, answer_relevancy, context_recall, context_precision,) from ragas.llms import LangchainLLMWrapper from ragas.embeddings import LangchainEmbeddingsWrapper from datasets import Dataset from langchain_anthropic import ChatAnthropic from ragas.run_config import RunConfig import asyncio import time class RateLimitedChatAnthropic(ChatAnthropic): # Ragas usa principalmente este método asíncrono internamente async def _agenerate(self, messages, stop=None, run_manager=None, **kwargs): await asyncio.sleep(3.0) # <-- PON AQUÍ LA PAUSA EN SEGUNDOS (ej: 3 segundos) return await super()._agenerate(messages, stop=stop, run_manager=run_manager, **kwargs) # Añadimos la pausa síncrona también por seguridad def _generate(self, messages, stop=None, run_manager=None, **kwargs): time.sleep(3.0) # <-- MISMA PAUSA AQUÍ return super()._generate(messages, stop=stop, run_manager=run_manager, **kwargs) logger = logging.getLogger(__name__) GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json" CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514") ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "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") K_RETRIEVE = 5 ANTHROPIC_AVAILABLE = True from elasticsearch import Elasticsearch from langchain_core.messages import SystemMessage, HumanMessage def retrieve_context( es_client, embeddings, question, index, k = K_RETRIEVE,): query_vector = None try: query_vector = embeddings.embed_query(question) except Exception as e: logger.warning(f"[eval] embed_query fails: {e}") bm25_hits = [] try: resp = es_client.search( index=index, body={ "size": k, "query": { "multi_match": { "query": question, "fields": ["content^2", "text^2"], "type": "best_fields", "fuzziness": "AUTO", } }, "_source": {"excludes": ["embedding"]}, } ) bm25_hits = resp["hits"]["hits"] except Exception as e: logger.warning(f"[eval] BM25 fails: {e}") knn_hits = [] if query_vector: try: resp = es_client.search( index=index, body={ "size": k, "knn": { "field": "embedding", "query_vector": query_vector, "k": k, "num_candidates": k * 5, }, "_source": {"excludes": ["embedding"]}, } ) knn_hits = resp["hits"]["hits"] except Exception as e: logger.warning(f"[eval] kNN falló: {e}") rrf_scores: dict[str, float] = defaultdict(float) hit_by_id: dict[str, dict] = {} for rank, hit in enumerate(bm25_hits): doc_id = hit["_id"] rrf_scores[doc_id] += 1.0 / (rank + 60) hit_by_id[doc_id] = hit for rank, hit in enumerate(knn_hits): doc_id = hit["_id"] rrf_scores[doc_id] += 1.0 / (rank + 60) if doc_id not in hit_by_id: hit_by_id[doc_id] = hit ranked = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:k] return [ hit_by_id[doc_id]["_source"].get("content") or hit_by_id[doc_id]["_source"].get("text", "") for doc_id, _ in ranked if ( hit_by_id[doc_id]["_source"].get("content") or hit_by_id[doc_id]["_source"].get("text", "") ).strip() ] def generate_answer(llm, question: str, contexts: list[str]) -> str: try: from prompts import GENERATE_PROMPT context_text = "\n\n".join( f"[{i+1}] {ctx}" for i, ctx in enumerate(contexts) ) prompt = SystemMessage( content=GENERATE_PROMPT.content.format(context=context_text) ) resp = llm.invoke([prompt, HumanMessage(content=question)]) return resp.content.strip() except Exception as e: logger.warning(f"[eval] generate_answer fails: {e}") return "" def run_evaluation( es_client, llm, embeddings, index_name, category = None, limit = None,): if not ANTHROPIC_AVAILABLE: return {"error": "langchain-anthropic no instalado. pip install langchain-anthropic"} if not ANTHROPIC_API_KEY: return {"error": "ANTHROPIC_API_KEY no configurada en .env"} if not GOLDEN_DATASET_PATH.exists(): return {"error": f"Golden dataset no encontrado en {GOLDEN_DATASET_PATH}"} questions = json.loads(GOLDEN_DATASET_PATH.read_text(encoding="utf-8")) if category: questions = [q for q in questions if q.get("category") == category] if limit: questions = questions[:limit] if not questions: return {"error": "NO QUESTIONS WITH THIS FILTERS"} logger.info(f"[eval] makind: {len(questions)} questions, index={index_name}") claude_judge = RateLimitedChatAnthropic( model=CLAUDE_MODEL, api_key=ANTHROPIC_API_KEY, temperature=0, max_tokens=2048, ) rows = {"question": [], "answer": [], "contexts": [], "ground_truth": []} details = [] t_start = time.time() for item in questions: q_id = item["id"] question = item["question"] gt = item["ground_truth"] logger.info(f"[eval] {q_id}: {question[:60]}") contexts = retrieve_context(es_client, embeddings, question, index_name) if not contexts: logger.warning(f"[eval] No context for {q_id} — skipping") continue answer = generate_answer(llm, question, contexts) if not answer: logger.warning(f"[eval] No answers for {q_id} — skipping") continue rows["question"].append(question) rows["answer"].append(answer) rows["contexts"].append(contexts) rows["ground_truth"].append(gt) details.append({ "id": q_id, "category": item.get("category", ""), "question": question, "answer_preview": answer[:300], "n_chunks": len(contexts), }) time.sleep(2.5) if not rows["question"]: return {"error": "NO SAMPLES GENETARED"} dataset = Dataset.from_dict(rows) ragas_llm = LangchainLLMWrapper(claude_judge) ragas_emb = LangchainEmbeddingsWrapper(embeddings) metrics = [faithfulness, answer_relevancy, context_recall, context_precision] for metric in metrics: metric.llm = ragas_llm if hasattr(metric, "embeddings"): metric.embeddings = ragas_emb time.sleep(5) logger.info("[eval] JUDGING BY CLAUDE...") run_config = RunConfig(max_workers=1) result = ragas_evaluate(dataset, metrics=metrics, run_config=run_config) elapsed = time.time() - t_start # RAGAS >= 0.2 returns an EvaluationResult object, not a dict. # Extract per-metric means from the underlying DataFrame. try: df = result.to_pandas() def _mean(col): return round(float(df[col].dropna().mean()), 4) if col in df.columns else 0.0 except Exception: # Fallback: try legacy dict-style access df = None def _mean(col): try: return round(float(result[col]), 4) except Exception: return 0.0 scores = { "faithfulness": _mean("faithfulness"), "answer_relevancy": _mean("answer_relevancy"), "context_recall": _mean("context_recall"), "context_precision": _mean("context_precision"), } valid_scores = [v for v in scores.values() if v > 0] global_score = round(sum(valid_scores) / len(valid_scores), 4) if valid_scores else 0.0 verdict = ( "EXCELLENT" if global_score >= 0.8 else "ACCEPTABLE" if global_score >= 0.6 else "INSUFFICIENT" ) logger.info(f"[eval] FINISHED — global={global_score} verdict={verdict} " f"elapsed={elapsed:.0f}s") return { "status": "ok", "questions_evaluated": len(rows["question"]), "elapsed_seconds": round(elapsed, 1), "judge_model": CLAUDE_MODEL, "index": index_name, "category_filter": category or "all", "scores": scores, "global_score": global_score, "verdict": verdict, "details": details, }