From 6ee85838949de6f31d2db8db26d0fcad3589b66e Mon Sep 17 00:00:00 2001 From: rafa-ruiz Date: Tue, 31 Mar 2026 01:40:23 -0700 Subject: [PATCH 1/3] update --- Docker/src/evaluate.py | 25 ++++++++++++++++++++----- Docker/src/golden_dataset.json | 32 ++++++++++++++++++++++++++++++++ docs/LRM/avap_samples.zip | Bin 0 -> 19620 bytes 3 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 Docker/src/golden_dataset.json create mode 100644 docs/LRM/avap_samples.zip diff --git a/Docker/src/evaluate.py b/Docker/src/evaluate.py index 791f9fb..82465ba 100644 --- a/Docker/src/evaluate.py +++ b/Docker/src/evaluate.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json" CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514") -ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") +ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "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") K_RETRIEVE = 5 @@ -197,11 +197,26 @@ def run_evaluation( es_client, llm, embeddings, index_name, category = None, lim elapsed = time.time() - t_start + # RAGAS >= 0.2 returns an EvaluationResult object, not a dict. + # Extract per-metric means from the underlying DataFrame. + try: + df = result.to_pandas() + def _mean(col): + return round(float(df[col].dropna().mean()), 4) if col in df.columns else 0.0 + except Exception: + # Fallback: try legacy dict-style access + df = None + def _mean(col): + try: + return round(float(result[col]), 4) + except Exception: + return 0.0 + scores = { - "faithfulness": round(float(result.get("faithfulness", 0)), 4), - "answer_relevancy": round(float(result.get("answer_relevancy", 0)), 4), - "context_recall": round(float(result.get("context_recall", 0)), 4), - "context_precision": round(float(result.get("context_precision", 0)), 4), + "faithfulness": _mean("faithfulness"), + "answer_relevancy": _mean("answer_relevancy"), + "context_recall": _mean("context_recall"), + "context_precision": _mean("context_precision"), } valid_scores = [v for v in scores.values() if v > 0] diff --git a/Docker/src/golden_dataset.json b/Docker/src/golden_dataset.json new file mode 100644 index 0000000..e92d6da --- /dev/null +++ b/Docker/src/golden_dataset.json @@ -0,0 +1,32 @@ +[ + { + "id": "GD-001", + "category": "RETRIEVAL", + "question": "What is AVAP and what is it designed for?", + "ground_truth": "AVAP (Advanced Virtual API Programming) is a Turing-complete Domain-Specific Language (DSL) architecturally designed for the secure, concurrent, and deterministic orchestration of microservices and HTTP I/O. It is not a general-purpose language; its hybrid engine and strict grammar are optimized for fast processing of HTTP transactions, in-memory data manipulation, and interaction with external connectors. AVAP does not have internal print commands — all data output is performed through the HTTP interface using commands like addResult()." + }, + { + "id": "GD-002", + "category": "RETRIEVAL", + "question": "How does AVAP handle conditional logic? What commands are used and how are blocks closed?", + "ground_truth": "AVAP uses a mixed structural grammar for conditional logic, combining keyword fluidity with strict mathematical closures. The if() / else() / end() structure evaluates a logical or comparison expression. Every conditional block requires a mandatory end() closing statement. The if() command compares two values using a comparator operator (e.g., '==', '!=', '>', '<', '>=', '<='). An optional else() block handles the false branch. Example: if(saldo, 0, \">\") executes the true branch when the variable 'saldo' is greater than zero, otherwise the else() block runs, and end() closes the structure." + }, + { + "id": "GD-003", + "category": "CODE_GENERATION", + "question": "Write an AVAP script that reads a 'password' parameter, generates a SHA-256 hash of it, and returns the hash.", + "ground_truth": "The following AVAP script reads a 'password' query parameter, hashes it using SHA-256 via encodeSHA256(), and exposes the result via addResult():\n\naddParam(\"password\", password)\nencodeSHA256(password, hashed_password)\naddResult(hashed_password)\n\nKey commands used:\n- addParam(\"password\", password): reads the 'password' HTTP parameter into the variable 'password'.\n- encodeSHA256(password, hashed_password): computes the SHA-256 hash of the input and stores the 64-character hex digest in 'hashed_password'.\n- addResult(hashed_password): adds 'hashed_password' to the HTTP JSON response body." + }, + { + "id": "GD-004", + "category": "CODE_GENERATION", + "question": "Show an AVAP script that loops from 1 to 5, builds a JSON object with each iteration index as a key, and returns it.", + "ground_truth": "The following AVAP script iterates from 1 to 5 using startLoop/endLoop, dynamically builds a JSON object using AddvariableToJSON() on each iteration, and returns the result:\n\naddVar(mi_json, \"{}\")\nstartLoop(i, 1, 5)\n item = \"item_%s\" % i\n AddvariableToJSON(item, \"valor_generado\", mi_json)\nendLoop()\naddResult(mi_json)\n\nKey commands used:\n- addVar(mi_json, \"{}\"): initializes an empty JSON object.\n- startLoop(i, 1, 5) / endLoop(): iterates the variable 'i' from 1 to 5 inclusive.\n- AddvariableToJSON(item, \"valor_generado\", mi_json): inserts each generated key-value pair into the JSON object.\n- addResult(mi_json): exposes the final JSON in the HTTP response." + }, + { + "id": "GD-005", + "category": "RETRIEVAL", + "question": "How does AVAP support external HTTP calls? What commands are available and how is timeout handled?", + "ground_truth": "AVAP provides two commands for making external HTTP calls: RequestPost and RequestGet. To avoid blocking threads due to network latency, AVAP requires a mandatory timeout parameter (in milliseconds) for both commands. If the timeout is exceeded, the destination variable receives None. RequestPost(url, querystring, headers, body, destino, timeout) executes an HTTP POST and stores the response in 'destino'. RequestGet(url, querystring, headers, destino, timeout) executes an HTTP GET similarly. Both commands are part of AVAP's Section V (Third-Party Connectors and External HTTP Requests) and allow calling external APIs without additional drivers." + } +] diff --git a/docs/LRM/avap_samples.zip b/docs/LRM/avap_samples.zip new file mode 100644 index 0000000000000000000000000000000000000000..5f05e6ef98ecf4bcba6d70b75580fd92b11921c5 GIT binary patch literal 19620 zcmc&+d0dlM)=pR@f*`Vs3Mxw=gk2r07E~lFgs{1wA%qA))&x}CL1YAkQ9wXw3ssc5 z)Vfz47oai)BtQ}B(&}dsakTxcOBHFS&UbIH<=(f15}5g-KYsqB=bY!9bI(2ZoadFt z*3>c~jRU`Zwl9kyA-^n0HYA};5|bztNs~eN&c4f=iE$jesWOUS=Eps=ehE!fTGCu$_i~EPh9%h%Zhpxr3EEW)k6$Y; zuIlXW_`ulU#3%L&`_3Nahs*)C1rBSUol-ZeV5&MC-Cuw^+7JMVN)yG41?~cOH>G)N z{|V|{Ce7)2Q=4mhW2bwkyH|S;e_LitVP}N)3T->1DW5GrR$X;uw_a%~he>7AnmL?i zra@C5<#pc;67@!qE4!u1H8(I&&URsOn!;~3$#=CmwYJ^pVaeI#Z}vCsEh{uGULH{% zyq{d!qoD{%`r)_i1Q3pFEF?rEFlCU;R{@kd;*eR4$HDsdIK(F<#z<1rq6N`nL9{S6 zNv1R?#U}@YGJn@e$`@|>i`Q-5F>A$rTkfN`88ydkx9{Y-SscAZ`RV1+b2D{k-`#lg z<*MwpmTzhs?nWe^e)Tl$*6Ae`QHfV7c|N8OStppm0Uh%4NXK>g(_IA}@7*^^w)YvI zv?1BMckWWw4Oc(so{}vS$TdH;Jox0vzcn^1f@8KOWEY) zq2$9n6A9x*Y4J%gMB??y5~-s4D~*$T@j#sJfuhe2bV*^-z=3GtcJcTfK_}ci-QC)Q zcoc&aofmv(ibtNY_XXo%ekMPUEn|6AO;}9P8rHu+r@hjwRYN1%n4vm+{QiU=>LRx&P zBspHJyu#$VR7xnIUBTSTkaJsFs9$IB$|D{hw#ztp*l%wnv$CbzIN#c4k45DNf{@~J z!;szY>iBj#S>fqYSxW1Bz|%ok3KI==R_bY$ZB~chJ$=_;?V&mJds0HOaG*j6WRe8n zZ>8CMYR5@McZj*0*UW5Thl51^s)x0BqNj$Ql|(8&B_%!}XA-EC&nq_3Le1Csyc*P#&d~MrX>oA>GYQfx6FEUgy7L zy)ez?-^eib%c1sDwEO+SC`TGtlp~b%$ONQu_u&pS?)D!}oVo-!oP#-?Xs$ic>7%k- z9gbhK4|n{`0msWyrD-C?tLQhs!&;sc6(>$jQhEZuxquWLCauY}AW%`unJ^G6vYum# z-J$9%J2T15M>9(fK5=yPHN93UHgl9n|8l6h*v?FBZ~w45yJw^JFTd1v9<-@AbYORd zWo76GmFATt`%23W=y`ZLo;~kXvsg29;6xlVeUK&ufyu&xLxkk|hv19}ojRf<_rs&~ z?s%jMqm!gCFtA1?h*PD*^qV4w!Q;Umylg=`v#YI%Mu(TJTo3_y^JLegc zRTV6hMy+D5W<}-NhexqkytoueK#op?zT#3SYyNriLg0Bm=6#~6e>@tav05EYpO=L@ z-FDz;tQAXTa9ofoNf3jZI3ZaOFOHE!4RbTRLRv^?IcNf@3x_Qk$;WpdD z)626Xh{xxC6)x`z&kyMc0&OLO&244uYI6iPHh5qSc0Xo13inrCypRnmQvl|EqS3TJ zB>(8`S4R*W_;?UtcPWmSfz}=PJ~3J-Ra%XX%o`%%9Rpf&Tp6?uatF1)JYt{3Dzc;? zGov80@WVp;QVk=MYO{x{OQOxFx16#}_8yWR>@}&0i)37|)7Wu=|B}Cr|HsKr`^Nh2 za1a6+76zi(geFuq1nThn=Um+Huo(icC8=1E`>-vF7sn|d=&-zpa=uG|i_;MrqbrDX zvYWk)!#%#ZEIuVJAmAjZGg}gUgQ8B_hp#Tr7HpZybI%G{7|TC#Hk4=hQlA&z=$d&r z{6w&umm9C6hfmU|+9%BMv!WhuXgYW}-L8KwoDAudM;uP4T18IYu6TYKQM|}-^!DZL zJ=ZpZ;8bG)BH|;oO{0LSjwsa=-d-@xR*OZkLdA_Bh!TQ>0&NVgfy;eg?lSsksu659?w6{36 zwn0&%_}`w5W&yY3Hw4jWN4!Lh+J1EepqDU_Gl2m}OC0E&!~$7bg7Py$em9m=JQ$qi zYl%HEoZap0#?flX;rsOzA&`Ig=3yuZK`NI0M6;(6LZBi7b$EVX z8D5~KBA#C>jF&_Y4E2Q~kyxg@Weq+*nENXW>L~SY`q@8j&TS9YyiybWf^#4B`N z)f;VCs#blP<-P?3styYl5h-=(V5tsT9btRE0}mTqdBlhl#nRte+0>*p;zXH12v#wv zNm9u$9j$%Y;2q=4YGF6E8PKWCE$o}T+_jzgx%o`c)4I*-2y$#{3y|lz25?}8Ql$eAiKPY09N~P!A9Fm>RyywZF`BIo?;B?0-iM>5SfMOd5X|uP@SZhrYe*7P z#W7NFfRs<}Rw1qjyQ0=WroroEFAwhRba@xz^lfhO3t;f97Qo7kjyA(-DKMk}vN{)Eveo7U0Dl{*Kt zu)2bHFTji_q|rsrZVG2GT>_Y)$@(&V&=>ul*BOVqv=r6H;Mf`F_Tbu)XcJ-i zqJnwq@aX0^JclO%k4jd9HV15S(}eNErR~<3!D`RVV^g-fnfN_g*5G+S>y)9r`jN7Ywg6mdZ(3!p3spNIRgl}cFoBXa-uVP3h?2+qEy>e^znK>2<{vJa`QCb+#1Qy)uzqLGBH`)KS{hp&4x@pAxs{{&&8I4(&bmP)}~NR|>W5J@Fq z@vgkiBo_`keEu%?@%3&-OZwjC&Is0+^5p(MZM^;-o91$}cd~}}p1(VHZcXxg=3;p7 z8Ry;tuQS09Z^zWd-@EJjYx>UUc`g>Rx{@xQerlfP^T+p>8^^|;kgs2Myx~FL{o<*# zEg_;6XXmfIdsXo9_ouENV3@M1a!dsd=Z?@fDdN~U#+=F)_qNfYe8`*pHu+M0RmPrkZsy{T`1|NQ+aAvHhDNf0`0pQzDr z*Y3O9&JZdc;}>-(9W7a2AcOPo@gyE%qgITJRW^`|&*(AdP+esS-~(>rnJ z4MbZXpUvvtl5PFXnLEq%&V+6KqJGc92mbwv6Y=5qDYdyET(7W@5wQ}UZK}dnN4Pdu z;8hC_Rg;A=V7Y^D8>C`Unp8GS_kZ>BVCCZO_GN3j8}0b9-*i#c?#}tKXD_EebK1ZE z9*Zg}qR)_=nmNhv?S`1s3Ae4Mez*B=rT*;~*0rqn*id`#`w2QnpFJ`CdGh6`(AV=# zTMyUHh)=sbDg0*Wjb9EKa&K~HZ>Z^bzH}bDe0oJ){i&7J)b^ZP>rK`?T_2)eO=#| zw_ix^Y%G#pEz`TY{oky%%B0U$H?Muf>bL!Y+WoesI(F~hj=PloSfTq?F}#|%gOmR? z$bm>Q+Q1Q=NEq^t$)D;-3f5jcDJ(%!NX4tgQdl3sEDKy_;B}j%eAa7ldN3K_as#x> z3>Gb*D+mk>-P(ItS>x;;nMd0{ar|Ukb#3MVH_s@At2k=@xwqfafQun-Y_LZrngsV^ zNG~=j>(t@e>-+Gfw!Xr($?-yw^0K(a_LO3it-0!wB%#0;3`LAcKVF&MTO2k1?5l3y z_Gvpab^V&3*c-=|7oKsvaxXu`uHkz9Q%*Li+sxXcZpE{=iBrPNpB2{7?iA$|B^ZQL z_sg2+>~6m4HCJ+{zis1ZSqpPZHC>}t8J)^_r0FVJW!#YwD6z;}F3IV7q}#J_``16b zS5r}O-=XoNiXX~iu6sVtEAC%K{gq+AZ9U1MII-&OmA8|3nVyR}@I`!Vc2B_1SDr5E z>QPKk4jsJo{ZSZ5?7R@20EgLFFh}N(;p6w5Wux;v$?Lv^0Xh{Dbns!li{rtc~C2VehR`TX~N4>A5wrZCT&s9&z}@+q{eK|LFWR-@cS* z{ZBNUD?XY~Nr_8;@U8c)dAa|%v~*MDoGj109gLD?JLYfj@DHE{llm`3=Cyt4!Czr; z_NUYKo!*bH-tu;ZvrQ1BW-HVa^x zmISwFP_%=E0;w#n#vt>zD-|;F^kybaey^jZ=C1V zoyRt2v+~kRIQqMSX7clLU*v7GvMwqL&Uf}R9^on)1=9b|AdxjpGzULrLBA=nAXxyM z1HY;>;GZM@cto2-h99=@*tm4`Q;!*#^gTd2lGkHV1iu_XzuBh^ea6ApPl=8o++|r3Pw06pGr9`zKEmv;WUN znixVpWcs7u09a#2Gqrv*dW2+upaFi?fFghzPy`N`21o575R6_K>%bArQd2Mu9*DC=h#00ZX3% zWK6ms(HErv&%|_@7%0G~&h-VTAd?wI1vR0lTro|LfF>+FiT@F$kV%VjZ1GrKoS=X= zVlpj21u4KNDyRTOg~v*4NJKRjU67ZGGFELd54svDz^F(~1&|=M7KH@Wppa;onjWAA z57gK-LZ&Cm0mTdTnTe#4&{Y8vNLNLXK)EOqyil((#*mDFPdbDO$bm%Jk$6R;O#-Hm zknstGL&hg+KNODIk5@E(@KH4qi81JbY(kW0i2JDn=uwSO2%tbhBnkzpK%qEdN%dD# z49b|bL2e$(+B3nl1)BjtBcbO39FSm$;((G-9Mdq#6muNMD3w6A8p>mviYbu;CBK=E zJhp~o=>Z^+rH2B6I#3`~OvmToDTL(K*wsNE7|IaCpF>%e;FGLTFxUVt$Y4WpK}9Go zI)=;53d8ju(h2!iC=(0s)-)!QR40f5WRM_+B7<5`WO%oB5NJUnrZUHC2!)X2gtDLT z=UA#Wfb}OVEFcuJuuv3GD2f7qUyPfAqflKwBmkk*ph;N#{{iHWgaZT&heRjTa3~lx z96yv*Q!&FwsRPn#P{Iv9l8Bl{8aWdPAb?CD6akcuBEZj}t__A@#1x((6hYPr%16QP zlUATeHR1_S0ZBF}7N`Qng5M{f*xt^5$0;F1_!AM0a4egc}L;$im0KqB0mqqakd=&}~?uluP4+f@?|_r2)4 z6&HTj37R{y?Juw#ZhukBp>Wjec=Ns<{7**6bv*_>a21IzDe((+0ib25u#E&b;D!~& z0p+7O@PVDK3x;ECs^AI_UG7n^^l1mGhS}l))8T3oH66-BcWI`WymC5bx@w}~q6%GM znPH+gx&l*2wyOeW!_^#WHk6He9ItZ%@ZWVHnKODdaF2s-c<}6>37C$^>tfiY6~F+O kRwxE29L0cVzorL`y#umAu6 literal 0 HcmV?d00001 From aa138783f31a411fa9c9c2230a780925d09153c5 Mon Sep 17 00:00:00 2001 From: rafa-ruiz Date: Tue, 31 Mar 2026 01:40:53 -0700 Subject: [PATCH 2/3] Golden dataset --- docs/LRM/avap_samples.zip | Bin 19620 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/LRM/avap_samples.zip diff --git a/docs/LRM/avap_samples.zip b/docs/LRM/avap_samples.zip deleted file mode 100644 index 5f05e6ef98ecf4bcba6d70b75580fd92b11921c5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19620 zcmc&+d0dlM)=pR@f*`Vs3Mxw=gk2r07E~lFgs{1wA%qA))&x}CL1YAkQ9wXw3ssc5 z)Vfz47oai)BtQ}B(&}dsakTxcOBHFS&UbIH<=(f15}5g-KYsqB=bY!9bI(2ZoadFt z*3>c~jRU`Zwl9kyA-^n0HYA};5|bztNs~eN&c4f=iE$jesWOUS=Eps=ehE!fTGCu$_i~EPh9%h%Zhpxr3EEW)k6$Y; zuIlXW_`ulU#3%L&`_3Nahs*)C1rBSUol-ZeV5&MC-Cuw^+7JMVN)yG41?~cOH>G)N z{|V|{Ce7)2Q=4mhW2bwkyH|S;e_LitVP}N)3T->1DW5GrR$X;uw_a%~he>7AnmL?i zra@C5<#pc;67@!qE4!u1H8(I&&URsOn!;~3$#=CmwYJ^pVaeI#Z}vCsEh{uGULH{% zyq{d!qoD{%`r)_i1Q3pFEF?rEFlCU;R{@kd;*eR4$HDsdIK(F<#z<1rq6N`nL9{S6 zNv1R?#U}@YGJn@e$`@|>i`Q-5F>A$rTkfN`88ydkx9{Y-SscAZ`RV1+b2D{k-`#lg z<*MwpmTzhs?nWe^e)Tl$*6Ae`QHfV7c|N8OStppm0Uh%4NXK>g(_IA}@7*^^w)YvI zv?1BMckWWw4Oc(so{}vS$TdH;Jox0vzcn^1f@8KOWEY) zq2$9n6A9x*Y4J%gMB??y5~-s4D~*$T@j#sJfuhe2bV*^-z=3GtcJcTfK_}ci-QC)Q zcoc&aofmv(ibtNY_XXo%ekMPUEn|6AO;}9P8rHu+r@hjwRYN1%n4vm+{QiU=>LRx&P zBspHJyu#$VR7xnIUBTSTkaJsFs9$IB$|D{hw#ztp*l%wnv$CbzIN#c4k45DNf{@~J z!;szY>iBj#S>fqYSxW1Bz|%ok3KI==R_bY$ZB~chJ$=_;?V&mJds0HOaG*j6WRe8n zZ>8CMYR5@McZj*0*UW5Thl51^s)x0BqNj$Ql|(8&B_%!}XA-EC&nq_3Le1Csyc*P#&d~MrX>oA>GYQfx6FEUgy7L zy)ez?-^eib%c1sDwEO+SC`TGtlp~b%$ONQu_u&pS?)D!}oVo-!oP#-?Xs$ic>7%k- z9gbhK4|n{`0msWyrD-C?tLQhs!&;sc6(>$jQhEZuxquWLCauY}AW%`unJ^G6vYum# z-J$9%J2T15M>9(fK5=yPHN93UHgl9n|8l6h*v?FBZ~w45yJw^JFTd1v9<-@AbYORd zWo76GmFATt`%23W=y`ZLo;~kXvsg29;6xlVeUK&ufyu&xLxkk|hv19}ojRf<_rs&~ z?s%jMqm!gCFtA1?h*PD*^qV4w!Q;Umylg=`v#YI%Mu(TJTo3_y^JLegc zRTV6hMy+D5W<}-NhexqkytoueK#op?zT#3SYyNriLg0Bm=6#~6e>@tav05EYpO=L@ z-FDz;tQAXTa9ofoNf3jZI3ZaOFOHE!4RbTRLRv^?IcNf@3x_Qk$;WpdD z)626Xh{xxC6)x`z&kyMc0&OLO&244uYI6iPHh5qSc0Xo13inrCypRnmQvl|EqS3TJ zB>(8`S4R*W_;?UtcPWmSfz}=PJ~3J-Ra%XX%o`%%9Rpf&Tp6?uatF1)JYt{3Dzc;? zGov80@WVp;QVk=MYO{x{OQOxFx16#}_8yWR>@}&0i)37|)7Wu=|B}Cr|HsKr`^Nh2 za1a6+76zi(geFuq1nThn=Um+Huo(icC8=1E`>-vF7sn|d=&-zpa=uG|i_;MrqbrDX zvYWk)!#%#ZEIuVJAmAjZGg}gUgQ8B_hp#Tr7HpZybI%G{7|TC#Hk4=hQlA&z=$d&r z{6w&umm9C6hfmU|+9%BMv!WhuXgYW}-L8KwoDAudM;uP4T18IYu6TYKQM|}-^!DZL zJ=ZpZ;8bG)BH|;oO{0LSjwsa=-d-@xR*OZkLdA_Bh!TQ>0&NVgfy;eg?lSsksu659?w6{36 zwn0&%_}`w5W&yY3Hw4jWN4!Lh+J1EepqDU_Gl2m}OC0E&!~$7bg7Py$em9m=JQ$qi zYl%HEoZap0#?flX;rsOzA&`Ig=3yuZK`NI0M6;(6LZBi7b$EVX z8D5~KBA#C>jF&_Y4E2Q~kyxg@Weq+*nENXW>L~SY`q@8j&TS9YyiybWf^#4B`N z)f;VCs#blP<-P?3styYl5h-=(V5tsT9btRE0}mTqdBlhl#nRte+0>*p;zXH12v#wv zNm9u$9j$%Y;2q=4YGF6E8PKWCE$o}T+_jzgx%o`c)4I*-2y$#{3y|lz25?}8Ql$eAiKPY09N~P!A9Fm>RyywZF`BIo?;B?0-iM>5SfMOd5X|uP@SZhrYe*7P z#W7NFfRs<}Rw1qjyQ0=WroroEFAwhRba@xz^lfhO3t;f97Qo7kjyA(-DKMk}vN{)Eveo7U0Dl{*Kt zu)2bHFTji_q|rsrZVG2GT>_Y)$@(&V&=>ul*BOVqv=r6H;Mf`F_Tbu)XcJ-i zqJnwq@aX0^JclO%k4jd9HV15S(}eNErR~<3!D`RVV^g-fnfN_g*5G+S>y)9r`jN7Ywg6mdZ(3!p3spNIRgl}cFoBXa-uVP3h?2+qEy>e^znK>2<{vJa`QCb+#1Qy)uzqLGBH`)KS{hp&4x@pAxs{{&&8I4(&bmP)}~NR|>W5J@Fq z@vgkiBo_`keEu%?@%3&-OZwjC&Is0+^5p(MZM^;-o91$}cd~}}p1(VHZcXxg=3;p7 z8Ry;tuQS09Z^zWd-@EJjYx>UUc`g>Rx{@xQerlfP^T+p>8^^|;kgs2Myx~FL{o<*# zEg_;6XXmfIdsXo9_ouENV3@M1a!dsd=Z?@fDdN~U#+=F)_qNfYe8`*pHu+M0RmPrkZsy{T`1|NQ+aAvHhDNf0`0pQzDr z*Y3O9&JZdc;}>-(9W7a2AcOPo@gyE%qgITJRW^`|&*(AdP+esS-~(>rnJ z4MbZXpUvvtl5PFXnLEq%&V+6KqJGc92mbwv6Y=5qDYdyET(7W@5wQ}UZK}dnN4Pdu z;8hC_Rg;A=V7Y^D8>C`Unp8GS_kZ>BVCCZO_GN3j8}0b9-*i#c?#}tKXD_EebK1ZE z9*Zg}qR)_=nmNhv?S`1s3Ae4Mez*B=rT*;~*0rqn*id`#`w2QnpFJ`CdGh6`(AV=# zTMyUHh)=sbDg0*Wjb9EKa&K~HZ>Z^bzH}bDe0oJ){i&7J)b^ZP>rK`?T_2)eO=#| zw_ix^Y%G#pEz`TY{oky%%B0U$H?Muf>bL!Y+WoesI(F~hj=PloSfTq?F}#|%gOmR? z$bm>Q+Q1Q=NEq^t$)D;-3f5jcDJ(%!NX4tgQdl3sEDKy_;B}j%eAa7ldN3K_as#x> z3>Gb*D+mk>-P(ItS>x;;nMd0{ar|Ukb#3MVH_s@At2k=@xwqfafQun-Y_LZrngsV^ zNG~=j>(t@e>-+Gfw!Xr($?-yw^0K(a_LO3it-0!wB%#0;3`LAcKVF&MTO2k1?5l3y z_Gvpab^V&3*c-=|7oKsvaxXu`uHkz9Q%*Li+sxXcZpE{=iBrPNpB2{7?iA$|B^ZQL z_sg2+>~6m4HCJ+{zis1ZSqpPZHC>}t8J)^_r0FVJW!#YwD6z;}F3IV7q}#J_``16b zS5r}O-=XoNiXX~iu6sVtEAC%K{gq+AZ9U1MII-&OmA8|3nVyR}@I`!Vc2B_1SDr5E z>QPKk4jsJo{ZSZ5?7R@20EgLFFh}N(;p6w5Wux;v$?Lv^0Xh{Dbns!li{rtc~C2VehR`TX~N4>A5wrZCT&s9&z}@+q{eK|LFWR-@cS* z{ZBNUD?XY~Nr_8;@U8c)dAa|%v~*MDoGj109gLD?JLYfj@DHE{llm`3=Cyt4!Czr; z_NUYKo!*bH-tu;ZvrQ1BW-HVa^x zmISwFP_%=E0;w#n#vt>zD-|;F^kybaey^jZ=C1V zoyRt2v+~kRIQqMSX7clLU*v7GvMwqL&Uf}R9^on)1=9b|AdxjpGzULrLBA=nAXxyM z1HY;>;GZM@cto2-h99=@*tm4`Q;!*#^gTd2lGkHV1iu_XzuBh^ea6ApPl=8o++|r3Pw06pGr9`zKEmv;WUN znixVpWcs7u09a#2Gqrv*dW2+upaFi?fFghzPy`N`21o575R6_K>%bArQd2Mu9*DC=h#00ZX3% zWK6ms(HErv&%|_@7%0G~&h-VTAd?wI1vR0lTro|LfF>+FiT@F$kV%VjZ1GrKoS=X= zVlpj21u4KNDyRTOg~v*4NJKRjU67ZGGFELd54svDz^F(~1&|=M7KH@Wppa;onjWAA z57gK-LZ&Cm0mTdTnTe#4&{Y8vNLNLXK)EOqyil((#*mDFPdbDO$bm%Jk$6R;O#-Hm zknstGL&hg+KNODIk5@E(@KH4qi81JbY(kW0i2JDn=uwSO2%tbhBnkzpK%qEdN%dD# z49b|bL2e$(+B3nl1)BjtBcbO39FSm$;((G-9Mdq#6muNMD3w6A8p>mviYbu;CBK=E zJhp~o=>Z^+rH2B6I#3`~OvmToDTL(K*wsNE7|IaCpF>%e;FGLTFxUVt$Y4WpK}9Go zI)=;53d8ju(h2!iC=(0s)-)!QR40f5WRM_+B7<5`WO%oB5NJUnrZUHC2!)X2gtDLT z=UA#Wfb}OVEFcuJuuv3GD2f7qUyPfAqflKwBmkk*ph;N#{{iHWgaZT&heRjTa3~lx z96yv*Q!&FwsRPn#P{Iv9l8Bl{8aWdPAb?CD6akcuBEZj}t__A@#1x((6hYPr%16QP zlUATeHR1_S0ZBF}7N`Qng5M{f*xt^5$0;F1_!AM0a4egc}L;$im0KqB0mqqakd=&}~?uluP4+f@?|_r2)4 z6&HTj37R{y?Juw#ZhukBp>Wjec=Ns<{7**6bv*_>a21IzDe((+0ib25u#E&b;D!~& z0p+7O@PVDK3x;ECs^AI_UG7n^^l1mGhS}l))8T3oH66-BcWI`WymC5bx@w}~q6%GM znPH+gx&l*2wyOeW!_^#WHk6He9ItZ%@ZWVHnKODdaF2s-c<}6>37C$^>tfiY6~F+O kRwxE29L0cVzorL`y#umAu6 From 1e9a6508f940220a837fab6161284b473a8773e5 Mon Sep 17 00:00:00 2001 From: rafa-ruiz Date: Tue, 31 Mar 2026 01:48:00 -0700 Subject: [PATCH 3/3] Golden dataset --- Docker/src/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/src/evaluate.py b/Docker/src/evaluate.py index 82465ba..4fcd44e 100644 --- a/Docker/src/evaluate.py +++ b/Docker/src/evaluate.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) GOLDEN_DATASET_PATH = Path(__file__).parent / "golden_dataset.json" CLAUDE_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514") -ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "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") +ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") K_RETRIEVE = 5