created code_indexing_improvements research

2026-03-25 10:37:53 +01:00 · 2026-03-25 10:37:53 +01:00 · da483c51bb
parent fe90548b8b
commit da483c51bb
8 changed files with 103823 additions and 0 deletions
--- a/research/code_indexing/BNF/avap.lark
+++ b/research/code_indexing/BNF/avap.lark
@ -0,0 +1,228 @@
 # Entry rule. A program is one or more statements/comments separated by at
 # least one EOL, with optional leading and trailing EOLs.
 start: program
 program: separator* line_or_comment (separator+ line_or_comment)* separator*
 # One logical line: a simple statement (optionally followed by a trailing
 # comment), a compound statement, a standalone comment, or a block comment.
 ?line_or_comment: simple_stmt comment?
                 | compound_stmt
                 | comment
                 | BLOCK_COMMENT
 ?separator: EOL+
 comment: DOC_COMMENT | LINE_COMMENT
 EOL: /\r?\n/
 # DOC_COMMENT (three slashes) carries priority 2 and LINE_COMMENT (two slashes)
 # priority 1, so the doc-comment terminal is preferred when both match.
 DOC_COMMENT.2: /\/\/\/[^\r\n]*/
 LINE_COMMENT.1: /\/\/[^\r\n]*/
 # Non-greedy body: the comment ends at the first closing marker and may span lines.
 BLOCK_COMMENT: /\/\*[\s\S]*?\*\//
 # Single-line statements. The keyword-specific command families are listed
 # before the generic call_stmt alternative.
 ?simple_stmt: assignment
             | return_stmt
             | system_command
             | io_command
             | async_command
             | connector_cmd
             | db_command
             | http_command
             | util_command
             | modularity_cmd
             | call_stmt
 # Statements that contain one or more nested blocks.
 ?compound_stmt: function_decl
               | if_stmt
               | loop_stmt
               | try_stmt
 assignment: identifier "=" expression
 # Three call shapes: plain call `f(args)`, method call with an assignment
 # target `x = obj.method(args)`, and bare method call `obj.method(args)`.
 call_stmt: identifier "(" argument_list? ")"
          | identifier "=" identifier "." identifier "(" argument_list? ")"
          | identifier "." identifier "(" argument_list? ")"
 # System commands: endpoint registration and variable definition.
 system_command: register_cmd
               | addvar_cmd
 # registerEndpoint takes exactly six arguments: two string literals, a list
 # display, another string literal, and two identifiers.
 register_cmd: "registerEndpoint" "(" stringliteral "," stringliteral "," list_display "," stringliteral "," identifier "," identifier ")"
 addvar_cmd: "addVar" "(" addvar_arg "," addvar_arg ")"
 # An addVar argument is an identifier, a literal, or a `$`-prefixed identifier.
 addvar_arg: identifier
           | literal
           | "$" identifier
 identifier: IDENTIFIER
 # NOTE(review): system_variable is not referenced by any other rule visible in
 # this grammar - confirm whether it is still needed.
 system_variable: "_status"
 # I/O commands: parameter input, list length, query-parameter list and result output.
 io_command: addparam_cmd
           | getlistlen_cmd
           | addresult_cmd
           | getparamlist_cmd
 addparam_cmd: "addParam" "(" stringliteral "," identifier ")"
 getlistlen_cmd: "getListLen" "(" identifier "," identifier ")"
 getparamlist_cmd: "getQueryParamList" "(" stringliteral "," identifier ")"
 addresult_cmd: "addResult" "(" identifier ")"
 # Control flow. Blocks are delimited by keyword calls rather than braces:
 # `if(...)` ... optional `else()` ... `end()`.
 if_stmt: "if" "(" if_condition ")" separator block ("else" "(" ")" separator block)? "end" "(" ")"
 # A condition is two operands followed by a string literal; the alternative
 # form passes `None` for both operands.
 # NOTE(review): the string literal presumably carries the comparison operator
 # or expression - confirm against the AVAP language reference.
 if_condition: if_atom "," if_atom "," stringliteral
             | "None" "," "None" "," stringliteral
 if_atom: identifier
        | literal
 # `startLoop(identifier, expression, expression)` ... `endLoop()`.
 loop_stmt: "startLoop" "(" identifier "," expression "," expression ")" separator block "endLoop" "(" ")"
 # `try()` ... `exception(identifier)` ... `end()`; the exception clause is mandatory.
 try_stmt: "try" "(" ")" separator block "exception" "(" identifier ")" separator block "end" "(" ")"
 # Same shape as `program`: at least one statement or comment is required.
 block: separator* line_or_comment (separator+ line_or_comment)* separator*
 # Asynchronous commands. Both forms require an assignment target.
 async_command: go_stmt
              | gather_stmt
 go_stmt: identifier "=" "go" identifier "(" argument_list? ")"
 # gather takes an identifier and an optional second expression argument.
 gather_stmt: identifier "=" "gather" "(" identifier ("," expression)? ")"
 # Connector instantiation: `name = avapConnector("...")`.
 connector_cmd: connector_instantiation
 connector_instantiation: identifier "=" "avapConnector" "(" stringliteral ")"
 # HTTP commands. RequestPost takes six arguments and RequestGet five; in both,
 # the second-to-last argument must be a bare identifier.
 http_command: req_post_cmd
             | req_get_cmd
 req_post_cmd: "RequestPost" "(" expression "," expression "," expression "," expression "," identifier "," expression ")"
 req_get_cmd: "RequestGet" "(" expression "," expression "," expression "," identifier "," expression ")"
 # Database (ORM) commands. Every command ends with a bare identifier argument.
 # NOTE(review): that trailing identifier is presumably the result variable - confirm.
 db_command: orm_direct
           | orm_check
           | orm_create
           | orm_select
           | orm_insert
           | orm_update
 orm_direct: "ormDirect" "(" expression "," identifier ")"
 orm_check: "ormCheckTable" "(" expression "," identifier ")"
 orm_create: "ormCreateTable" "(" expression "," expression "," expression "," identifier ")"
 # ormAccessSelect accepts three or four arguments: the third expression is optional.
 orm_select: "ormAccessSelect" "(" orm_fields "," expression ("," expression)? "," identifier ")"
 # Field selection is either the literal `*` or an arbitrary expression.
 orm_fields: "*"
           | expression
 orm_insert: "ormAccessInsert" "(" expression "," expression "," identifier ")"
 orm_update: "ormAccessUpdate" "(" expression "," expression "," expression "," expression "," identifier ")"
 # Utility commands: JSON/list helpers, hashing, regex, date/time and strings.
 # Every command ends with a bare identifier argument.
 util_command: json_list_cmd
             | crypto_cmd
             | regex_cmd
             | datetime_cmd
             | stamp_cmd
             | string_cmd
             | replace_cmd
 # Four commands share this single rule, so the parse tree does not tell them
 # apart by rule name.
 json_list_cmd: "variableToList" "(" expression "," identifier ")"
              | "itemFromList" "(" identifier "," expression "," identifier ")"
              | "variableFromJSON" "(" identifier "," expression "," identifier ")"
              | "AddVariableToJSON" "(" expression "," expression "," identifier ")"
 # Hash input may be either an identifier or a string literal.
 crypto_cmd: "encodeSHA256" "(" identifier_or_string "," identifier ")"
           | "encodeMD5" "(" identifier_or_string "," identifier ")"
 regex_cmd: "getRegex" "(" identifier "," stringliteral "," identifier ")"
 datetime_cmd: "getDateTime" "(" stringliteral "," expression "," stringliteral "," identifier ")"
 # Two timestamp conversions share this single rule.
 stamp_cmd: "stampToDatetime" "(" expression "," stringliteral "," expression "," identifier ")"
          | "getTimeStamp" "(" stringliteral "," stringliteral "," expression "," identifier ")"
 string_cmd: "randomString" "(" expression "," expression "," identifier ")"
 replace_cmd: "replace" "(" identifier_or_string "," stringliteral "," stringliteral "," identifier ")"
 # Function declaration. Unlike if/loop/try, the body is wrapped in braces, and
 # at least one EOL is required between the opening brace and the block.
 function_decl: "function" identifier "(" param_list? ")" "{" separator block "}"
 param_list: identifier ("," identifier)*
 # The returned expression is optional: `return()` is valid.
 return_stmt: "return" "(" expression? ")"
 # Modularity: `include "file"` and `import <name>` / `import "name"`.
 modularity_cmd: include_stmt
               | import_stmt
 include_stmt: "include" stringliteral
 import_stmt: "import" ("<" identifier ">" | stringliteral)
 # Expression ladder, from the outermost rule (logical_or) down to
 # power/primary. Rules prefixed with `?` are inlined by Lark when they match a
 # single child, which keeps the tree free of one-child wrapper nodes.
 ?expression: logical_or
 ?logical_or: logical_and ("or" logical_and)*
 ?logical_and: logical_not ("and" logical_not)*
 ?logical_not: "not" logical_not
             | comparison
 ?comparison: arithmetic (comp_op arithmetic)*
 comp_op: "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"
 ?arithmetic: term (("+" | "-") term)*
 ?term: factor (("*" | "/" | "%") factor)*
 # Unary plus/minus.
 ?factor: ("+" | "-") factor
        | power
 # The exponent is a `factor`, so a signed exponent such as `a ** -b` parses.
 ?power: primary ("**" factor)?
 ?primary: atom postfix*
 # Postfix operators: attribute access, indexing, slicing, and call.
 postfix: "." identifier
        | "[" expression "]"
        | "[" expression? ":" expression? (":" expression?)? "]"
        | "(" argument_list? ")"
 ?atom: identifier
      | "$" identifier
      | literal
      | "(" expression ")"
      | list_display
      | dict_display
 # A list is either a plain element list or a single-`for` comprehension with
 # an optional `if` filter.
 list_display: "[" argument_list? "]"
             | "[" expression "for" identifier "in" expression if_clause? "]"
 if_clause: "if" expression
 dict_display: "{" key_datum_list? "}"
 key_datum_list: key_datum ("," key_datum)*
 key_datum: expression ":" expression
 argument_list: expression ("," expression)*
 # Literals and terminals.
 number: FLOATNUMBER
       | INTEGER
 literal: stringliteral
        | number
        | boolean
        | "None"
 boolean: "True" | "False"
 INTEGER: /[0-9]+/
 # A float requires a dot: either `digits.` with optional fraction, or `.digits`.
 FLOATNUMBER: /(?:[0-9]+\.[0-9]*|\.[0-9]+)/
 stringliteral: STRING_DOUBLE
              | STRING_SINGLE
 # Earlier, stricter variants kept for reference: they only allowed a fixed set
 # of escape characters after the backslash.
 # STRING_DOUBLE: /"([^"\\]|\\["'\\ntr0])*"/
 # STRING_SINGLE: /'([^'\\]|\\["'\\ntr0])*'/
 # Active variants: a backslash may be followed by any single character.
 STRING_DOUBLE: /"([^"\\]|\\.)*"/
 STRING_SINGLE: /'([^'\\]|\\.)*'/
 identifier_or_string: identifier
                     | stringliteral
 IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/
 # Only spaces and tabs are skipped; newlines stay significant (see EOL).
 %ignore /[ \t]+/
--- a/research/code_indexing/chunk.py
+++ b/research/code_indexing/chunk.py
@ -0,0 +1,371 @@
 import json
 from copy import deepcopy
 from dataclasses import replace
 from pathlib import Path
 from typing import Any, Union
 from lark import Lark, Tree
 from chonkie import (
    Chunk,
    ElasticHandshake,
    FileFetcher,
    MarkdownChef,
    TextChef,
    TokenChunker,
    MarkdownDocument
 )
 from elasticsearch import Elasticsearch
 from loguru import logger
 from transformers import AutoTokenizer
 from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
 from src.config import settings
 # Maps a parse-tree rule name (the `data` attribute of a Lark subtree) to the
 # metadata flag name that _extract_command_metadata sets to True for a file.
 # Rules that bundle several commands (json_list_cmd, crypto_cmd, stamp_cmd)
 # map to a generic "*_ops" name rather than to a single command name.
 COMMAND_METADATA_NAMES = {
    # system
    "register_cmd": "registerEndpoint",
    "addvar_cmd": "addVar",
    # io
    "addparam_cmd": "addParam",
    "getlistlen_cmd": "getListLen",
    "getparamlist_cmd": "getQueryParamList",
    "addresult_cmd": "addResult",
    # async
    "go_stmt": "go",
    "gather_stmt": "gather",
    # connector
    "connector_instantiation": "avapConnector",
    # http
    "req_post_cmd": "RequestPost",
    "req_get_cmd": "RequestGet",
    # db
    "orm_direct": "ormDirect",
    "orm_check": "ormCheckTable",
    "orm_create": "ormCreateTable",
    "orm_select": "ormAccessSelect",
    "orm_insert": "ormAccessInsert",
    "orm_update": "ormAccessUpdate",
    # util
    "json_list_cmd": "json_list_ops",
    "crypto_cmd": "crypto_ops",
    "regex_cmd": "getRegex",
    "datetime_cmd": "getDateTime",
    "stamp_cmd": "timestamp_ops",
    "string_cmd": "randomString",
    "replace_cmd": "replace",
    # modularity
    "include_stmt": "include",
    "import_stmt": "import",
    # generic statements
    "assignment": "assignment",
    "call_stmt": "call",
    "return_stmt": "return",
    "if_stmt": "if",
    "loop_stmt": "startLoop",
    "try_stmt": "try",
    "function_decl": "function",
 }
 def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
    """
    Build the command-usage flags for a parsed AVAP file.
    Args:
        ast: Parse tree of the file, or None when parsing failed.
    Returns:
        Dict mapping each metadata name found in the tree to True, with keys
        in sorted order. Empty when ast is None or no known rule appears.
    """
    if ast is None:
        return {}
    # Collect into a set first: a rule may appear many times in one file.
    found = {
        COMMAND_METADATA_NAMES[node.data]
        for node in ast.iter_subtrees()
        if node.data in COMMAND_METADATA_NAMES
    }
    return dict.fromkeys(sorted(found), True)
 def _get_text(element) -> str:
    for attr in ("text", "content", "markdown"):
        value = getattr(element, attr, None)
        if isinstance(value, str):
            return value
    raise AttributeError(
        f"Could not extract text from element of type {type(element).__name__}"
    )
 def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
    """
    Fold code blocks and tables into the text chunk that precedes them.
    All chunks, code blocks and tables are ordered by (start_index, end_index).
    Each chunk opens a new group; every following code/table element is
    appended to that group until the next chunk starts. Code/table elements
    that appear before the first chunk are skipped.
    Args:
        processed_doc: Markdown document exposing chunks, code and tables.
    Returns:
        A deep copy of the document whose chunks are the merged chunks; its
        code and tables attributes reference the input document's lists.
    """
    # Tag each element with whether it opens a new group (True for chunks).
    tagged = (
        [(True, item) for item in processed_doc.chunks]
        + [(False, item) for item in processed_doc.code]
        + [(False, item) for item in processed_doc.tables]
    )
    ordered = sorted(tagged, key=lambda pair: (pair[1].start_index, pair[1].end_index))
    # Each group is [base_chunk, text_parts, end_index, token_count].
    groups = []
    for opens_group, item in ordered:
        if opens_group:
            groups.append([item, [_get_text(item)], item.end_index, item.token_count])
        elif groups:
            group = groups[-1]
            group[1].append(_get_text(item))
            group[2] = max(group[2], item.end_index)
            # Elements without a token_count contribute 0 tokens.
            group[3] += getattr(item, "token_count", 0)
    merged = [
        replace(
            base,
            text="\n\n".join(part for part in parts if part),
            end_index=end_index,
            token_count=token_count,
        )
        for base, parts, end_index, token_count in groups
    ]
    result = deepcopy(processed_doc)
    result.chunks = merged
    result.code = processed_doc.code
    result.tables = processed_doc.tables
    return result
 class ElasticHandshakeWithMetadata(ElasticHandshake):
    """
    Extended ElasticHandshake that preserves chunk metadata in Elasticsearch.
    Entries are dicts of the form {"chunk": Chunk, "extra_metadata": dict};
    the metadata keys are stored next to the core chunk fields in "_source".
    """
    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
        """
        Generate bulk actions including metadata.
        Args:
            chunks: Entries with a "chunk" key and an optional "extra_metadata" key.
        Returns:
            One bulk action (with "_index", "_id" and "_source") per entry.
        """
        # Nothing to embed: skip the embedding call entirely.
        if not chunks:
            return []
        embeddings = self.embedding_model.embed_batch([entry["chunk"].text for entry in chunks])
        actions = []
        for i, entry in enumerate(chunks):
            chunk = entry["chunk"]
            source = {
                "text": chunk.text,
                "embedding": embeddings[i],
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
                "token_count": chunk.token_count,
            }
            # Metadata is merged last, so a metadata key that collides with a
            # core field (e.g. "text") overrides that field.
            if entry.get("extra_metadata"):
                source.update(entry["extra_metadata"])
            actions.append({
                "_index": self.index_name,
                "_id": self._generate_id(i, chunk),
                "_source": source,
            })
        return actions
    def write(
        self,
        chunks: Union[Chunk, dict[str, Any], list[Union[Chunk, dict[str, Any]]]],
    ) -> list[dict[str, Any]]:
        """
        Write the chunks to the Elasticsearch index using the bulk API.
        Args:
            chunks: A single Chunk, a single entry dict, or a list of either.
                Bare Chunk objects are wrapped into an entry with empty metadata.
        Returns:
            The bulk actions that were submitted (not the Elasticsearch responses).
        """
        if isinstance(chunks, (Chunk, dict)):
            chunks = [chunks]
        # Normalise so _create_bulk_actions can always index entry["chunk"].
        entries = [
            item if isinstance(item, dict) else {"chunk": item, "extra_metadata": {}}
            for item in chunks
        ]
        actions = self._create_bulk_actions(entries)
        # Use the bulk helper to efficiently write the documents
        from elasticsearch.helpers import bulk
        # raise_on_error=False: failures are collected and logged instead of raised.
        success, errors = bulk(self.client, actions, raise_on_error=False)
        if errors:
            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
            # Log only the first few errors to keep the output readable
            for i, error in enumerate(errors[:5]):  # type: ignore
                logger.error(f"Error {i + 1}: {error}")
        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")
        return actions
 def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
    """
    Collect the files under a project folder that have one of the given extensions.
    Args:
        docs_folder_path (str): Folder to scan, relative to settings.proj_root
        docs_extension (list[str]): Extensions to keep (e.g., [".md", ".avap"])
    Returns:
        Whatever FileFetcher.fetch returns for that folder and extension list
    """
    source_dir = f"{settings.proj_root}/{docs_folder_path}"
    return FileFetcher().fetch(dir=source_dir, ext=docs_extension)
 def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
    """
    Turn documents into chunk entries, choosing the pipeline by file extension.
    Markdown files (".md") go through MarkdownChef and have their code blocks
    and tables merged into the preceding chunk. AVAP files (".avap") are
    token-chunked and additionally parsed with the AVAP grammar so that the
    commands used in the file can be attached as metadata; a parse failure is
    logged and the file is still chunked, without command metadata. Files with
    any other extension are skipped.
    Args:
        docs_path: List of Paths to the documents to be processed.
    Returns:
        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
    """
    tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
    markdown_chef = MarkdownChef(tokenizer=tokenizer)
    text_chef = TextChef()
    token_chunker = TokenChunker(tokenizer=tokenizer)
    grammar_path = settings.proj_root / "research/code_indexing/BNF/avap.lark"
    with open(grammar_path, encoding="utf-8") as grammar:
        grammar_source = grammar.read()
    avap_parser = Lark(
        grammar_source,
        parser="lalr",
        propagate_positions=True,
        start="program",
    )
    entries: list[dict[str, Any]] = []
    for doc_path in docs_path:
        suffix = doc_path.suffix.lower()
        if suffix not in (".md", ".avap"):
            continue
        if suffix == ".md":
            fused = _merge_markdown_document(markdown_chef.process(doc_path))
            chunks = fused.chunks
            file_metadata = {
                "file_type": "avap_docs",
                "filename": doc_path.name,
            }
        else:
            document = text_chef.process(doc_path)
            try:
                tree = avap_parser.parse(document.content)
            except Exception as e:
                # Best effort: an unparsable file is still indexed, just without command flags.
                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
                tree = None
            chunks = token_chunker.chunk(document.content)
            file_metadata = {
                "file_type": "avap_code",
                "filename": doc_path.name,
                **_extract_command_metadata(tree),
            }
        # Each entry gets its own metadata dict so later mutation stays local.
        entries.extend(
            {"chunk": chunk, "extra_metadata": dict(file_metadata)} for chunk in chunks
        )
    return entries
 def ingest_documents(
    chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
 ) -> list[dict[str, Any]]:
    """
    Embed the chunk entries and bulk-index them into Elasticsearch.
    Args:
        chunked_docs (list[dict[str, Any]]): List of dicts with "chunk" and "extra_metadata" keys
        es_index (str): Name of the Elasticsearch index to ingest into
        es_request_timeout (int): Timeout for Elasticsearch requests in seconds
        es_max_retries (int): Maximum number of retries for Elasticsearch requests
        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion
    Returns:
        List of the bulk actions submitted to Elasticsearch, one per chunk
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    client_options = {
        "hosts": settings.elasticsearch_local_url,
        "request_timeout": es_request_timeout,
        "max_retries": es_max_retries,
        "retry_on_timeout": es_retry_on_timeout,
    }
    es = Elasticsearch(**client_options)
    if delete_es_index:
        # Only issue the delete when the index is actually present.
        if es.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            es.indices.delete(index=es_index)
    embedder = OllamaEmbeddings(model=settings.ollama_emb_model_name)
    handshake = ElasticHandshakeWithMetadata(
        client=es,
        index_name=es_index,
        embedding_model=embedder,
    )
    logger.info(
        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
    )
    return handshake.write(chunked_docs)
 def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
    """
    Export the indexed chunk actions to a single JSON file.
    Embeddings that expose tolist() are converted in place to plain lists so
    they can be serialized; embeddings that are already lists are left as is,
    which also makes it safe to call this function twice on the same data.
    Args:
        elasticsearch_chunks (list[dict[str, Any]]): Bulk actions, each with an "embedding" under "_source"
        output_path (str): Path, relative to settings.proj_root, of the JSON file to write
    Returns:
        None
    """
    destination = settings.proj_root / output_path
    for chunk in elasticsearch_chunks:
        embedding = chunk["_source"]["embedding"]
        # Only array-like embeddings need conversion for JSON serialization.
        if hasattr(embedding, "tolist"):
            chunk["_source"]["embedding"] = embedding.tolist()
    # Create missing parent folders so a fresh output location does not fail.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as f:
        json.dump(elasticsearch_chunks, f, ensure_ascii=False, indent=4)
    logger.info(f"Exported processed documents to {destination}")
--- a/research/code_indexing/chunks/chunks_EBNF_metadata.json
+++ b/research/code_indexing/chunks/chunks_EBNF_metadata.json
--- a/research/code_indexing/chunks/chunks_file_level.json
+++ b/research/code_indexing/chunks/chunks_file_level.json
--- a/research/code_indexing/chunks/chunks_full_ebnf.json
+++ b/research/code_indexing/chunks/chunks_full_ebnf.json
--- a/research/code_indexing/chunks/chunks_grammar_level.jsonl
+++ b/research/code_indexing/chunks/chunks_grammar_level.jsonl
@ -0,0 +1,89 @@
 {"chunk_id": "5208d7435c0286ab", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "encodeSHA256", "section": "", "start_line": 1, "end_line": 1, "content": "encodeSHA256(\"payload_data\", checksum)", "metadata": {"uses_crypto": true, "uses_string_ops": true, "complexity": 2}, "token_estimate": 9}
 {"chunk_id": "e5e9b70428937778", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "encodeSHA256(\"payload_data\", checksum)\naddResult(checksum)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
 {"chunk_id": "49d6b31967a1db93", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "registerEndpoint", "section": "", "start_line": 1, "end_line": 1, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)", "metadata": {"registers_endpoint": true, "complexity": 1}, "token_estimate": 17}
 {"chunk_id": "e7ececd11823d42a", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)\naddVar(name,\"Alberto\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
 {"chunk_id": "f103d7719754088f", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(name,\"Alberto\")\nresult = \"Hello,\" + name", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
 {"chunk_id": "4b1ab59c1acb224c", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "result = \"Hello,\" + name\naddResult(result)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
 {"chunk_id": "682adaeeb528f778", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 1, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")", "metadata": {"complexity": 0}, "token_estimate": 12}
 {"chunk_id": "9bb665ca8d7590f7", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")\naddResult(mensaje)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
 {"chunk_id": "ed0136ad03a51e7e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"emails\", emails)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
 {"chunk_id": "899291ac8959ae3e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "getQueryParamList", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"emails\", emails)\ngetQueryParamList(\"lista_correos\", lista_correos)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
 {"chunk_id": "0eeff974dcd74729", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getQueryParamList(\"lista_correos\", lista_correos)\naddResult(lista_correos)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
 {"chunk_id": "b2e95857d059d99d", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"lang\", l)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
 {"chunk_id": "db2fab8dfbe7d460", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "addParam(\"lang\", l)\nif(l, \"es\", \"=\")\n    addVar(msg, \"Hola\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
 {"chunk_id": "2628fa886650658a", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "if(l, \"es\", \"=\")\n    addVar(msg, \"Hola\")\nend()\naddResult(msg)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
 {"chunk_id": "89bddd6830b6a8af", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre", "metadata": {"complexity": 0}, "token_estimate": 18}
 {"chunk_id": "6797d36c2eb0e38a", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre\naddResult(log)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
 {"chunk_id": "93008a3bed0ea808", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"password\",pass_nueva)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
 {"chunk_id": "142b2aef2f05fae7", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"password\",pass_nueva)\npass_antigua = \"password\"", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
 {"chunk_id": "b03b67f3aab35d7a", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "pass_antigua = \"password\"\nif(pass_nueva, pass_antigua, \"!=\")\n        addVar(cambio, \"Contraseña actualizada\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
 {"chunk_id": "99549cab6c8617d8", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(pass_nueva, pass_antigua, \"!=\")\n        addVar(cambio, \"Contraseña actualizada\")\nend()\naddResult(cambio)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 31}
 {"chunk_id": "123dfdacd4160b0d", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "replace", "section": "", "start_line": 1, "end_line": 1, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 17}
 {"chunk_id": "c65655393175720a", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\naddResult(ref_actualizada)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
 {"chunk_id": "3edbf12e560e22b1", "source_file": "docs/samples/manejo_error_sql_critico.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 7, "content": "try()\n    ormDirect(\"UPDATE table_inexistente SET a=1\", res)\nexception(e)\n    addVar(_status, 500)\n    addVar(error_msg, \"Error de base de datos\")\n    addResult(error_msg)\nend()", "metadata": {"uses_orm": true, "uses_auth": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 5}, "token_estimate": 51}
 {"chunk_id": "75bcc1f794c8527f", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"sal_par\",saldo)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
 {"chunk_id": "99462a4539651e84", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"sal_par\",saldo)\nif(saldo, 0, \">\")\n    permitir = True\nelse()\n    permitir = False\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
 {"chunk_id": "c9134748119a6401", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "else()\n    permitir = False\nend()\naddResult(permitir)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
 {"chunk_id": "da88ce6ec35e309a", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 19}
 {"chunk_id": "ef826cb80ab05a8c", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)\nif(None, None, \" user_type == 'VIP' or compras > 100\")\n    addVar(descuento, 0.20)\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 51}
 {"chunk_id": "117c5396b3e2f3bd", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(None, None, \" user_type == 'VIP' or compras > 100\")\n    addVar(descuento, 0.20)\nend()\naddResult(descuento)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 37}
 {"chunk_id": "559f8f61eda7ff75", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 27}
 {"chunk_id": "b40f10f126c22c01", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\naddResult(sql_date)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
 {"chunk_id": "717f75fe4eb08ecf", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "registros = ['1','2','3']", "metadata": {"complexity": 0}, "token_estimate": 10}
 {"chunk_id": "8a695ac320884362", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "registros = ['1','2','3']\ngetListLen(registros, total)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
 {"chunk_id": "9530c2cad477b991", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(registros, total)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
 {"chunk_id": "c4acc74c9b001703", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 4, "end_line": 6, "content": "contador = 0\nstartLoop(idx, 0, 2)\n    actual = registros[int(idx)]\nendLoop()", "metadata": {"uses_loop": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
 {"chunk_id": "80e935fcd6c7a232", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "startLoop(idx, 0, 2)\n    actual = registros[int(idx)]\nendLoop()\naddResult(actual)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
 {"chunk_id": "576b1bc85805eef0", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 86400, \"UTC\", expira)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 13}
 {"chunk_id": "686f254e071d6280", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 86400, \"UTC\", expira)\naddResult(expira)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
 {"chunk_id": "79fd8fee120921e7", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"client_id\", id_interno)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 10}
 {"chunk_id": "03697091447c57d4", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"client_id\", id_interno)\naddResult(id_interno)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
 {"chunk_id": "2c64510b9ac6042b", "source_file": "docs/samples/try_catch_request.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 6, "content": "try()\n    RequestGet(\"https://api.test.com/data\", 0, 0, respuesta, None)\nexception(e)\n    addVar(error_trace, e)\n    addResult(error_trace)\nend()", "metadata": {"uses_http": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 4}, "token_estimate": 42}
 {"chunk_id": "4d9f72fb03ba6d2b", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"api_key\", key)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
 {"chunk_id": "19fa0a3950612c1e", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"api_key\", key)\nif(key, None, \"==\")\n    addVar(_status, 403)\n    addVar(error, \"Acceso denegado: falta API KEY\")\n    addResult(error)\nend()", "metadata": {"uses_auth": true, "uses_conditional": true, "returns_result": true, "complexity": 3, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 47}
 {"chunk_id": "e06fe329097212dd", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"rol\", r)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
 {"chunk_id": "285aeb7e911a5075", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"rol\", r)\nacceso = False", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
 {"chunk_id": "f8ed75075b7b1b13", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 4, "end_line": 6, "content": "acceso = False\nif(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n    acceso = True\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
 {"chunk_id": "b323dedebcbd9036", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "if(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n    acceso = True\nend()\naddResult(acceso)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
 {"chunk_id": "d02cc7019c314251", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "datos_cliente = \"datos\"", "metadata": {"complexity": 0}, "token_estimate": 6}
 {"chunk_id": "c1528242fcd85a68", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "datos_cliente = \"datos\"\naddVar(clave, \"cliente_vip\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
 {"chunk_id": "d335da8caf95ac8d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVariableToJSON", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(clave, \"cliente_vip\")\nAddvariableToJSON(clave, datos_cliente, mi_json_final)", "metadata": {"uses_json": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
 {"chunk_id": "27067ebe43e3b05d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "AddvariableToJSON(clave, datos_cliente, mi_json_final)\naddResult(mi_json_final)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 20}
 {"chunk_id": "a25dfc3b319135d3", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"data_list\", mi_lista)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
 {"chunk_id": "d96fd663666733fe", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"data_list\", mi_lista)\ngetListLen(mi_lista, cantidad)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
 {"chunk_id": "9905db6de1ea3067", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(mi_lista, cantidad)\naddResult(cantidad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
 {"chunk_id": "7c239ad53392d63d", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "stampToDatetime", "section": "", "start_line": 1, "end_line": 1, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 22}
 {"chunk_id": "c4dc5d3c081101a5", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\naddResult(fecha_human)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 28}
 {"chunk_id": "2905488dffcbd7ba", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(base, 1000)\naddVar(copia, $base)", "metadata": {"complexity": 0}, "token_estimate": 16}
 {"chunk_id": "82e05ef62a72de87", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(base, 1000)\naddVar(copia, $base)\naddResult(copia)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
 {"chunk_id": "a6727546f328e768", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(code, 200)\naddVar(status, \"Success\")", "metadata": {"complexity": 0}, "token_estimate": 14}
 {"chunk_id": "ce12abd61c278bec", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 4, "content": "addVar(code, 200)\naddVar(status, \"Success\")\naddResult(code)\naddResult(status)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
 {"chunk_id": "45b0086b13784a7d", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "encontrado = False", "metadata": {"complexity": 0}, "token_estimate": 5}
 {"chunk_id": "c6df33b0e7eac0ff", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 2, "end_line": 7, "content": "encontrado = False\nstartLoop(i, 1, 10)\n    if(i, 5, \"==\")\n        encontrado = True\n        i = 11 \n    end()\nendLoop()", "metadata": {"uses_loop": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 42}
 {"chunk_id": "02edc488f13b7367", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "i = 11 \n    end()\nendLoop()\naddResult(encontrado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
 {"chunk_id": "c8dbbbf6cb64c10d", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 4, "content": "function suma(a, b){\n        total = a + b\n        return(total)\n    }", "metadata": {"complexity": 0}, "token_estimate": 19}
 {"chunk_id": "1065800a57207e04", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function suma(a, b)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 4}, "token_estimate": 6}
 {"chunk_id": "1ef5fa8a4a980012", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 5, "end_line": 5, "content": "// contexto: function suma(a, b)\nresultado = suma(10, 20)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
 {"chunk_id": "ff7df988add5bbef", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "// contexto: function suma(a, b)\naddResult(resultado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 13}
 {"chunk_id": "b8682e4f71d9d7c3", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 7, "content": "function es_valido(token){\n    response = False\n    if(token, \"SECRET\", \"=\")\n        response = True\n    end()\n    return(response)\n    }", "metadata": {"uses_conditional": true, "complexity": 1}, "token_estimate": 34}
 {"chunk_id": "a1cfc36abdf661a0", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function es_valido(token)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 7}, "token_estimate": 6}
 {"chunk_id": "66706bf4b7d3aede", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 8, "end_line": 8, "content": "// contexto: function es_valido(token)\nautorizado = es_valido(\"SECRET\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
 {"chunk_id": "5932e6b75c40b7db", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 9, "end_line": 9, "content": "// contexto: function es_valido(token)\naddResult(autorizado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 15}
 {"chunk_id": "4be60a16d7cc7c4d", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "randomString", "section": "", "start_line": 1, "end_line": 1, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 15}
 {"chunk_id": "1810ca839b071a65", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)\naddResult(token_seguridad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
 {"chunk_id": "ed8b4a4e75a71762", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 0, \"UTC\", ahora)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 11}
 {"chunk_id": "05d2d0c8e6266861", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 0, \"UTC\", ahora)\naddResult(ahora)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
 {"chunk_id": "02d7b0e4a1e1f09c", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "orm_command", "section": "", "start_line": 1, "end_line": 1, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)", "metadata": {"uses_orm": true, "complexity": 1}, "token_estimate": 13}
 {"chunk_id": "6daea421c5a1d565", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)\nif(resultado_comprobacion,False,'==')\n    ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()", "metadata": {"uses_orm": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
 {"chunk_id": "47d660e6c1f124d1", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 6, "content": "if(resultado_comprobacion,False,'==')\n    ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()\naddResult(resultado_comprobacion)\naddResult(resultado_creacion)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
 {"chunk_id": "b15daff2028a2136", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"page\", p)\naddParam(\"size\", s)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 14}
 {"chunk_id": "8f1fa0e84c981765", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 6, "content": "addParam(\"page\", p)\naddParam(\"size\", s)\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\noffset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 62}
 {"chunk_id": "e27ce4178666239b", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 8, "content": "offset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0\naddResult(offset)\naddResult(limite)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
 {"chunk_id": "9a66c0e4c49bbbcb", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 9, "end_line": 13, "content": "addResult(offset)\naddResult(limite)\nstartLoop(i, 2, limite)\n    actual = registros[int(i)]\n    titulo = \"reg_%s\" % i\n    AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 53}
 {"chunk_id": "77c985068f6f9269", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 14, "end_line": 14, "content": "titulo = \"reg_%s\" % i\n    AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()\naddResult(pagina_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
 {"chunk_id": "aeb4f87681bdc8b4", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nivel = 5\nes_admin = nivel >= 10", "metadata": {"complexity": 0}, "token_estimate": 12}
 {"chunk_id": "5f0f938196d5e573", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nivel = 5\nes_admin = nivel >= 10\naddResult(es_admin)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
 {"chunk_id": "42fb50109876864c", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 3, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva", "metadata": {"complexity": 0}, "token_estimate": 22}
 {"chunk_id": "6019c2adc7750c04", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva\naddResult(total)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 27}
 {"chunk_id": "e2f6a0de7e7f9dc1", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 1, "end_line": 4, "content": "startLoop(i,1,10)\n        item = \"item_%s\" % i\n        AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2}, "token_estimate": 36}
 {"chunk_id": "ce1f2fab7c807537", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "item = \"item_%s\" % i\n        AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()\naddResult(mi_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
--- a/research/code_indexing/elasticsearch_ingestion.py
+++ b/research/code_indexing/elasticsearch_ingestion.py
@ -0,0 +1,64 @@
 import typer
 from loguru import logger
 from scripts.pipelines.tasks.chunk import (
    fetch_documents, 
    process_documents, 
    export_documents,
    ingest_documents
 )
 app = typer.Typer()
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/samples",
    output_path: str = "research/code_indexing/chunks/chunks_EBNF_metadata.json",
    docs_extension: list[str] = [".avap"],
    es_index: str = "avap-code-indexing-ebnf-metadata",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True,
) -> None:
    """
    Pipeline to ingest documents into an Elasticsearch index.

    The pipeline fetches documents from a folder, processes them into chunks,
    ingests those chunks into the given Elasticsearch index and finally exports
    the documents returned by the ingestion step to a file.

    Args:
        docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
        output_path (str): Path of the file the documents returned by the ingestion step are exported to. Default is "research/code_indexing/chunks/chunks_EBNF_metadata.json".
        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".avap"].
        es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-code-indexing-ebnf-metadata".
        es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
        es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
        es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.

    Returns:
        None
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    # Step 1: collect the files under the folder that match the extensions.
    logger.info(f"Fetching files from {docs_folder_path}...")
    docs_path = fetch_documents(docs_folder_path, docs_extension)

    # Step 2: turn the fetched files into chunks.
    logger.info("Processing docs...")
    chunked_docs = process_documents(docs_path)

    # Step 3: push the chunks to Elasticsearch. Arguments stay positional
    # because the helper's parameter names are not visible from this module.
    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    elasticsearch_docs = ingest_documents(
        chunked_docs,
        es_index,
        es_request_timeout,
        es_max_retries,
        es_retry_on_timeout,
        delete_es_index,
    )

    # Step 4: the export runs after ingestion because it writes out what
    # ingest_documents returned, not the raw chunks.
    logger.info(f"Exporting processed documents to {output_path}...")
    export_documents(elasticsearch_docs, output_path)

    logger.info(f"Finished ingesting in {es_index}.")
 if __name__ == "__main__":
    # Script entry point: hand control to the Typer application. Any
    # exception that escapes it is logged with its traceback and then
    # re-raised so the failure is not swallowed.
    try:
        app()
    except Exception as error:
        logger.exception(error)
        raise
--- a/research/code_indexing/generate_avap_code_qa_golden_dataset.ipynb
+++ b/research/code_indexing/generate_avap_code_qa_golden_dataset.ipynb
@ -0,0 +1,198 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d520f6c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from datasets import load_dataset\n",
    "\n",
    "import boto3\n",
    "from botocore.config import Config\n",
    "from langchain_core.messages import SystemMessage, HumanMessage\n",
    "\n",
    "from src.utils.llm_factory import create_chat_model\n",
    "from src.config import settings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e08b9060",
   "metadata": {},
   "source": [
    "### Create LLM isntance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "81111a86",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = Config(\n",
    "    region_name=\"us-east-1\",\n",
    "    connect_timeout=10,     \n",
    "    read_timeout=600,        \n",
    ")\n",
    "\n",
    "client = boto3.client(\"bedrock-runtime\", config=config)\n",
    "\n",
    "llm = create_chat_model(\n",
    "    provider=\"bedrock\",\n",
    "    client=client,\n",
    "    model=\"global.anthropic.claude-sonnet-4-6\",\n",
    "    temperature=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "045f8e81",
   "metadata": {},
   "source": [
    "### Load AVAP data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "07dea3fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(settings.proj_root / \"docs/LRM/avap.md\", \"r\") as f:\n",
    "    avap_docs = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "adbbe8b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 33 AVAP samples\n"
     ]
    }
   ],
   "source": [
    "samples_dir = settings.proj_root / \"docs/samples\"\n",
    "avap_samples = []\n",
    "\n",
    "for avap_file in sorted(samples_dir.glob(\"*.avap\")):\n",
    "    with open(avap_file, \"r\") as f:\n",
    "        code = f.read()\n",
    "    \n",
    "    avap_samples.append({\n",
    "        \"file\": avap_file.name,\n",
    "        \"code\": code\n",
    "    })\n",
    "\n",
    "# Display as JSON\n",
    "avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)\n",
    "print(f\"Loaded {len(avap_samples)} AVAP samples\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a15e09a",
   "metadata": {},
   "source": [
    "### Prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "895a170f",
   "metadata": {},
   "outputs": [],
   "source": [
    "GOLDEN_DATASET_PROMPT = SystemMessage(\n",
    "    content=f\"\"\"\n",
    "    You are an AI agent responsible for generating a golden dataset of queries for AVAP code retrieval and understanding.\n",
    "\n",
    "    You will receive a JSON array of AVAP code samples, each with a 'file' name and 'code' content.\n",
    "\n",
    "    Your task is to:\n",
    "    1. Analyze each AVAP code sample.\n",
    "    2. Generate 2-3 natural language queries that can be answered by examining that specific code.\n",
    "    3. Output a JSON array where each element has:\n",
    "       - \"query\": A natural language question about AVAP code implementation, best practices, or specific constructs.\n",
    "       - \"context\": The filename of the code sample that provides the context/answer for this query.\n",
    "\n",
    "    Requirements:\n",
    "    - Queries should be diverse: ask about functions, control flow, API operations, error handling, etc.\n",
    "    - Queries must be answerable using ONLY the provided code samples.\n",
    "    - Queries should be framed as natural developer questions (e.g., \"How do you handle errors in AVAP?\" or \"Show me an example of looping over a list\").\n",
    "    - Use natural English (or Spanish if context is Spanish-language code).\n",
    "    - Do not reference exact variable names unless necessary; focus on the patterns and constructs used.\n",
    "    - Output MUST be valid JSON array format.\n",
    "\n",
    "    AVAP Code Samples:\n",
    "    {avap_samples_json}\n",
    "\n",
    "    Output format (JSON array):\n",
    "    [\n",
    "      {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
    "      {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
    "      ...\n",
    "    ]\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3123199",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98c4f93c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "723352ee",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }