created code_indexing_improvements research

2026-03-25 10:37:53 +01:00 · 2026-03-25 10:37:53 +01:00 · da483c51bb
parent fe90548b8b
commit da483c51bb
8 changed files with 103823 additions and 0 deletions
--- a/research/code_indexing/BNF/avap.lark
+++ b/research/code_indexing/BNF/avap.lark
@ -0,0 +1,228 @@
+// Conventional Lark entry rule; it only delegates to `program`.
+start: program
+
+// A source file: one or more statements/comments separated by line breaks,
+// with optional blank lines before and after. At least one
+// `line_or_comment` is required, so an empty file does not match.
+program: separator* line_or_comment (separator+ line_or_comment)* separator*
+
+// One logical line. Only simple statements may carry a trailing comment;
+// compound statements, standalone comments and block comments stand alone.
+?line_or_comment: simple_stmt comment?
+                | compound_stmt
+                | comment
+                | BLOCK_COMMENT
+
+// One or more consecutive line breaks.
+?separator: EOL+
+
+// Single-line comment, either the `///` or the `//` flavour.
+comment: DOC_COMMENT | LINE_COMMENT
+
+// Line break, with or without a preceding carriage return.
+EOL: /\r?\n/
+
+// DOC_COMMENT has the higher terminal priority (.2 vs .1) so that `///...`
+// is not lexed as a LINE_COMMENT. BLOCK_COMMENT is non-greedy and may span
+// several lines.
+DOC_COMMENT.2: /\/\/\/[^\r\n]*/
+LINE_COMMENT.1: /\/\/[^\r\n]*/
+BLOCK_COMMENT: /\/\*[\s\S]*?\*\//
+
+// Statements that fit on one line. The leading `?` makes Lark inline the
+// rule when it has a single child, so the concrete statement rule (e.g.
+// `assignment`, `addvar_cmd`) appears in the tree instead of `simple_stmt`.
+?simple_stmt: assignment
+            | return_stmt
+            | system_command
+            | io_command
+            | async_command
+            | connector_cmd
+            | db_command
+            | http_command
+            | util_command
+            | modularity_cmd
+            | call_stmt
+
+// Statements that own one or more nested `block`s.
+?compound_stmt: function_decl
+              | if_stmt
+              | loop_stmt
+              | try_stmt
+
+// name = expression
+assignment: identifier "=" expression
+
+// Call used as a statement, in three shapes:
+//   f(args)
+//   target = obj.method(args)
+//   obj.method(args)
+// NOTE(review): the second shape shares the prefix `identifier "=" identifier`
+// with `assignment`; confirm the LALR parser resolves inputs such as
+// `x = a.b + 1` the way you expect.
+call_stmt: identifier "(" argument_list? ")"
+         | identifier "=" identifier "." identifier "(" argument_list? ")"
+         | identifier "." identifier "(" argument_list? ")"
+
+// System-level commands.
+system_command: register_cmd
+              | addvar_cmd
+
+// registerEndpoint(str, str, list, str, identifier, identifier)
+// e.g. registerEndpoint("/hello_world","GET",[],"HELLO_WORLD",main,result)
+register_cmd: "registerEndpoint" "(" stringliteral "," stringliteral "," list_display "," stringliteral "," identifier "," identifier ")"
+
+// addVar(arg, arg), e.g. addVar(name,"Alberto")
+addvar_cmd: "addVar" "(" addvar_arg "," addvar_arg ")"
+
+// An addVar argument: a name, a literal, or a `$`-prefixed name.
+addvar_arg: identifier
+          | literal
+          | "$" identifier
+
+// Wrapper rule so identifiers show up as `identifier` subtrees.
+identifier: IDENTIFIER
+
+// NOTE(review): not referenced by any other rule in this grammar.
+system_variable: "_status"
+
+// Input/output commands.
+io_command: addparam_cmd
+          | getlistlen_cmd
+          | addresult_cmd
+          | getparamlist_cmd
+
+// Fixed-arity command signatures; the argument kinds are exactly as listed.
+addparam_cmd: "addParam" "(" stringliteral "," identifier ")"
+getlistlen_cmd: "getListLen" "(" identifier "," identifier ")"
+getparamlist_cmd: "getQueryParamList" "(" stringliteral "," identifier ")"
+addresult_cmd: "addResult" "(" identifier ")"
+
+// if(<condition>) NEWLINE block [else() NEWLINE block] end()
+// The header must be followed by at least one line break before the body.
+if_stmt: "if" "(" if_condition ")" separator block ("else" "(" ")" separator block)? "end" "(" ")"
+
+// Three comma-separated arguments: two atoms plus a string, or
+// None, None plus a string.
+// NOTE(review): presumably the string carries the comparison operator or a
+// full condition expression - confirm against the AVAP language spec.
+if_condition: if_atom "," if_atom "," stringliteral
+            | "None" "," "None" "," stringliteral
+
+// Operand allowed inside an `if` header.
+if_atom: identifier
+       | literal
+
+// startLoop(identifier, expression, expression) NEWLINE block endLoop()
+// NOTE(review): the arguments look like loop variable, start and end -
+// confirm against the AVAP language spec.
+loop_stmt: "startLoop" "(" identifier "," expression "," expression ")" separator block "endLoop" "(" ")"
+
+// try() NEWLINE block exception(identifier) NEWLINE block end()
+try_stmt: "try" "(" ")" separator block "exception" "(" identifier ")" separator block "end" "(" ")"
+
+// Body of a compound statement. Same shape as `program`: at least one
+// statement/comment is required, so an empty body does not match.
+block: separator* line_or_comment (separator+ line_or_comment)* separator*
+
+// Asynchronous execution commands.
+async_command: go_stmt
+             | gather_stmt
+
+// handle = go func(args)
+go_stmt: identifier "=" "go" identifier "(" argument_list? ")"
+// result = gather(identifier[, expression])
+gather_stmt: identifier "=" "gather" "(" identifier ("," expression)? ")"
+
+// Connector commands (currently only instantiation).
+connector_cmd: connector_instantiation
+
+// name = avapConnector("...")
+connector_instantiation: identifier "=" "avapConnector" "(" stringliteral ")"
+
+// HTTP client commands.
+http_command: req_post_cmd
+            | req_get_cmd
+
+// RequestPost takes six arguments and RequestGet five; in both, the
+// second-to-last argument must be a bare identifier.
+req_post_cmd: "RequestPost" "(" expression "," expression "," expression "," expression "," identifier "," expression ")"
+req_get_cmd: "RequestGet" "(" expression "," expression "," expression "," identifier "," expression ")"
+
+// Database / ORM commands.
+db_command: orm_direct
+          | orm_check
+          | orm_create
+          | orm_select
+          | orm_insert
+          | orm_update
+
+// Every ORM command ends with a bare identifier as its last argument.
+orm_direct: "ormDirect" "(" expression "," identifier ")"
+orm_check: "ormCheckTable" "(" expression "," identifier ")"
+orm_create: "ormCreateTable" "(" expression "," expression "," expression "," identifier ")"
+
+// ormAccessSelect(fields, expression[, expression], identifier)
+orm_select: "ormAccessSelect" "(" orm_fields "," expression ("," expression)? "," identifier ")"
+
+// Field selector: a literal `*` or any expression.
+orm_fields: "*"
+          | expression
+
+orm_insert: "ormAccessInsert" "(" expression "," expression "," identifier ")"
+orm_update: "ormAccessUpdate" "(" expression "," expression "," expression "," expression "," identifier ")"
+
+// Utility commands (JSON/list handling, hashing, regex, dates, strings).
+util_command: json_list_cmd
+            | crypto_cmd
+            | regex_cmd
+            | datetime_cmd
+            | stamp_cmd
+            | string_cmd
+            | replace_cmd
+
+// Four list/JSON commands share this rule, so the parse tree only records
+// `json_list_cmd`, not which of the four commands was used.
+json_list_cmd: "variableToList" "(" expression "," identifier ")"
+             | "itemFromList" "(" identifier "," expression "," identifier ")"
+             | "variableFromJSON" "(" identifier "," expression "," identifier ")"
+             | "AddVariableToJSON" "(" expression "," expression "," identifier ")"
+
+// Hash commands, e.g. encodeSHA256("payload_data", checksum).
+// Both alternatives share the `crypto_cmd` rule name.
+crypto_cmd: "encodeSHA256" "(" identifier_or_string "," identifier ")"
+          | "encodeMD5" "(" identifier_or_string "," identifier ")"
+
+// getRegex(identifier, "pattern", identifier)
+regex_cmd: "getRegex" "(" identifier "," stringliteral "," identifier ")"
+
+// getDateTime("...", expression, "...", identifier)
+datetime_cmd: "getDateTime" "(" stringliteral "," expression "," stringliteral "," identifier ")"
+
+// Timestamp conversions; both alternatives share the `stamp_cmd` rule name.
+stamp_cmd: "stampToDatetime" "(" expression "," stringliteral "," expression "," identifier ")"
+         | "getTimeStamp" "(" stringliteral "," stringliteral "," expression "," identifier ")"
+
+// randomString(expression, expression, identifier)
+string_cmd: "randomString" "(" expression "," expression "," identifier ")"
+
+// replace(identifier_or_string, "...", "...", identifier)
+replace_cmd: "replace" "(" identifier_or_string "," stringliteral "," stringliteral "," identifier ")"
+
+// function name(params) { NEWLINE block }
+// The opening brace must be followed by at least one line break.
+function_decl: "function" identifier "(" param_list? ")" "{" separator block "}"
+
+// Comma-separated parameter names.
+param_list: identifier ("," identifier)*
+
+// return() or return(expression)
+return_stmt: "return" "(" expression? ")"
+
+// File-level composition.
+modularity_cmd: include_stmt
+              | import_stmt
+
+// include "path"
+include_stmt: "include" stringliteral
+// import <name>   or   import "path"
+import_stmt: "import" ("<" identifier ">" | stringliteral)
+
+// Expression grammar, layered from lowest to highest precedence:
+//   or < and < not < comparison < +,- < *,/,% < unary +,- < ** < postfix
+// All layers use `?`, so a layer with a single child is inlined and does
+// not add a node to the tree.
+?expression: logical_or
+
+?logical_or: logical_and ("or" logical_and)*
+?logical_and: logical_not ("and" logical_not)*
+
+?logical_not: "not" logical_not
+            | comparison
+
+// Comparisons may be chained: a < b <= c.
+?comparison: arithmetic (comp_op arithmetic)*
+
+// NOTE(review): with Lark's default tree shaping, anonymous string tokens
+// are filtered out, so a `comp_op` node will likely not record which
+// operator matched (same for the "+"/"-"/"*"... operators below). Prefix the
+// rule with `!` if the operator is ever needed downstream - confirm.
+comp_op: "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"
+
+?arithmetic: term (("+" | "-") term)*
+?term: factor (("*" | "/" | "%") factor)*
+
+// Unary sign; recursion allows stacked signs such as `--x`.
+?factor: ("+" | "-") factor
+       | power
+
+// The exponent is a `factor`, so `a ** -b` parses and `**` nests to the right.
+?power: primary ("**" factor)?
+
+// An atom followed by any number of postfix operations.
+?primary: atom postfix*
+
+// Postfix operations: attribute access, indexing, slicing
+// ([start:stop] or [start:stop:step], every part optional), and calls.
+postfix: "." identifier
+       | "[" expression "]"
+       | "[" expression? ":" expression? (":" expression?)? "]"
+       | "(" argument_list? ")"
+
+// Smallest expression units.
+?atom: identifier
+     | "$" identifier
+     | literal
+     | "(" expression ")"
+     | list_display
+     | dict_display
+
+// List literal, or a single-`for` list comprehension with an optional filter.
+list_display: "[" argument_list? "]"
+            | "[" expression "for" identifier "in" expression if_clause? "]"
+
+// Filter part of a list comprehension.
+if_clause: "if" expression
+
+// Dictionary literal: { key: value, ... }
+dict_display: "{" key_datum_list? "}"
+
+key_datum_list: key_datum ("," key_datum)*
+key_datum: expression ":" expression
+
+// Comma-separated expressions; no trailing comma is accepted.
+argument_list: expression ("," expression)*
+
+// Numeric literal.
+number: FLOATNUMBER
+      | INTEGER
+
+// Any literal value.
+literal: stringliteral
+       | number
+       | boolean
+       | "None"
+
+boolean: "True" | "False"
+
+// INTEGER: decimal digits only.
+// FLOATNUMBER: `1.`, `1.5` or `.5`; a decimal point is required and there is
+// no exponent form.
+INTEGER: /[0-9]+/
+FLOATNUMBER: /(?:[0-9]+\.[0-9]*|\.[0-9]+)/
+
+// String literal in double or single quotes.
+stringliteral: STRING_DOUBLE
+             | STRING_SINGLE
+
+// The two commented-out terminals below are a stricter variant that only
+// allows the escapes \" \' \\ \n \t \r \0. The active terminals accept a
+// backslash followed by any single character (other than a newline).
+# STRING_DOUBLE: /"([^"\\]|\\["'\\ntr0])*"/
+# STRING_SINGLE: /'([^'\\]|\\["'\\ntr0])*'/
+STRING_DOUBLE: /"([^"\\]|\\.)*"/
+STRING_SINGLE: /'([^'\\]|\\.)*'/
+
+// Argument that may be either a name or a string literal.
+identifier_or_string: identifier
+                    | stringliteral
+
+// ASCII identifiers: a letter or underscore, then letters, digits, underscores.
+IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/
+
+// Spaces and tabs are insignificant; line breaks are not (see EOL).
+%ignore /[ \t]+/
--- a/research/code_indexing/chunk.py
+++ b/research/code_indexing/chunk.py
@ -0,0 +1,371 @@
+import json
+from copy import deepcopy
+from dataclasses import replace
+from pathlib import Path
+from typing import Any, Union
+
+from lark import Lark, Tree
+from chonkie import (
+    Chunk,
+    ElasticHandshake,
+    FileFetcher,
+    MarkdownChef,
+    TextChef,
+    TokenChunker,
+    MarkdownDocument
+)
+from elasticsearch import Elasticsearch
+from loguru import logger
+from transformers import AutoTokenizer
+
+from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
+from src.config import settings
+
+# Maps grammar rule names from BNF/avap.lark to the metadata field that
+# `_extract_command_metadata` sets to True when that rule occurs in a parsed
+# file. Rules that cover several commands (`json_list_cmd`, `crypto_cmd`,
+# `stamp_cmd`) map to a group name, since the rule name alone does not say
+# which of the commands was used.
+COMMAND_METADATA_NAMES = {
+    # system / io
+    "register_cmd": "registerEndpoint",
+    "addvar_cmd": "addVar",
+    "addparam_cmd": "addParam",
+    "getlistlen_cmd": "getListLen",
+    "getparamlist_cmd": "getQueryParamList",
+    "addresult_cmd": "addResult",
+
+    # async
+    "go_stmt": "go",
+    "gather_stmt": "gather",
+
+    # connector
+    "connector_instantiation": "avapConnector",
+
+    # http
+    "req_post_cmd": "RequestPost",
+    "req_get_cmd": "RequestGet",
+
+    # db
+    "orm_direct": "ormDirect",
+    "orm_check": "ormCheckTable",
+    "orm_create": "ormCreateTable",
+    "orm_select": "ormAccessSelect",
+    "orm_insert": "ormAccessInsert",
+    "orm_update": "ormAccessUpdate",
+
+    # util
+    "json_list_cmd": "json_list_ops",
+    "crypto_cmd": "crypto_ops",
+    "regex_cmd": "getRegex",
+    "datetime_cmd": "getDateTime",
+    "stamp_cmd": "timestamp_ops",
+    "string_cmd": "randomString",
+    "replace_cmd": "replace",
+
+    # modularity
+    "include_stmt": "include",
+    "import_stmt": "import",
+
+    # generic statements
+    "assignment": "assignment",
+    "call_stmt": "call",
+    "return_stmt": "return",
+    "if_stmt": "if",
+    "loop_stmt": "startLoop",
+    "try_stmt": "try",
+    "function_decl": "function",
+}
+
+
+def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
+    if ast is None:
+        return {}
+
+    used_commands: set[str] = set()
+
+    for subtree in ast.iter_subtrees():
+        if subtree.data in COMMAND_METADATA_NAMES:
+            used_commands.add(COMMAND_METADATA_NAMES[subtree.data])
+
+    return {command_name: True for command_name in sorted(used_commands)}
+
+
+def _get_text(element) -> str:
+    for attr in ("text", "content", "markdown"):
+        value = getattr(element, attr, None)
+        if isinstance(value, str):
+            return value
+    raise AttributeError(
+        f"Could not extract text from element of type {type(element).__name__}"
+    )
+
+
+def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
+    """Fold code blocks and tables into the text chunk that precedes them.
+
+    All chunks, code blocks and tables of ``processed_doc`` are ordered by
+    ``(start_index, end_index)``. Every text chunk opens a new merged chunk;
+    each code/table element that follows it (until the next text chunk) is
+    appended to that merged chunk.
+
+    Args:
+        processed_doc: Document whose ``chunks``, ``code`` and ``tables``
+            entries expose ``start_index`` / ``end_index``.
+
+    Returns:
+        A deep copy of ``processed_doc`` whose ``chunks`` are the merged
+        chunks. ``code`` and ``tables`` are re-pointed at the original
+        (un-copied) lists of ``processed_doc``.
+    """
+    elements = []
+
+    for chunk in processed_doc.chunks:
+        elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
+
+    for code in processed_doc.code:
+        elements.append(("code", code.start_index, code.end_index, code))
+
+    for table in processed_doc.tables:
+        elements.append(("table", table.start_index, table.end_index, table))
+
+    # Document order: by start offset, ties broken by end offset.
+    elements.sort(key=lambda item: (item[1], item[2]))
+
+    merged_chunks = []
+    # State of the merged chunk currently being built.
+    current_chunk = None
+    current_parts = []
+    current_end_index = None
+    current_token_count = None
+
+    def flush():
+        # Emit the merged chunk under construction (if any) and reset state.
+        nonlocal current_chunk, current_parts, current_end_index, current_token_count
+
+        if current_chunk is None:
+            return
+
+        # Parts are separated by a blank line; empty parts are skipped.
+        merged_text = "\n\n".join(part for part in current_parts if part)
+
+        # `replace` is dataclasses.replace: every other field of the base
+        # chunk, including start_index, is carried over unchanged.
+        merged_chunks.append(
+            replace(
+                current_chunk,
+                text=merged_text,
+                end_index=current_end_index,
+                token_count=current_token_count,
+            )
+        )
+
+        current_chunk = None
+        current_parts = []
+        current_end_index = None
+        current_token_count = None
+
+    for kind, _, _, element in elements:
+        if kind == "chunk":
+            # A text chunk closes the previous merged chunk and opens a new one.
+            flush()
+            current_chunk = element
+            current_parts = [_get_text(element)]
+            current_end_index = element.end_index
+            current_token_count = element.token_count
+            continue
+
+        # NOTE(review): code/table elements that sort before the first text
+        # chunk are dropped here - confirm this is intended.
+        if current_chunk is None:
+            continue
+
+        current_parts.append(_get_text(element))
+        current_end_index = max(current_end_index, element.end_index)
+        # Elements without a `token_count` attribute contribute 0, so the
+        # merged count can understate the real token count of the merged text.
+        current_token_count += getattr(element, "token_count", 0)
+
+    flush()
+
+    fused_processed_doc = deepcopy(processed_doc)
+    fused_processed_doc.chunks = merged_chunks
+    fused_processed_doc.code = processed_doc.code
+    fused_processed_doc.tables = processed_doc.tables
+
+    return fused_processed_doc
+
+
+class ElasticHandshakeWithMetadata(ElasticHandshake):
+    """Extended ElasticHandshake that preserves chunk metadata in Elasticsearch."""
+    
+    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
+        """Generate bulk actions including metadata."""
+        actions = []
+        embeddings = self.embedding_model.embed_batch([chunk["chunk"].text for chunk in chunks])
+        
+        for i, chunk in enumerate(chunks):
+            source = {
+                "text": chunk["chunk"].text,
+                "embedding": embeddings[i],
+                "start_index": chunk["chunk"].start_index,
+                "end_index": chunk["chunk"].end_index,
+                "token_count": chunk["chunk"].token_count,
+            }
+            
+            # Include metadata if it exists
+            if chunk.get("extra_metadata"):
+                source.update(chunk["extra_metadata"])
+            
+            actions.append({
+                "_index": self.index_name,
+                "_id": self._generate_id(i, chunk["chunk"]),
+                "_source": source,
+            })
+
+        return actions
+    
+    def write(self, chunks: Union[Chunk, list[Chunk]]) -> list[dict[str, Any]]:
+        """Write the chunks to the Elasticsearch index using the bulk API."""
+        if isinstance(chunks, Chunk):
+            chunks = [chunks]
+
+        actions = self._create_bulk_actions(chunks)
+
+        # Use the bulk helper to efficiently write the documents
+        from elasticsearch.helpers import bulk
+
+        success, errors = bulk(self.client, actions, raise_on_error=False)
+
+        if errors:
+            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
+            # Optionally log the first few errors for debugging
+            for i, error in enumerate(errors[:5]):  # type: ignore
+                logger.error(f"Error {i + 1}: {error}")
+
+        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")
+
+        return actions
+
+
+def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
+    """
+    Fetch files from a folder that match the specified extensions.
+
+    Args:
+        docs_folder_path (str): Path to the folder containing documents
+        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"])
+
+    Returns:
+        List of Paths to the fetched documents
+    """
+    fetcher = FileFetcher()
+    docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}", ext=docs_extension)
+    return docs_path
+
+
+def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
+    """
+    Process documents by applying appropriate chefs and chunking strategies based on file type.
+
+    Args:
+        docs_path: List of Paths to the documents to be processed.
+
+    Returns:
+        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
+    """
+    processed_docs = []
+    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
+
+    chef_md = MarkdownChef(tokenizer=custom_tokenizer)
+    chef_txt = TextChef()
+    chunker = TokenChunker(tokenizer=custom_tokenizer)
+
+    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
+        lark_parser = Lark(
+            grammar.read(),
+            parser="lalr",
+            propagate_positions=True,
+            start="program",
+        )
+
+    for doc_path in docs_path:
+        doc_extension = doc_path.suffix.lower()
+
+        if doc_extension == ".md":
+            processed_doc = chef_md.process(doc_path)
+            fused_doc = _merge_markdown_document(processed_doc)
+            chunked_doc = fused_doc.chunks
+            specific_metadata = {
+                "file_type": "avap_docs",
+                "filename": doc_path.name,
+            }
+
+        elif doc_extension == ".avap":
+            processed_doc = chef_txt.process(doc_path)
+
+            try:
+                ast = lark_parser.parse(processed_doc.content)
+            except Exception as e:
+                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
+                ast = None
+
+            chunked_doc = chunker.chunk(processed_doc.content)
+
+            specific_metadata = {
+                "file_type": "avap_code",
+                "filename": doc_path.name,
+                **_extract_command_metadata(ast),
+            }
+
+        else:
+            continue
+
+        for chunk in chunked_doc:
+            processed_docs.append(
+                {
+                    "chunk": chunk,
+                    "extra_metadata": {**specific_metadata},
+                }
+            )
+
+    return processed_docs
+
+
+def ingest_documents(
+    chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
+    es_index: str,
+    es_request_timeout: int,
+    es_max_retries: int,
+    es_retry_on_timeout: bool,
+    delete_es_index: bool,
+) -> list[dict[str, Any]]:
+    """
+    Ingest processed documents into an Elasticsearch index.
+
+    Args:
+        chunked_docs (list[dict[str, Any]]): List of dicts with "chunk" and "metadata" keys
+        es_index (str): Name of the Elasticsearch index to ingest into
+        es_request_timeout (int): Timeout for Elasticsearch requests in seconds
+        es_max_retries (int): Maximum number of retries for Elasticsearch requests
+        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
+        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion
+
+    Returns:
+        List of dicts with Elasticsearch response for each chunk
+    """
+    logger.info(
+        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
+    )
+    es = Elasticsearch(
+        hosts=settings.elasticsearch_local_url,
+        request_timeout=es_request_timeout,
+        max_retries=es_max_retries,
+        retry_on_timeout=es_retry_on_timeout,
+    )
+
+    if delete_es_index and es.indices.exists(index=es_index):
+        logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
+        es.indices.delete(index=es_index)
+
+    handshake = ElasticHandshakeWithMetadata(
+        client=es,
+        index_name=es_index,
+        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
+    )
+
+    logger.info(
+        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
+    )
+    elasticsearch_chunks = handshake.write(chunked_docs)
+
+    return elasticsearch_chunks
+
+
+def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
+    """
+    Export processed documents to JSON files in the specified output folder.
+
+    Args:
+        elasticsearch_chunks (list[dict[str, Any]]): List of dicts with Elasticsearch response for each chunk
+        output_path (str): Path to the file where the JSON will be saved
+    Returns:
+        None
+    """
+    output_path = settings.proj_root / output_path
+
+    for chunk in elasticsearch_chunks:
+        chunk["_source"]["embedding"] = chunk["_source"]["embedding"].tolist() # For JSON serialization
+
+    with output_path.open("w", encoding="utf-8") as f:
+        json.dump(elasticsearch_chunks, f, ensure_ascii=False, indent=4)
+
+    logger.info(f"Exported processed documents to {output_path}")
--- a/research/code_indexing/chunks/chunks_EBNF_metadata.json
+++ b/research/code_indexing/chunks/chunks_EBNF_metadata.json
--- a/research/code_indexing/chunks/chunks_file_level.json
+++ b/research/code_indexing/chunks/chunks_file_level.json
--- a/research/code_indexing/chunks/chunks_full_ebnf.json
+++ b/research/code_indexing/chunks/chunks_full_ebnf.json
--- a/research/code_indexing/chunks/chunks_grammar_level.jsonl
+++ b/research/code_indexing/chunks/chunks_grammar_level.jsonl
@ -0,0 +1,89 @@
+{"chunk_id": "5208d7435c0286ab", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "encodeSHA256", "section": "", "start_line": 1, "end_line": 1, "content": "encodeSHA256(\"payload_data\", checksum)", "metadata": {"uses_crypto": true, "uses_string_ops": true, "complexity": 2}, "token_estimate": 9}
+{"chunk_id": "e5e9b70428937778", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "encodeSHA256(\"payload_data\", checksum)\naddResult(checksum)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
+{"chunk_id": "49d6b31967a1db93", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "registerEndpoint", "section": "", "start_line": 1, "end_line": 1, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)", "metadata": {"registers_endpoint": true, "complexity": 1}, "token_estimate": 17}
+{"chunk_id": "e7ececd11823d42a", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)\naddVar(name,\"Alberto\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
+{"chunk_id": "f103d7719754088f", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(name,\"Alberto\")\nresult = \"Hello,\" + name", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
+{"chunk_id": "4b1ab59c1acb224c", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "result = \"Hello,\" + name\naddResult(result)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
+{"chunk_id": "682adaeeb528f778", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 1, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")", "metadata": {"complexity": 0}, "token_estimate": 12}
+{"chunk_id": "9bb665ca8d7590f7", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")\naddResult(mensaje)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
+{"chunk_id": "ed0136ad03a51e7e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"emails\", emails)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
+{"chunk_id": "899291ac8959ae3e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "getQueryParamList", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"emails\", emails)\ngetQueryParamList(\"lista_correos\", lista_correos)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
+{"chunk_id": "0eeff974dcd74729", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getQueryParamList(\"lista_correos\", lista_correos)\naddResult(lista_correos)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
+{"chunk_id": "b2e95857d059d99d", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"lang\", l)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
+{"chunk_id": "db2fab8dfbe7d460", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "addParam(\"lang\", l)\nif(l, \"es\", \"=\")\n    addVar(msg, \"Hola\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
+{"chunk_id": "2628fa886650658a", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "if(l, \"es\", \"=\")\n    addVar(msg, \"Hola\")\nend()\naddResult(msg)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
+{"chunk_id": "89bddd6830b6a8af", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre", "metadata": {"complexity": 0}, "token_estimate": 18}
+{"chunk_id": "6797d36c2eb0e38a", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre\naddResult(log)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
+{"chunk_id": "93008a3bed0ea808", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"password\",pass_nueva)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
+{"chunk_id": "142b2aef2f05fae7", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"password\",pass_nueva)\npass_antigua = \"password\"", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
+{"chunk_id": "b03b67f3aab35d7a", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "pass_antigua = \"password\"\nif(pass_nueva, pass_antigua, \"!=\")\n        addVar(cambio, \"Contraseña actualizada\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
+{"chunk_id": "99549cab6c8617d8", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(pass_nueva, pass_antigua, \"!=\")\n        addVar(cambio, \"Contraseña actualizada\")\nend()\naddResult(cambio)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 31}
+{"chunk_id": "123dfdacd4160b0d", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "replace", "section": "", "start_line": 1, "end_line": 1, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 17}
+{"chunk_id": "c65655393175720a", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\naddResult(ref_actualizada)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
+{"chunk_id": "3edbf12e560e22b1", "source_file": "docs/samples/manejo_error_sql_critico.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 7, "content": "try()\n    ormDirect(\"UPDATE table_inexistente SET a=1\", res)\nexception(e)\n    addVar(_status, 500)\n    addVar(error_msg, \"Error de base de datos\")\n    addResult(error_msg)\nend()", "metadata": {"uses_orm": true, "uses_auth": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 5}, "token_estimate": 51}
+{"chunk_id": "75bcc1f794c8527f", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"sal_par\",saldo)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
+{"chunk_id": "99462a4539651e84", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"sal_par\",saldo)\nif(saldo, 0, \">\")\n    permitir = True\nelse()\n    permitir = False\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
+{"chunk_id": "c9134748119a6401", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "else()\n    permitir = False\nend()\naddResult(permitir)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
+{"chunk_id": "da88ce6ec35e309a", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 19}
+{"chunk_id": "ef826cb80ab05a8c", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)\nif(None, None, \" user_type == 'VIP' or compras > 100\")\n    addVar(descuento, 0.20)\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 51}
+{"chunk_id": "117c5396b3e2f3bd", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(None, None, \" user_type == 'VIP' or compras > 100\")\n    addVar(descuento, 0.20)\nend()\naddResult(descuento)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 37}
+{"chunk_id": "559f8f61eda7ff75", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 27}
+{"chunk_id": "b40f10f126c22c01", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\naddResult(sql_date)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
+{"chunk_id": "717f75fe4eb08ecf", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "registros = ['1','2','3']", "metadata": {"complexity": 0}, "token_estimate": 10}
+{"chunk_id": "8a695ac320884362", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "registros = ['1','2','3']\ngetListLen(registros, total)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
+{"chunk_id": "9530c2cad477b991", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(registros, total)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
+{"chunk_id": "c4acc74c9b001703", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 4, "end_line": 6, "content": "contador = 0\nstartLoop(idx, 0, 2)\n    actual = registros[int(idx)]\nendLoop()", "metadata": {"uses_loop": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
+{"chunk_id": "80e935fcd6c7a232", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "startLoop(idx, 0, 2)\n    actual = registros[int(idx)]\nendLoop()\naddResult(actual)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
+{"chunk_id": "576b1bc85805eef0", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 86400, \"UTC\", expira)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 13}
+{"chunk_id": "686f254e071d6280", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 86400, \"UTC\", expira)\naddResult(expira)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
+{"chunk_id": "79fd8fee120921e7", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"client_id\", id_interno)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 10}
+{"chunk_id": "03697091447c57d4", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"client_id\", id_interno)\naddResult(id_interno)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
+{"chunk_id": "2c64510b9ac6042b", "source_file": "docs/samples/try_catch_request.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 6, "content": "try()\n    RequestGet(\"https://api.test.com/data\", 0, 0, respuesta, None)\nexception(e)\n    addVar(error_trace, e)\n    addResult(error_trace)\nend()", "metadata": {"uses_http": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 4}, "token_estimate": 42}
+{"chunk_id": "4d9f72fb03ba6d2b", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"api_key\", key)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
+{"chunk_id": "19fa0a3950612c1e", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"api_key\", key)\nif(key, None, \"==\")\n    addVar(_status, 403)\n    addVar(error, \"Acceso denegado: falta API KEY\")\n    addResult(error)\nend()", "metadata": {"uses_auth": true, "uses_conditional": true, "returns_result": true, "complexity": 3, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 47}
+{"chunk_id": "e06fe329097212dd", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"rol\", r)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
+{"chunk_id": "285aeb7e911a5075", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"rol\", r)\nacceso = False", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
+{"chunk_id": "f8ed75075b7b1b13", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 4, "end_line": 6, "content": "acceso = False\nif(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n    acceso = True\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
+{"chunk_id": "b323dedebcbd9036", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "if(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n    acceso = True\nend()\naddResult(acceso)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
+{"chunk_id": "d02cc7019c314251", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "datos_cliente = \"datos\"", "metadata": {"complexity": 0}, "token_estimate": 6}
+{"chunk_id": "c1528242fcd85a68", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "datos_cliente = \"datos\"\naddVar(clave, \"cliente_vip\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
+{"chunk_id": "d335da8caf95ac8d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVariableToJSON", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(clave, \"cliente_vip\")\nAddvariableToJSON(clave, datos_cliente, mi_json_final)", "metadata": {"uses_json": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
+{"chunk_id": "27067ebe43e3b05d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "AddvariableToJSON(clave, datos_cliente, mi_json_final)\naddResult(mi_json_final)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 20}
+{"chunk_id": "a25dfc3b319135d3", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"data_list\", mi_lista)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
+{"chunk_id": "d96fd663666733fe", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"data_list\", mi_lista)\ngetListLen(mi_lista, cantidad)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
+{"chunk_id": "9905db6de1ea3067", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(mi_lista, cantidad)\naddResult(cantidad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
+{"chunk_id": "7c239ad53392d63d", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "stampToDatetime", "section": "", "start_line": 1, "end_line": 1, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 22}
+{"chunk_id": "c4dc5d3c081101a5", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\naddResult(fecha_human)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 28}
+{"chunk_id": "2905488dffcbd7ba", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(base, 1000)\naddVar(copia, $base)", "metadata": {"complexity": 0}, "token_estimate": 16}
+{"chunk_id": "82e05ef62a72de87", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(base, 1000)\naddVar(copia, $base)\naddResult(copia)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
+{"chunk_id": "a6727546f328e768", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(code, 200)\naddVar(status, \"Success\")", "metadata": {"complexity": 0}, "token_estimate": 14}
+{"chunk_id": "ce12abd61c278bec", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 4, "content": "addVar(code, 200)\naddVar(status, \"Success\")\naddResult(code)\naddResult(status)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
+{"chunk_id": "45b0086b13784a7d", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "encontrado = False", "metadata": {"complexity": 0}, "token_estimate": 5}
+{"chunk_id": "c6df33b0e7eac0ff", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 2, "end_line": 7, "content": "encontrado = False\nstartLoop(i, 1, 10)\n    if(i, 5, \"==\")\n        encontrado = True\n        i = 11 \n    end()\nendLoop()", "metadata": {"uses_loop": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 42}
+{"chunk_id": "02edc488f13b7367", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "i = 11 \n    end()\nendLoop()\naddResult(encontrado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
+{"chunk_id": "c8dbbbf6cb64c10d", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 4, "content": "function suma(a, b){\n        total = a + b\n        return(total)\n    }", "metadata": {"complexity": 0}, "token_estimate": 19}
+{"chunk_id": "1065800a57207e04", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function suma(a, b)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 4}, "token_estimate": 6}
+{"chunk_id": "1ef5fa8a4a980012", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 5, "end_line": 5, "content": "// contexto: function suma(a, b)\nresultado = suma(10, 20)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
+{"chunk_id": "ff7df988add5bbef", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "// contexto: function suma(a, b)\naddResult(resultado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 13}
+{"chunk_id": "b8682e4f71d9d7c3", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 7, "content": "function es_valido(token){\n    response = False\n    if(token, \"SECRET\", \"=\")\n        response = True\n    end()\n    return(response)\n    }", "metadata": {"uses_conditional": true, "complexity": 1}, "token_estimate": 34}
+{"chunk_id": "a1cfc36abdf661a0", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function es_valido(token)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 7}, "token_estimate": 6}
+{"chunk_id": "66706bf4b7d3aede", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 8, "end_line": 8, "content": "// contexto: function es_valido(token)\nautorizado = es_valido(\"SECRET\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
+{"chunk_id": "5932e6b75c40b7db", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 9, "end_line": 9, "content": "// contexto: function es_valido(token)\naddResult(autorizado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 15}
+{"chunk_id": "4be60a16d7cc7c4d", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "randomString", "section": "", "start_line": 1, "end_line": 1, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 15}
+{"chunk_id": "1810ca839b071a65", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)\naddResult(token_seguridad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
+{"chunk_id": "ed8b4a4e75a71762", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 0, \"UTC\", ahora)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 11}
+{"chunk_id": "05d2d0c8e6266861", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 0, \"UTC\", ahora)\naddResult(ahora)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
+{"chunk_id": "02d7b0e4a1e1f09c", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "orm_command", "section": "", "start_line": 1, "end_line": 1, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)", "metadata": {"uses_orm": true, "complexity": 1}, "token_estimate": 13}
+{"chunk_id": "6daea421c5a1d565", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)\nif(resultado_comprobacion,False,'==')\n    ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()", "metadata": {"uses_orm": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
+{"chunk_id": "47d660e6c1f124d1", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 6, "content": "if(resultado_comprobacion,False,'==')\n    ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()\naddResult(resultado_comprobacion)\naddResult(resultado_creacion)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
+{"chunk_id": "b15daff2028a2136", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"page\", p)\naddParam(\"size\", s)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 14}
+{"chunk_id": "8f1fa0e84c981765", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 6, "content": "addParam(\"page\", p)\naddParam(\"size\", s)\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\noffset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 62}
+{"chunk_id": "e27ce4178666239b", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 8, "content": "offset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0\naddResult(offset)\naddResult(limite)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
+{"chunk_id": "9a66c0e4c49bbbcb", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 9, "end_line": 13, "content": "addResult(offset)\naddResult(limite)\nstartLoop(i, 2, limite)\n    actual = registros[int(i)]\n    titulo = \"reg_%s\" % i\n    AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 53}
+{"chunk_id": "77c985068f6f9269", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 14, "end_line": 14, "content": "titulo = \"reg_%s\" % i\n    AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()\naddResult(pagina_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
+{"chunk_id": "aeb4f87681bdc8b4", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nivel = 5\nes_admin = nivel >= 10", "metadata": {"complexity": 0}, "token_estimate": 12}
+{"chunk_id": "5f0f938196d5e573", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nivel = 5\nes_admin = nivel >= 10\naddResult(es_admin)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
+{"chunk_id": "42fb50109876864c", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 3, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva", "metadata": {"complexity": 0}, "token_estimate": 22}
+{"chunk_id": "6019c2adc7750c04", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva\naddResult(total)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 27}
+{"chunk_id": "e2f6a0de7e7f9dc1", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 1, "end_line": 4, "content": "startLoop(i,1,10)\n        item = \"item_%s\" % i\n        AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2}, "token_estimate": 36}
+{"chunk_id": "ce1f2fab7c807537", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "item = \"item_%s\" % i\n        AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()\naddResult(mi_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
--- a/research/code_indexing/elasticsearch_ingestion.py
+++ b/research/code_indexing/elasticsearch_ingestion.py
@ -0,0 +1,64 @@
+import typer
+
+from loguru import logger
+
+from scripts.pipelines.tasks.chunk import (
+    fetch_documents, 
+    process_documents, 
+    export_documents,
+    ingest_documents
+)
+
+app = typer.Typer()
+
+
+@app.command()
+def elasticsearch_ingestion(
+    docs_folder_path: str = "docs/samples",
+    output_path: str = "research/code_indexing/chunks/chunks_EBNF_metadata.json",
+    docs_extension: list[str] = [".avap"],
+    es_index: str = "avap-code-indexing-ebnf-metadata",
+    es_request_timeout: int = 120,
+    es_max_retries: int = 5,
+    es_retry_on_timeout: bool = True,
+    delete_es_index: bool = True
+) -> None:  
+    """
+    Pipeline to ingest documents into an Elasticsearch index and export the result.
+
+    Steps, in order: fetch the documents from a folder, process them into chunks,
+    ingest the chunks into the Elasticsearch index, and finally export whatever
+    the ingestion step returned to ``output_path``.
+
+    Args:
+        docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
+        output_path (str): Destination passed to ``export_documents`` for the ingested documents.
+            Default is "research/code_indexing/chunks/chunks_EBNF_metadata.json".
+        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".avap"].
+        es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-code-indexing-ebnf-metadata".
+        es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
+        es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
+        es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
+        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.
+            NOTE(review): with the default, each run presumably replaces the index contents; confirm in ``ingest_documents``.
+
+    Returns:
+        None
+    """
+    logger.info("Starting Elasticsearch ingestion pipeline...")
+    logger.info(f"Fetching files from {docs_folder_path}...")
+    docs_path = fetch_documents(docs_folder_path, docs_extension)
+
+    logger.info("Processing docs...")
+    chunked_docs = process_documents(docs_path)
+
+    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
+    # Arguments are passed positionally, so their order must match ingest_documents' signature.
+    elasticsearch_docs = ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries, 
+                     es_retry_on_timeout, delete_es_index)
+    
+    # The export uses the value returned by ingest_documents, not chunked_docs.
+    logger.info(f"Exporting processed documents to {output_path}...")
+    export_documents(elasticsearch_docs, output_path)
+
+    logger.info(f"Finished ingesting in {es_index}.")
+
+
+if __name__ == "__main__":
+    try:
+        app()
+    except Exception as exc:
+        logger.exception(exc)
+        raise
--- a/research/code_indexing/generate_avap_code_qa_golden_dataset.ipynb
+++ b/research/code_indexing/generate_avap_code_qa_golden_dataset.ipynb
@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d520f6c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "import boto3\n",
+    "from botocore.config import Config\n",
+    "from langchain_core.messages import SystemMessage, HumanMessage\n",
+    "\n",
+    "from src.utils.llm_factory import create_chat_model\n",
+    "from src.config import settings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e08b9060",
+   "metadata": {},
+   "source": [
+    "### Create LLM instance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "81111a86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = Config(\n",
+    "    region_name=\"us-east-1\",\n",
+    "    connect_timeout=10,     \n",
+    "    read_timeout=600,        \n",
+    ")\n",
+    "\n",
+    "client = boto3.client(\"bedrock-runtime\", config=config)\n",
+    "\n",
+    "llm = create_chat_model(\n",
+    "    provider=\"bedrock\",\n",
+    "    client=client,\n",
+    "    model=\"global.anthropic.claude-sonnet-4-6\",\n",
+    "    temperature=0,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "045f8e81",
+   "metadata": {},
+   "source": [
+    "### Load AVAP data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "07dea3fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(settings.proj_root / \"docs/LRM/avap.md\", \"r\") as f:\n",
+    "    avap_docs = f.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "adbbe8b6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 33 AVAP samples\n"
+     ]
+    }
+   ],
+   "source": [
+    "samples_dir = settings.proj_root / \"docs/samples\"\n",
+    "avap_samples = []\n",
+    "\n",
+    "for avap_file in sorted(samples_dir.glob(\"*.avap\")):\n",
+    "    with open(avap_file, \"r\") as f:\n",
+    "        code = f.read()\n",
+    "    \n",
+    "    avap_samples.append({\n",
+    "        \"file\": avap_file.name,\n",
+    "        \"code\": code\n",
+    "    })\n",
+    "\n",
+    "# Serialize the samples to a JSON string for embedding in the prompt\n",
+    "avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)\n",
+    "print(f\"Loaded {len(avap_samples)} AVAP samples\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7a15e09a",
+   "metadata": {},
+   "source": [
+    "### Prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "895a170f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GOLDEN_DATASET_PROMPT = SystemMessage(\n",
+    "    content=f\"\"\"\n",
+    "    You are an AI agent responsible for generating a golden dataset of queries for AVAP code retrieval and understanding.\n",
+    "\n",
+    "    You will receive a JSON array of AVAP code samples, each with a 'file' name and 'code' content.\n",
+    "\n",
+    "    Your task is to:\n",
+    "    1. Analyze each AVAP code sample.\n",
+    "    2. Generate 2-3 natural language queries that can be answered by examining that specific code.\n",
+    "    3. Output a JSON array where each element has:\n",
+    "       - \"query\": A natural language question about AVAP code implementation, best practices, or specific constructs.\n",
+    "       - \"context\": The filename of the code sample that provides the context/answer for this query.\n",
+    "\n",
+    "    Requirements:\n",
+    "    - Queries should be diverse: ask about functions, control flow, API operations, error handling, etc.\n",
+    "    - Queries must be answerable using ONLY the provided code samples.\n",
+    "    - Queries should be framed as natural developer questions (e.g., \"How do you handle errors in AVAP?\" or \"Show me an example of looping over a list\").\n",
+    "    - Use natural English (or Spanish if context is Spanish-language code).\n",
+    "    - Do not reference exact variable names unless necessary; focus on the patterns and constructs used.\n",
+    "    - Output MUST be valid JSON array format.\n",
+    "\n",
+    "    AVAP Code Samples:\n",
+    "    {avap_samples_json}\n",
+    "\n",
+    "    Output format (JSON array):\n",
+    "    [\n",
+    "      {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
+    "      {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
+    "      ...\n",
+    "    ]\n",
+    "    \"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3123199",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98c4f93c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "723352ee",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "assistance-engine",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}