created code_indexing_improvements research

This commit is contained in:
acano 2026-03-25 10:37:53 +01:00
parent fe90548b8b
commit da483c51bb
8 changed files with 103823 additions and 0 deletions

View File

@ -0,0 +1,228 @@
# Entry point of the grammar: parsing starts at `program`.
start: program
# A program is one or more statements/comments. Blank lines may surround them,
# and consecutive items must be split by at least one separator.
program: separator* line_or_comment (separator+ line_or_comment)* separator*
# One item of a program/block: a simple statement with an optional trailing
# comment, a compound statement, a standalone comment, or a block comment.
?line_or_comment: simple_stmt comment?
| compound_stmt
| comment
| BLOCK_COMMENT
# A separator is one or more line breaks.
?separator: EOL+
comment: DOC_COMMENT | LINE_COMMENT
EOL: /\r?\n/
# DOC_COMMENT ("///") is declared with terminal priority 2 and LINE_COMMENT ("//")
# with priority 1, so text starting with "///" is matched as a doc comment.
DOC_COMMENT.2: /\/\/\/[^\r\n]*/
LINE_COMMENT.1: /\/\/[^\r\n]*/
# Non-greedy match, so a block comment ends at the first "*/".
BLOCK_COMMENT: /\/\*[\s\S]*?\*\//
# Statements that may be followed by a trailing comment (see `line_or_comment`).
?simple_stmt: assignment
| return_stmt
| system_command
| io_command
| async_command
| connector_cmd
| db_command
| http_command
| util_command
| modularity_cmd
| call_stmt
# Statements that contain nested blocks.
?compound_stmt: function_decl
| if_stmt
| loop_stmt
| try_stmt
assignment: identifier "=" expression
# Three call shapes: `f(args)`, `x = obj.method(args)` and `obj.method(args)`.
# NOTE(review): the second shape overlaps with `assignment` when the right-hand
# side is a method call -- confirm the parser picks the intended rule.
call_stmt: identifier "(" argument_list? ")"
| identifier "=" identifier "." identifier "(" argument_list? ")"
| identifier "." identifier "(" argument_list? ")"
system_command: register_cmd
| addvar_cmd
# registerEndpoint takes exactly six arguments: three string literals (1st, 2nd, 4th),
# a list display (3rd) and two identifiers (5th, 6th).
register_cmd: "registerEndpoint" "(" stringliteral "," stringliteral "," list_display "," stringliteral "," identifier "," identifier ")"
addvar_cmd: "addVar" "(" addvar_arg "," addvar_arg ")"
# Either addVar argument may be an identifier, a literal, or a "$"-prefixed identifier.
addvar_arg: identifier
| literal
| "$" identifier
identifier: IDENTIFIER
# NOTE(review): `system_variable` is not referenced by any rule visible in this grammar.
system_variable: "_status"
# Input/output commands.
io_command: addparam_cmd
| getlistlen_cmd
| addresult_cmd
| getparamlist_cmd
addparam_cmd: "addParam" "(" stringliteral "," identifier ")"
getlistlen_cmd: "getListLen" "(" identifier "," identifier ")"
getparamlist_cmd: "getQueryParamList" "(" stringliteral "," identifier ")"
addresult_cmd: "addResult" "(" identifier ")"
# if(<condition>) ... [else() ...] end(): both the else branch marker and the
# terminator are written as calls with empty parentheses.
if_stmt: "if" "(" if_condition ")" separator block ("else" "(" ")" separator block)? "end" "(" ")"
# The condition is always three comma-separated items ending in a string literal;
# the first two are either atoms or both the keyword None.
if_condition: if_atom "," if_atom "," stringliteral
| "None" "," "None" "," stringliteral
if_atom: identifier
| literal
# startLoop(<identifier>, <expression>, <expression>) ... endLoop()
loop_stmt: "startLoop" "(" identifier "," expression "," expression ")" separator block "endLoop" "(" ")"
# try() ... exception(<identifier>) ... end()
try_stmt: "try" "(" ")" separator block "exception" "(" identifier ")" separator block "end" "(" ")"
# A block has the same shape as `program`: at least one statement or comment.
block: separator* line_or_comment (separator+ line_or_comment)* separator*
# Asynchronous commands; both forms assign their result to an identifier.
async_command: go_stmt
| gather_stmt
go_stmt: identifier "=" "go" identifier "(" argument_list? ")"
gather_stmt: identifier "=" "gather" "(" identifier ("," expression)? ")"
# Connector instantiation: `<identifier> = avapConnector("<string>")`.
connector_cmd: connector_instantiation
connector_instantiation: identifier "=" "avapConnector" "(" stringliteral ")"
# HTTP request commands. RequestPost takes six arguments and RequestGet five;
# in both, the second-to-last argument must be an identifier.
http_command: req_post_cmd
| req_get_cmd
req_post_cmd: "RequestPost" "(" expression "," expression "," expression "," expression "," identifier "," expression ")"
req_get_cmd: "RequestGet" "(" expression "," expression "," expression "," identifier "," expression ")"
# Database (ORM) commands. Every form ends with an identifier argument.
db_command: orm_direct
| orm_check
| orm_create
| orm_select
| orm_insert
| orm_update
orm_direct: "ormDirect" "(" expression "," identifier ")"
orm_check: "ormCheckTable" "(" expression "," identifier ")"
orm_create: "ormCreateTable" "(" expression "," expression "," expression "," identifier ")"
# ormAccessSelect accepts either three or four arguments (the third expression is optional).
orm_select: "ormAccessSelect" "(" orm_fields "," expression ("," expression)? "," identifier ")"
# The field selector is either a bare `*` or an expression.
orm_fields: "*"
| expression
orm_insert: "ormAccessInsert" "(" expression "," expression "," identifier ")"
orm_update: "ormAccessUpdate" "(" expression "," expression "," expression "," expression "," identifier ")"
# Utility commands. Several rules below group more than one keyword under a
# single rule name, so the parse tree node alone does not identify the keyword.
util_command: json_list_cmd
| crypto_cmd
| regex_cmd
| datetime_cmd
| stamp_cmd
| string_cmd
| replace_cmd
json_list_cmd: "variableToList" "(" expression "," identifier ")"
| "itemFromList" "(" identifier "," expression "," identifier ")"
| "variableFromJSON" "(" identifier "," expression "," identifier ")"
| "AddVariableToJSON" "(" expression "," expression "," identifier ")"
crypto_cmd: "encodeSHA256" "(" identifier_or_string "," identifier ")"
| "encodeMD5" "(" identifier_or_string "," identifier ")"
regex_cmd: "getRegex" "(" identifier "," stringliteral "," identifier ")"
datetime_cmd: "getDateTime" "(" stringliteral "," expression "," stringliteral "," identifier ")"
stamp_cmd: "stampToDatetime" "(" expression "," stringliteral "," expression "," identifier ")"
| "getTimeStamp" "(" stringliteral "," stringliteral "," expression "," identifier ")"
string_cmd: "randomString" "(" expression "," expression "," identifier ")"
replace_cmd: "replace" "(" identifier_or_string "," stringliteral "," stringliteral "," identifier ")"
# Function declarations use braces around the body, unlike if/loop/try which
# use keyword terminators. A separator is required after the opening brace.
function_decl: "function" identifier "(" param_list? ")" "{" separator block "}"
param_list: identifier ("," identifier)*
# return(...) with an optional expression.
return_stmt: "return" "(" expression? ")"
# Modularity: `include "<path>"` and `import <name>` / `import "<path>"`.
modularity_cmd: include_stmt
| import_stmt
include_stmt: "include" stringliteral
import_stmt: "import" ("<" identifier ">" | stringliteral)
# Expression grammar, written as a precedence chain from loosest binding (`or`)
# to tightest (postfix operations on an atom). The leading `?` on each rule
# inlines single-child nodes so the tree has no empty wrapper levels.
?expression: logical_or
?logical_or: logical_and ("or" logical_and)*
?logical_and: logical_not ("and" logical_not)*
?logical_not: "not" logical_not
| comparison
?comparison: arithmetic (comp_op arithmetic)*
comp_op: "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"
?arithmetic: term (("+" | "-") term)*
?term: factor (("*" | "/" | "%") factor)*
# Unary plus/minus.
?factor: ("+" | "-") factor
| power
# The exponent is a `factor`, so it may itself carry a unary sign or another `**`.
?power: primary ("**" factor)?
?primary: atom postfix*
# Postfix operations: attribute access, indexing, slicing (with optional step), and call.
postfix: "." identifier
| "[" expression "]"
| "[" expression? ":" expression? (":" expression?)? "]"
| "(" argument_list? ")"
?atom: identifier
| "$" identifier
| literal
| "(" expression ")"
| list_display
| dict_display
# A list is either an explicit element list or a single-`for` comprehension
# with an optional `if` filter.
list_display: "[" argument_list? "]"
| "[" expression "for" identifier "in" expression if_clause? "]"
if_clause: "if" expression
dict_display: "{" key_datum_list? "}"
key_datum_list: key_datum ("," key_datum)*
key_datum: expression ":" expression
# Comma-separated expressions; no trailing comma is accepted.
argument_list: expression ("," expression)*
# Literal values: strings, numbers, booleans and None.
number: FLOATNUMBER
| INTEGER
literal: stringliteral
| number
| boolean
| "None"
boolean: "True" | "False"
INTEGER: /[0-9]+/
# A float requires a decimal point: "1.", "1.5" and ".5" match; exponents are not supported.
FLOATNUMBER: /(?:[0-9]+\.[0-9]*|\.[0-9]+)/
stringliteral: STRING_DOUBLE
| STRING_SINGLE
# Earlier, stricter string terminals that only allowed a fixed set of escape
# characters; kept commented out for reference.
# STRING_DOUBLE: /"([^"\\]|\\["'\\ntr0])*"/
# STRING_SINGLE: /'([^'\\]|\\["'\\ntr0])*'/
# Active string terminals: a backslash may escape any single character.
STRING_DOUBLE: /"([^"\\]|\\.)*"/
STRING_SINGLE: /'([^'\\]|\\.)*'/
identifier_or_string: identifier
| stringliteral
IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/
# Spaces and tabs are skipped between tokens; line breaks are significant (EOL).
%ignore /[ \t]+/

View File

@ -0,0 +1,371 @@
import json
from copy import deepcopy
from dataclasses import replace
from pathlib import Path
from typing import Any, Union
from lark import Lark, Tree
from chonkie import (
Chunk,
ElasticHandshake,
FileFetcher,
MarkdownChef,
TextChef,
TokenChunker,
MarkdownDocument
)
from elasticsearch import Elasticsearch
from loguru import logger
from transformers import AutoTokenizer
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
from src.config import settings
# Lookup table from grammar rule name (the ``data`` of a parse-tree node) to the
# metadata key that is set on a document when that rule occurs in its AST.
# Rules that cover several keywords map to a grouped name (e.g. "crypto_ops").
COMMAND_METADATA_NAMES = {
    # --- system ---
    "register_cmd": "registerEndpoint",
    "addvar_cmd": "addVar",
    "addparam_cmd": "addParam",
    "getlistlen_cmd": "getListLen",
    "getparamlist_cmd": "getQueryParamList",
    "addresult_cmd": "addResult",
    # --- async ---
    "go_stmt": "go",
    "gather_stmt": "gather",
    # --- connector ---
    "connector_instantiation": "avapConnector",
    # --- http ---
    "req_post_cmd": "RequestPost",
    "req_get_cmd": "RequestGet",
    # --- db ---
    "orm_direct": "ormDirect",
    "orm_check": "ormCheckTable",
    "orm_create": "ormCreateTable",
    "orm_select": "ormAccessSelect",
    "orm_insert": "ormAccessInsert",
    "orm_update": "ormAccessUpdate",
    # --- util ---
    "json_list_cmd": "json_list_ops",
    "crypto_cmd": "crypto_ops",
    "regex_cmd": "getRegex",
    "datetime_cmd": "getDateTime",
    "stamp_cmd": "timestamp_ops",
    "string_cmd": "randomString",
    "replace_cmd": "replace",
    # --- modularity ---
    "include_stmt": "include",
    "import_stmt": "import",
    # --- generic statements ---
    "assignment": "assignment",
    "call_stmt": "call",
    "return_stmt": "return",
    "if_stmt": "if",
    "loop_stmt": "startLoop",
    "try_stmt": "try",
    "function_decl": "function",
}
def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
    """Collect the commands that appear in a parse tree as boolean flags.

    Args:
        ast: Parse tree to inspect, or ``None`` when no tree is available.

    Returns:
        A dict whose keys are the ``COMMAND_METADATA_NAMES`` values of every
        subtree rule found in ``ast`` (in sorted order), each mapped to
        ``True``. Returns an empty dict when ``ast`` is ``None``.
    """
    if ast is None:
        return {}

    found = {
        COMMAND_METADATA_NAMES[node.data]
        for node in ast.iter_subtrees()
        if node.data in COMMAND_METADATA_NAMES
    }
    return dict.fromkeys(sorted(found), True)
def _get_text(element) -> str:
for attr in ("text", "content", "markdown"):
value = getattr(element, attr, None)
if isinstance(value, str):
return value
raise AttributeError(
f"Could not extract text from element of type {type(element).__name__}"
)
def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
    """Fold code blocks and tables back into the text chunk that precedes them.

    All chunks, code blocks and tables are ordered by ``(start_index,
    end_index)``. Each text chunk opens a new group; every code/table element
    that follows it (until the next chunk) is appended to that group. Elements
    that come before the first chunk are skipped.

    Args:
        processed_doc: Document whose ``chunks``, ``code`` and ``tables`` are merged.

    Returns:
        A deep copy of ``processed_doc`` whose ``chunks`` are the merged
        chunks. ``code`` and ``tables`` reference the original lists.
    """
    tagged = (
        [("chunk", item) for item in processed_doc.chunks]
        + [("code", item) for item in processed_doc.code]
        + [("table", item) for item in processed_doc.tables]
    )
    # Stable sort: elements with identical spans keep chunk -> code -> table order.
    tagged.sort(key=lambda pair: (pair[1].start_index, pair[1].end_index))

    # One entry per text chunk, accumulating everything attached to it.
    groups: list[dict[str, Any]] = []
    for kind, element in tagged:
        if kind == "chunk":
            groups.append(
                {
                    "base": element,
                    "parts": [_get_text(element)],
                    "end": element.end_index,
                    "tokens": element.token_count,
                }
            )
        elif groups:
            group = groups[-1]
            group["parts"].append(_get_text(element))
            group["end"] = max(group["end"], element.end_index)
            # Elements without a token_count contribute nothing to the total.
            group["tokens"] += getattr(element, "token_count", 0)
        # else: no chunk seen yet, so the element has nothing to attach to.

    merged_chunks = [
        replace(
            group["base"],
            # Empty parts are dropped so they do not produce blank separators.
            text="\n\n".join(piece for piece in group["parts"] if piece),
            end_index=group["end"],
            token_count=group["tokens"],
        )
        for group in groups
    ]

    result = deepcopy(processed_doc)
    result.chunks = merged_chunks
    result.code = processed_doc.code
    result.tables = processed_doc.tables
    return result
class ElasticHandshakeWithMetadata(ElasticHandshake):
    """Extended ElasticHandshake that preserves chunk metadata in Elasticsearch.

    Items handled by this class are *records*: dicts with a ``"chunk"`` key
    holding the ``Chunk`` and an optional ``"extra_metadata"`` dict whose
    entries are copied into the indexed document.
    """

    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
        """Generate bulk actions including metadata.

        Args:
            chunks: Records of the form ``{"chunk": Chunk, "extra_metadata": dict}``.

        Returns:
            One bulk action per record, each with ``_index``, ``_id`` and
            ``_source``. An empty input yields an empty list without calling
            the embedding model.
        """
        if not chunks:
            return []

        embeddings = self.embedding_model.embed_batch(
            [record["chunk"].text for record in chunks]
        )

        actions = []
        for position, record in enumerate(chunks):
            chunk = record["chunk"]
            source = {
                "text": chunk.text,
                "embedding": embeddings[position],
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
                "token_count": chunk.token_count,
            }
            # Metadata entries are merged last, so they are stored at the top
            # level of the document next to the core fields.
            extra_metadata = record.get("extra_metadata")
            if extra_metadata:
                source.update(extra_metadata)
            actions.append(
                {
                    "_index": self.index_name,
                    "_id": self._generate_id(position, chunk),
                    "_source": source,
                }
            )
        return actions

    def write(
        self,
        chunks: Union[Chunk, dict[str, Any], list[Union[Chunk, dict[str, Any]]]],
    ) -> list[dict[str, Any]]:
        """Write the chunks to the Elasticsearch index using the bulk API.

        Args:
            chunks: A single ``Chunk``, a single record dict, or a list mixing
                both. Bare ``Chunk`` objects are wrapped into a record with
                empty metadata, so they can be indexed like any other record.

        Returns:
            The bulk actions that were submitted (not the Elasticsearch
            responses). Returns an empty list when there is nothing to write.
        """
        if isinstance(chunks, (Chunk, dict)):
            chunks = [chunks]

        # Normalise every item to the record shape _create_bulk_actions expects.
        records = [
            item if isinstance(item, dict) else {"chunk": item, "extra_metadata": {}}
            for item in chunks
        ]
        if not records:
            logger.info(f"No chunks to write to Elasticsearch index: {self.index_name}")
            return []

        actions = self._create_bulk_actions(records)

        # Use the bulk helper to efficiently write the documents
        from elasticsearch.helpers import bulk

        # raise_on_error=False: failures are collected and reported below
        # instead of aborting the whole batch.
        success, errors = bulk(self.client, actions, raise_on_error=False)
        if errors:
            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
            # Log only the first few errors to keep the output readable.
            for i, error in enumerate(errors[:5]):  # type: ignore
                logger.error(f"Error {i + 1}: {error}")
        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")
        return actions
def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
    """List the files under a project folder that have one of the given extensions.

    Args:
        docs_folder_path (str): Folder to scan, relative to ``settings.proj_root``.
        docs_extension (list[str]): Extensions to keep (e.g., [".md", ".avap"]).

    Returns:
        The paths returned by the file fetcher for that folder and extensions.
    """
    source_dir = f"{settings.proj_root}/{docs_folder_path}"
    return FileFetcher().fetch(dir=source_dir, ext=docs_extension)
def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
    """Turn documents into chunk records, choosing the pipeline by file extension.

    ``.md`` files go through the markdown chef and have their code/table
    elements merged into the text chunks. ``.avap`` files are token-chunked
    and additionally parsed with the AVAP grammar to record which commands
    they use. Files with any other extension are ignored.

    Args:
        docs_path: Paths of the documents to process.

    Returns:
        One dict per chunk with "chunk" (the Chunk object) and
        "extra_metadata" (file type, filename and, for AVAP code, command flags).
    """
    tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
    markdown_chef = MarkdownChef(tokenizer=tokenizer)
    text_chef = TextChef()
    token_chunker = TokenChunker(tokenizer=tokenizer)

    grammar_path = settings.proj_root / "research/code_indexing/BNF/avap.lark"
    with open(grammar_path, encoding="utf-8") as grammar_file:
        grammar_source = grammar_file.read()
    avap_parser = Lark(
        grammar_source,
        parser="lalr",
        propagate_positions=True,
        start="program",
    )

    records: list[dict[str, Any]] = []
    for path in docs_path:
        suffix = path.suffix.lower()
        if suffix not in (".md", ".avap"):
            continue

        if suffix == ".md":
            merged_doc = _merge_markdown_document(markdown_chef.process(path))
            chunks = merged_doc.chunks
            file_metadata = {
                "file_type": "avap_docs",
                "filename": path.name,
            }
        else:
            document = text_chef.process(path)
            # A file that fails to parse is still chunked and indexed; it just
            # carries no command flags.
            try:
                tree = avap_parser.parse(document.content)
            except Exception as e:
                logger.error(f"Error parsing AVAP code in {path.name}: {e}")
                tree = None
            chunks = token_chunker.chunk(document.content)
            file_metadata = {
                "file_type": "avap_code",
                "filename": path.name,
                **_extract_command_metadata(tree),
            }

        # Each record gets its own copy of the metadata dict.
        records.extend(
            {"chunk": chunk, "extra_metadata": {**file_metadata}} for chunk in chunks
        )
    return records
def ingest_documents(
    chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> list[dict[str, Any]]:
    """
    Ingest processed documents into an Elasticsearch index.

    Args:
        chunked_docs (list[dict[str, Any]]): List of dicts with "chunk" and "extra_metadata" keys
        es_index (str): Name of the Elasticsearch index to ingest into
        es_request_timeout (int): Timeout for Elasticsearch requests in seconds
        es_max_retries (int): Maximum number of retries for Elasticsearch requests
        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion

    Returns:
        List of the bulk actions submitted to Elasticsearch, one per chunk
        (each with "_index", "_id" and "_source").
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )

    # The client is owned by this function, so make sure its connections are
    # released even if deletion or indexing fails.
    try:
        if delete_es_index and es.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            es.indices.delete(index=es_index)

        handshake = ElasticHandshakeWithMetadata(
            client=es,
            index_name=es_index,
            embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
        )

        logger.info(
            f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
        )
        return handshake.write(chunked_docs)
    finally:
        es.close()
def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
    """
    Export the indexed chunks to a single JSON file.

    The input list is left untouched: embeddings are converted to plain lists
    on copies of each action so the caller can keep using the originals.

    Args:
        elasticsearch_chunks (list[dict[str, Any]]): Bulk actions, each with a "_source" dict
            containing an "embedding" entry
        output_path (str): Path of the JSON file to write, relative to ``settings.proj_root``

    Returns:
        None
    """
    destination = settings.proj_root / output_path
    # Create missing parent folders so a fresh output location does not fail.
    destination.parent.mkdir(parents=True, exist_ok=True)

    serializable = []
    for action in elasticsearch_chunks:
        source = dict(action["_source"])
        embedding = source["embedding"]
        # Array-like embeddings are not JSON serializable; embeddings that are
        # already plain lists are written as-is.
        if hasattr(embedding, "tolist"):
            embedding = embedding.tolist()
        source["embedding"] = embedding
        serializable.append({**action, "_source": source})

    with destination.open("w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=4)
    logger.info(f"Exported processed documents to {destination}")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,89 @@
{"chunk_id": "5208d7435c0286ab", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "encodeSHA256", "section": "", "start_line": 1, "end_line": 1, "content": "encodeSHA256(\"payload_data\", checksum)", "metadata": {"uses_crypto": true, "uses_string_ops": true, "complexity": 2}, "token_estimate": 9}
{"chunk_id": "e5e9b70428937778", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "encodeSHA256(\"payload_data\", checksum)\naddResult(checksum)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
{"chunk_id": "49d6b31967a1db93", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "registerEndpoint", "section": "", "start_line": 1, "end_line": 1, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)", "metadata": {"registers_endpoint": true, "complexity": 1}, "token_estimate": 17}
{"chunk_id": "e7ececd11823d42a", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)\naddVar(name,\"Alberto\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
{"chunk_id": "f103d7719754088f", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(name,\"Alberto\")\nresult = \"Hello,\" + name", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
{"chunk_id": "4b1ab59c1acb224c", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "result = \"Hello,\" + name\naddResult(result)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
{"chunk_id": "682adaeeb528f778", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 1, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")", "metadata": {"complexity": 0}, "token_estimate": 12}
{"chunk_id": "9bb665ca8d7590f7", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")\naddResult(mensaje)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
{"chunk_id": "ed0136ad03a51e7e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"emails\", emails)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
{"chunk_id": "899291ac8959ae3e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "getQueryParamList", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"emails\", emails)\ngetQueryParamList(\"lista_correos\", lista_correos)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
{"chunk_id": "0eeff974dcd74729", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getQueryParamList(\"lista_correos\", lista_correos)\naddResult(lista_correos)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
{"chunk_id": "b2e95857d059d99d", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"lang\", l)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
{"chunk_id": "db2fab8dfbe7d460", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "addParam(\"lang\", l)\nif(l, \"es\", \"=\")\n addVar(msg, \"Hola\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
{"chunk_id": "2628fa886650658a", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "if(l, \"es\", \"=\")\n addVar(msg, \"Hola\")\nend()\naddResult(msg)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
{"chunk_id": "89bddd6830b6a8af", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre", "metadata": {"complexity": 0}, "token_estimate": 18}
{"chunk_id": "6797d36c2eb0e38a", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre\naddResult(log)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
{"chunk_id": "93008a3bed0ea808", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"password\",pass_nueva)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
{"chunk_id": "142b2aef2f05fae7", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"password\",pass_nueva)\npass_antigua = \"password\"", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
{"chunk_id": "b03b67f3aab35d7a", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "pass_antigua = \"password\"\nif(pass_nueva, pass_antigua, \"!=\")\n addVar(cambio, \"Contraseña actualizada\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
{"chunk_id": "99549cab6c8617d8", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(pass_nueva, pass_antigua, \"!=\")\n addVar(cambio, \"Contraseña actualizada\")\nend()\naddResult(cambio)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 31}
{"chunk_id": "123dfdacd4160b0d", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "replace", "section": "", "start_line": 1, "end_line": 1, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 17}
{"chunk_id": "c65655393175720a", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\naddResult(ref_actualizada)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
{"chunk_id": "3edbf12e560e22b1", "source_file": "docs/samples/manejo_error_sql_critico.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 7, "content": "try()\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\nexception(e)\n addVar(_status, 500)\n addVar(error_msg, \"Error de base de datos\")\n addResult(error_msg)\nend()", "metadata": {"uses_orm": true, "uses_auth": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 5}, "token_estimate": 51}
{"chunk_id": "75bcc1f794c8527f", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"sal_par\",saldo)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
{"chunk_id": "99462a4539651e84", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"sal_par\",saldo)\nif(saldo, 0, \">\")\n permitir = True\nelse()\n permitir = False\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
{"chunk_id": "c9134748119a6401", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "else()\n permitir = False\nend()\naddResult(permitir)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
{"chunk_id": "da88ce6ec35e309a", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 19}
{"chunk_id": "ef826cb80ab05a8c", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)\nif(None, None, \" user_type == 'VIP' or compras > 100\")\n addVar(descuento, 0.20)\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 51}
{"chunk_id": "117c5396b3e2f3bd", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(None, None, \" user_type == 'VIP' or compras > 100\")\n addVar(descuento, 0.20)\nend()\naddResult(descuento)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 37}
{"chunk_id": "559f8f61eda7ff75", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 27}
{"chunk_id": "b40f10f126c22c01", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\naddResult(sql_date)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
{"chunk_id": "717f75fe4eb08ecf", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "registros = ['1','2','3']", "metadata": {"complexity": 0}, "token_estimate": 10}
{"chunk_id": "8a695ac320884362", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "registros = ['1','2','3']\ngetListLen(registros, total)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
{"chunk_id": "9530c2cad477b991", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(registros, total)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
{"chunk_id": "c4acc74c9b001703", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 4, "end_line": 6, "content": "contador = 0\nstartLoop(idx, 0, 2)\n actual = registros[int(idx)]\nendLoop()", "metadata": {"uses_loop": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
{"chunk_id": "80e935fcd6c7a232", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "startLoop(idx, 0, 2)\n actual = registros[int(idx)]\nendLoop()\naddResult(actual)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
{"chunk_id": "576b1bc85805eef0", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 86400, \"UTC\", expira)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 13}
{"chunk_id": "686f254e071d6280", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 86400, \"UTC\", expira)\naddResult(expira)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
{"chunk_id": "79fd8fee120921e7", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"client_id\", id_interno)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 10}
{"chunk_id": "03697091447c57d4", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"client_id\", id_interno)\naddResult(id_interno)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
{"chunk_id": "2c64510b9ac6042b", "source_file": "docs/samples/try_catch_request.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 6, "content": "try()\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta, None)\nexception(e)\n addVar(error_trace, e)\n addResult(error_trace)\nend()", "metadata": {"uses_http": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 4}, "token_estimate": 42}
{"chunk_id": "4d9f72fb03ba6d2b", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"api_key\", key)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
{"chunk_id": "19fa0a3950612c1e", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"api_key\", key)\nif(key, None, \"==\")\n addVar(_status, 403)\n addVar(error, \"Acceso denegado: falta API KEY\")\n addResult(error)\nend()", "metadata": {"uses_auth": true, "uses_conditional": true, "returns_result": true, "complexity": 3, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 47}
{"chunk_id": "e06fe329097212dd", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"rol\", r)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
{"chunk_id": "285aeb7e911a5075", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"rol\", r)\nacceso = False", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
{"chunk_id": "f8ed75075b7b1b13", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 4, "end_line": 6, "content": "acceso = False\nif(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n acceso = True\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
{"chunk_id": "b323dedebcbd9036", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "if(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n acceso = True\nend()\naddResult(acceso)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
{"chunk_id": "d02cc7019c314251", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "datos_cliente = \"datos\"", "metadata": {"complexity": 0}, "token_estimate": 6}
{"chunk_id": "c1528242fcd85a68", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "datos_cliente = \"datos\"\naddVar(clave, \"cliente_vip\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
{"chunk_id": "d335da8caf95ac8d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVariableToJSON", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(clave, \"cliente_vip\")\nAddvariableToJSON(clave, datos_cliente, mi_json_final)", "metadata": {"uses_json": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
{"chunk_id": "27067ebe43e3b05d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "AddvariableToJSON(clave, datos_cliente, mi_json_final)\naddResult(mi_json_final)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 20}
{"chunk_id": "a25dfc3b319135d3", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"data_list\", mi_lista)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
{"chunk_id": "d96fd663666733fe", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"data_list\", mi_lista)\ngetListLen(mi_lista, cantidad)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
{"chunk_id": "9905db6de1ea3067", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(mi_lista, cantidad)\naddResult(cantidad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
{"chunk_id": "7c239ad53392d63d", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "stampToDatetime", "section": "", "start_line": 1, "end_line": 1, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 22}
{"chunk_id": "c4dc5d3c081101a5", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\naddResult(fecha_human)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 28}
{"chunk_id": "2905488dffcbd7ba", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(base, 1000)\naddVar(copia, $base)", "metadata": {"complexity": 0}, "token_estimate": 16}
{"chunk_id": "82e05ef62a72de87", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(base, 1000)\naddVar(copia, $base)\naddResult(copia)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
{"chunk_id": "a6727546f328e768", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(code, 200)\naddVar(status, \"Success\")", "metadata": {"complexity": 0}, "token_estimate": 14}
{"chunk_id": "ce12abd61c278bec", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 4, "content": "addVar(code, 200)\naddVar(status, \"Success\")\naddResult(code)\naddResult(status)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
{"chunk_id": "45b0086b13784a7d", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "encontrado = False", "metadata": {"complexity": 0}, "token_estimate": 5}
{"chunk_id": "c6df33b0e7eac0ff", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 2, "end_line": 7, "content": "encontrado = False\nstartLoop(i, 1, 10)\n if(i, 5, \"==\")\n encontrado = True\n i = 11 \n end()\nendLoop()", "metadata": {"uses_loop": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 42}
{"chunk_id": "02edc488f13b7367", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "i = 11 \n end()\nendLoop()\naddResult(encontrado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
{"chunk_id": "c8dbbbf6cb64c10d", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 4, "content": "function suma(a, b){\n total = a + b\n return(total)\n }", "metadata": {"complexity": 0}, "token_estimate": 19}
{"chunk_id": "1065800a57207e04", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function suma(a, b)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 4}, "token_estimate": 6}
{"chunk_id": "1ef5fa8a4a980012", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 5, "end_line": 5, "content": "// contexto: function suma(a, b)\nresultado = suma(10, 20)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
{"chunk_id": "ff7df988add5bbef", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "// contexto: function suma(a, b)\naddResult(resultado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 13}
{"chunk_id": "b8682e4f71d9d7c3", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 7, "content": "function es_valido(token){\n response = False\n if(token, \"SECRET\", \"=\")\n response = True\n end()\n return(response)\n }", "metadata": {"uses_conditional": true, "complexity": 1}, "token_estimate": 34}
{"chunk_id": "a1cfc36abdf661a0", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function es_valido(token)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 7}, "token_estimate": 6}
{"chunk_id": "66706bf4b7d3aede", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 8, "end_line": 8, "content": "// contexto: function es_valido(token)\nautorizado = es_valido(\"SECRET\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
{"chunk_id": "5932e6b75c40b7db", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 9, "end_line": 9, "content": "// contexto: function es_valido(token)\naddResult(autorizado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 15}
{"chunk_id": "4be60a16d7cc7c4d", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "randomString", "section": "", "start_line": 1, "end_line": 1, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 15}
{"chunk_id": "1810ca839b071a65", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)\naddResult(token_seguridad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
{"chunk_id": "ed8b4a4e75a71762", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 0, \"UTC\", ahora)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 11}
{"chunk_id": "05d2d0c8e6266861", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 0, \"UTC\", ahora)\naddResult(ahora)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
{"chunk_id": "02d7b0e4a1e1f09c", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "orm_command", "section": "", "start_line": 1, "end_line": 1, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)", "metadata": {"uses_orm": true, "complexity": 1}, "token_estimate": 13}
{"chunk_id": "6daea421c5a1d565", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)\nif(resultado_comprobacion,False,'==')\n ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()", "metadata": {"uses_orm": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
{"chunk_id": "47d660e6c1f124d1", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 6, "content": "if(resultado_comprobacion,False,'==')\n ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()\naddResult(resultado_comprobacion)\naddResult(resultado_creacion)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
{"chunk_id": "b15daff2028a2136", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"page\", p)\naddParam(\"size\", s)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 14}
{"chunk_id": "8f1fa0e84c981765", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 6, "content": "addParam(\"page\", p)\naddParam(\"size\", s)\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\noffset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 62}
{"chunk_id": "e27ce4178666239b", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 8, "content": "offset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0\naddResult(offset)\naddResult(limite)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
{"chunk_id": "9a66c0e4c49bbbcb", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 9, "end_line": 13, "content": "addResult(offset)\naddResult(limite)\nstartLoop(i, 2, limite)\n actual = registros[int(i)]\n titulo = \"reg_%s\" % i\n AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 53}
{"chunk_id": "77c985068f6f9269", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 14, "end_line": 14, "content": "titulo = \"reg_%s\" % i\n AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()\naddResult(pagina_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
{"chunk_id": "aeb4f87681bdc8b4", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nivel = 5\nes_admin = nivel >= 10", "metadata": {"complexity": 0}, "token_estimate": 12}
{"chunk_id": "5f0f938196d5e573", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nivel = 5\nes_admin = nivel >= 10\naddResult(es_admin)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
{"chunk_id": "42fb50109876864c", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 3, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva", "metadata": {"complexity": 0}, "token_estimate": 22}
{"chunk_id": "6019c2adc7750c04", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva\naddResult(total)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 27}
{"chunk_id": "e2f6a0de7e7f9dc1", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 1, "end_line": 4, "content": "startLoop(i,1,10)\n item = \"item_%s\" % i\n AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2}, "token_estimate": 36}
{"chunk_id": "ce1f2fab7c807537", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "item = \"item_%s\" % i\n AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()\naddResult(mi_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}

View File

@ -0,0 +1,64 @@
import typer
from loguru import logger
from scripts.pipelines.tasks.chunk import (
fetch_documents,
process_documents,
export_documents,
ingest_documents
)
# Typer application object; commands are registered on it through @app.command().
app = typer.Typer()


@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/samples",
    output_path: str = "research/code_indexing/chunks/chunks_EBNF_metadata.json",
    # NOTE(review): mutable list default. It is only forwarded to fetch_documents
    # here; confirm that helper never mutates it in place.
    docs_extension: list[str] = [".avap"],
    es_index: str = "avap-code-indexing-ebnf-metadata",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True
) -> None:
    """
    Pipeline to ingest documents into an Elasticsearch index.

    The pipeline fetches documents from a folder, processes them into chunks,
    ingests those chunks into the given Elasticsearch index and finally exports
    the documents returned by the ingestion step to ``output_path``.

    Args:
        docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
        output_path (str): Path the documents returned by the ingestion step are exported to. Default is "research/code_indexing/chunks/chunks_EBNF_metadata.json".
        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".avap"].
        es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-code-indexing-ebnf-metadata".
        es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
        es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
        es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.

    Returns:
        None
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    logger.info(f"Fetching files from {docs_folder_path}...")
    docs_path = fetch_documents(docs_folder_path, docs_extension)

    logger.info("Processing docs...")
    chunked_docs = process_documents(docs_path)

    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    elasticsearch_docs = ingest_documents(
        chunked_docs,
        es_index,
        es_request_timeout,
        es_max_retries,
        es_retry_on_timeout,
        delete_es_index,
    )

    # The export consumes the result of the ingestion step, so it has to run
    # after ingest_documents has returned.
    logger.info(f"Exporting processed documents to {output_path}...")
    export_documents(elasticsearch_docs, output_path)

    logger.info(f"Finished ingesting in {es_index}.")
if __name__ == "__main__":
    # Script entry point: run the Typer application.
    try:
        app()
    except Exception as error:
        # Log the failure with its traceback through loguru, then re-raise so
        # the exception still propagates to the interpreter.
        logger.exception(error)
        raise

View File

@ -0,0 +1,198 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d520f6c3",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from datasets import load_dataset\n",
"\n",
"import boto3\n",
"from botocore.config import Config\n",
"from langchain_core.messages import SystemMessage, HumanMessage\n",
"\n",
"from src.utils.llm_factory import create_chat_model\n",
"from src.config import settings"
]
},
{
"cell_type": "markdown",
"id": "e08b9060",
"metadata": {},
"source": [
"### Create LLM instance"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "81111a86",
"metadata": {},
"outputs": [],
"source": [
"config = Config(\n",
" region_name=\"us-east-1\",\n",
" connect_timeout=10, \n",
" read_timeout=600, \n",
")\n",
"\n",
"client = boto3.client(\"bedrock-runtime\", config=config)\n",
"\n",
"llm = create_chat_model(\n",
" provider=\"bedrock\",\n",
" client=client,\n",
" model=\"global.anthropic.claude-sonnet-4-6\",\n",
" temperature=0,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "045f8e81",
"metadata": {},
"source": [
"### Load AVAP data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "07dea3fe",
"metadata": {},
"outputs": [],
"source": [
"with open(settings.proj_root / \"docs/LRM/avap.md\", \"r\") as f:\n",
" avap_docs = f.read()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "adbbe8b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 33 AVAP samples\n"
]
}
],
"source": [
"samples_dir = settings.proj_root / \"docs/samples\"\n",
"avap_samples = []\n",
"\n",
"for avap_file in sorted(samples_dir.glob(\"*.avap\")):\n",
" with open(avap_file, \"r\") as f:\n",
" code = f.read()\n",
" \n",
" avap_samples.append({\n",
" \"file\": avap_file.name,\n",
" \"code\": code\n",
" })\n",
"\n",
"# Serialize the samples to a JSON string for embedding in the prompt\n",
"avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)\n",
"print(f\"Loaded {len(avap_samples)} AVAP samples\")"
]
},
{
"cell_type": "markdown",
"id": "7a15e09a",
"metadata": {},
"source": [
"### Prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "895a170f",
"metadata": {},
"outputs": [],
"source": [
"GOLDEN_DATASET_PROMPT = SystemMessage(\n",
" content=f\"\"\"\n",
" You are an AI agent responsible for generating a golden dataset of queries for AVAP code retrieval and understanding.\n",
"\n",
" You will receive a JSON array of AVAP code samples, each with a 'file' name and 'code' content.\n",
"\n",
" Your task is to:\n",
" 1. Analyze each AVAP code sample.\n",
" 2. Generate 2-3 natural language queries that can be answered by examining that specific code.\n",
" 3. Output a JSON array where each element has:\n",
" - \"query\": A natural language question about AVAP code implementation, best practices, or specific constructs.\n",
" - \"context\": The filename of the code sample that provides the context/answer for this query.\n",
"\n",
" Requirements:\n",
" - Queries should be diverse: ask about functions, control flow, API operations, error handling, etc.\n",
" - Queries must be answerable using ONLY the provided code samples.\n",
" - Queries should be framed as natural developer questions (e.g., \"How do you handle errors in AVAP?\" or \"Show me an example of looping over a list\").\n",
" - Use natural English (or Spanish if context is Spanish-language code).\n",
" - Do not reference exact variable names unless necessary; focus on the patterns and constructs used.\n",
" - Output MUST be valid JSON array format.\n",
"\n",
" AVAP Code Samples:\n",
" {avap_samples_json}\n",
"\n",
" Output format (JSON array):\n",
" [\n",
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
" ...\n",
" ]\n",
" \"\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3123199",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "98c4f93c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "723352ee",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}