BNF extraction pipeline from avap.md

2026-03-11 11:29:19 +01:00 · 2026-03-11 11:29:19 +01:00 · 3ac432567b
parent cd3922abbd
commit 3ac432567b
21 changed files with 699 additions and 13 deletions
--- a/ingestion/code/BNF/n01_BNF.txt
+++ b/ingestion/code/BNF/n01_BNF.txt
@ -0,0 +1,42 @@
 <program>          ::= ( <line> | <block_comment> )*
 <line>             ::= [ <statement> ] [ <line_comment> | <doc_comment> ] <EOL>
                     | ( <line_comment> | <doc_comment> ) <EOL>
 <EOL>              ::= /* Retorno de carro / Salto de línea (\n o \r\n) */
 <statement>        ::= <assignment>
                     | <method_call_stmt>
                     | <function_call_stmt>
                     | <function_decl>
                     | <return_stmt>
                     | <system_command>
                     | <io_command>
                     | <control_flow>
                     | <async_command>
                     | <connector_cmd>
                     | <db_command>
                     | <http_command>
                     | <util_command>
                     | <modularity_cmd>
 <assignment>       ::= <identifier> "=" <expression>
 /* Llamada a función global (sin receptor de objeto) */
 <function_call_stmt> ::= <identifier> "(" [<argument_list>] ")"
 /* Llamada a método sobre un objeto conector (con receptor) */
 <method_call_stmt> ::= <identifier> "=" <identifier> "." <identifier> "(" [<argument_list>] ")"
 <system_command>   ::= <register_cmd> | <addvar_cmd>
 <register_cmd>     ::= "registerEndpoint(" <stringliteral> "," <stringliteral> "," <list_display> "," <stringliteral> "," <identifier> "," <identifier> ")"
 /* addVar asigna un valor a una variable. Acepta (valor, variable) o (variable, valor).
   Si ambos argumentos son identificadores, el valor del segundo se asigna al primero.
   No está permitido pasar dos literales como argumentos. */
 <addvar_cmd>       ::= "addVar(" <addvar_arg> "," <addvar_arg> ")"
 <addvar_arg>       ::= <identifier> | <literal> | "$" <identifier>
 /* Restricción semántica: al menos uno de los dos <addvar_arg> debe ser <identifier> */
 <identifier>       ::= [a-zA-Z_] [a-zA-Z0-9_]*
 /* Variables de sistema reservadas — accesibles y asignables desde cualquier scope:
   _status  — código HTTP de respuesta (ej. addVar(_status, 401) o _status = 404) */
 <system_variable>  ::= "_status"
--- a/ingestion/code/BNF/n02_BNF.txt
+++ b/ingestion/code/BNF/n02_BNF.txt
@ -0,0 +1,5 @@
 <io_command>       ::= <addparam_cmd> | <getlistlen_cmd> | <addresult_cmd> | <getparamlist_cmd>
 <addparam_cmd>     ::= "addParam(" <stringliteral> "," <identifier> ")"
 <getlistlen_cmd>   ::= "getListLen(" <identifier> "," <identifier> ")"
 <getparamlist_cmd> ::= "getQueryParamList(" <stringliteral> "," <identifier> ")"
 <addresult_cmd>    ::= "addResult(" <identifier> ")"
--- a/ingestion/code/BNF/n03_BNF.txt
+++ b/ingestion/code/BNF/n03_BNF.txt
@ -0,0 +1,28 @@
 <control_flow>    ::= <if_stmt> | <loop_stmt> | <try_stmt>
 <if_stmt>         ::= "if(" <if_condition> ")" <EOL>
                        <block>
                      [ "else()" <EOL> <block> ]
                      "end()" <EOL>
 /* if() soporta dos modos:
   Modo 1 — comparación estructurada: los dos primeros argumentos deben ser
             identificadores simples o literales, nunca expresiones de acceso.
             Si se necesita comparar un valor extraído de una estructura (ej. dict['clave']),
             debe asignarse previamente a una variable.
   Modo 2 — expresión libre: None, None, expresión compleja como string */
 <if_condition>    ::= <if_atom> "," <if_atom> "," <stringliteral>
                    | "None" "," "None" "," <stringliteral>
 <if_atom>         ::= <identifier> | <literal>
 <loop_stmt>       ::= "startLoop(" <identifier> "," <expression> "," <expression> ")" <EOL>
                        <block>
                      "endLoop()" <EOL>
 <try_stmt>        ::= "try()" <EOL>
                        <block>
                      "exception(" <identifier> ")" <EOL>
                        <block>
                      "end()" <EOL>
 <block>           ::= <line>*
--- a/ingestion/code/BNF/n04_BNF.txt
+++ b/ingestion/code/BNF/n04_BNF.txt
@ -0,0 +1,3 @@
 <async_command>   ::= <go_stmt> | <gather_stmt>
 <go_stmt>         ::= <identifier> "=" "go" <identifier> "(" [<argument_list>] ")"
 <gather_stmt>     ::= <identifier> "=" "gather(" <identifier> ["," <expression>] ")"
--- a/ingestion/code/BNF/n05_BNF.txt
+++ b/ingestion/code/BNF/n05_BNF.txt
@ -0,0 +1,25 @@
 /* Instanciación de conector de terceros y llamada a sus métodos dinámicos */
 <connector_cmd>        ::= <connector_instantiation> | <connector_method_call>
 <connector_instantiation> ::= <identifier> "=" "avapConnector(" <stringliteral> ")"
 <connector_method_call>   ::= [ <identifier> "=" ] <identifier> "." <identifier> "(" [<argument_list>] ")"
 /* Cliente HTTP con Timeout Obligatorio */
 <http_command>    ::= <req_post_cmd> | <req_get_cmd>
 <req_post_cmd>    ::= "RequestPost(" <expression> "," <expression> "," <expression> "," <expression> "," <identifier> "," <expression> ")"
 <req_get_cmd>     ::= "RequestGet(" <expression> "," <expression> "," <expression> "," <identifier> "," <expression> ")"
 /* ORM y Persistencia (Estandarizado con tableName) */
 <db_command>      ::= <orm_direct> | <orm_check> | <orm_create> | <orm_select> | <orm_insert> | <orm_update>
 <orm_direct>      ::= "ormDirect(" <expression> "," <identifier> ")"
 <orm_check>       ::= "ormCheckTable(" <expression> "," <identifier> ")"
 <orm_create>      ::= "ormCreateTable(" <expression> "," <expression> "," <expression> "," <identifier> ")"
 /* ormAccessSelect(fields, tableName, selector, varTarget) */
 <orm_select>      ::= "ormAccessSelect(" <orm_fields> "," <expression> "," [<expression>] "," <identifier> ")"
 <orm_fields>      ::= "*" | <expression>
 /* ormAccessInsert(fieldsValues, tableName, varTarget) */
 <orm_insert>      ::= "ormAccessInsert(" <expression> "," <expression> "," <identifier> ")"
 /* ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget) */
 <orm_update>      ::= "ormAccessUpdate(" <expression> "," <expression> "," <expression> "," <expression> "," <identifier> ")"
--- a/ingestion/code/BNF/n06_BNF.txt
+++ b/ingestion/code/BNF/n06_BNF.txt
@ -0,0 +1,29 @@
 /* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */
 <util_command>    ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
 /* Manipulación de listas y JSON */
 <json_list_cmd>   ::= "variableToList(" <expression> "," <identifier> ")"
                    | "itemFromList(" <identifier> "," <expression> "," <identifier> ")"
                    | "variableFromJSON(" <identifier> "," <expression> "," <identifier> ")"
                    | "AddVariableToJSON(" <expression> "," <expression> "," <identifier> ")"
 /* Criptografía */
 <crypto_cmd>      ::= "encodeSHA256(" <identifier_or_string> "," <identifier> ")"
                    | "encodeMD5(" <identifier_or_string> "," <identifier> ")"
 /* Expresiones regulares */
 <regex_cmd>       ::= "getRegex(" <identifier> "," <stringliteral> "," <identifier> ")"
 <datetime_cmd>    ::= "getDateTime(" <stringliteral> "," <expression> "," <stringliteral> "," <identifier> ")"
 /*  Argumentos: formato_salida, epoch_origen, zona_horaria, destino */
 <stamp_cmd>       ::= "stampToDatetime(" <expression> "," <stringliteral> "," <expression> "," <identifier> ")"
 /*  Argumentos: epoch_origen, formato, timedelta, destino */
                    | "getTimeStamp(" <stringliteral> "," <stringliteral> "," <expression> "," <identifier> ")"
 /*  Argumentos: fecha_string, formato_entrada, timedelta, destino */
 <string_cmd>      ::= "randomString(" <expression> "," <identifier> ")"
 /*  Argumentos: longitud, destino */
 <replace_cmd>     ::= "replace(" <identifier_or_string> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
 /*  Argumentos: origen, patron_busqueda, reemplazo, destino */
--- a/ingestion/code/BNF/n07_BNF.txt
+++ b/ingestion/code/BNF/n07_BNF.txt
@ -0,0 +1,9 @@
 /* Nota: las funciones utilizan llaves {} como delimitadores de bloque por decisión
   arquitectónica explícita, diferenciándose de las estructuras de control (if, loop, try)
   que usan palabras clave de cierre (end(), endLoop()). Ambos patrones coexisten
   en la gramática y el parser los distingue por el token de apertura. */
 <function_decl>   ::= "function" <identifier> "(" [<param_list>] ")" "{" <EOL>
                        <block>
                      "}" <EOL>
 <param_list>      ::= <identifier> ("," <identifier>)*
 <return_stmt>     ::= "return(" [<expression>] ")"
--- a/ingestion/code/BNF/n08_BNF.txt
+++ b/ingestion/code/BNF/n08_BNF.txt
@ -0,0 +1,3 @@
 <modularity_cmd>  ::= <include_stmt> | <import_stmt>
 <include_stmt>    ::= "include" " " <stringliteral>
 <import_stmt>     ::= "import" " " ( "<" <identifier> ">" | <stringliteral> )
--- a/ingestion/code/BNF/n09_BNF.txt
+++ b/ingestion/code/BNF/n09_BNF.txt
@ -0,0 +1,62 @@
 /* Jerarquía de Expresiones (Precedencia de menor a mayor) */
 <expression>       ::= <logical_or>
 <logical_or>       ::= <logical_and> ( "or" <logical_and> )*
 <logical_and>      ::= <logical_not> ( "and" <logical_not> )*
 <logical_not>      ::= "not" <logical_not> | <comparison>
 <comparison>       ::= <arithmetic> ( <comp_op> <arithmetic> )*
 <comp_op>          ::= "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"
 <arithmetic>       ::= <term> ( ( "+" | "-" ) <term> )*
 <term>             ::= <factor> ( ( "*" | "/" | "%" ) <factor> )*
 <factor>           ::= ( "+" | "-" ) <factor> | <power>
 <power>            ::= <primary> [ "**" <factor> ]
 /* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones)
   La regla <primary> cubre también el acceso a métodos de objetos conector
   (conector.metodo(...)) y el acceso por clave a sus resultados (resultado["key"]) */
 <primary>          ::= <atom>
                     | <primary> "." <identifier>
                     | <primary> "[" <expression> "]"
                     | <primary> "[" [<expression>] ":" [<expression>] [":" [<expression>]] "]"
                     | <primary> "(" [<argument_list>] ")"
 <atom>             ::= <identifier>
                     | "$" <identifier>
                     | <literal>
                     | "(" <expression> ")"
                     | <list_display>
                     | <dict_display>
 /* Estructuras de Datos, Comprensiones y Argumentos */
 <list_display>     ::= "[" [<argument_list>] "]"
                     | "[" <expression> "for" <identifier> "in" <expression> [<if_clause>] "]"
 <if_clause>        ::= "if" <expression>
 <dict_display>     ::= "{" [<key_datum_list>] "}"
 <key_datum_list>   ::= <key_datum> ( "," <key_datum> )*
 <key_datum>        ::= <expression> ":" <expression>
 <argument_list>    ::= <expression> ( "," <expression> )*
 /* Tipo numérico unificado */
 <number>           ::= <floatnumber> | <integer>
 /* Literales (Tipos de Datos Primitivos Soportados) */
 <literal>          ::= <stringliteral> | <number> | <boolean> | "None"
 <boolean>          ::= "True" | "False"
 <integer>          ::= [0-9]+
 <floatnumber>      ::= [0-9]+ "." [0-9]* | "." [0-9]+
 /* Cadenas de Texto con soporte de secuencias de escape */
 <stringliteral>    ::= "\"" <text_double> "\"" | "'" <text_single> "'"
 <escape_sequence>  ::= "\\" ( "\"" | "'" | "\\" | "n" | "t" | "r" | "0" )
 <text_double>      ::= ( [^"\\] | <escape_sequence> )*
 <text_single>      ::= ( [^'\\] | <escape_sequence> )*
 <identifier_or_string> ::= <identifier> | <stringliteral>
 /* Reglas de Comentarios para el Lexer
   El lexer aplica longest-match: /// debe evaluarse ANTES que // */
 <doc_comment>      ::= "///" <any_text>
 <line_comment>     ::= "//" <any_text>
 <block_comment>    ::= "/*" <any_content> "*/"
 <any_text>         ::= [^\r\n]*
 <any_content>      ::= /* Cualquier secuencia de caracteres que no contenga la subcadena "*/" */
--- a/ingestion/code/n01_BNF.txt
+++ b/ingestion/code/n01_BNF.txt
@ -0,0 +1,42 @@
 <program>          ::= ( <line> | <block_comment> )*
 <line>             ::= [ <statement> ] [ <line_comment> | <doc_comment> ] <EOL>
                     | ( <line_comment> | <doc_comment> ) <EOL>
 <EOL>              ::= /* Retorno de carro / Salto de línea (\n o \r\n) */
 <statement>        ::= <assignment>
                     | <method_call_stmt>
                     | <function_call_stmt>
                     | <function_decl>
                     | <return_stmt>
                     | <system_command>
                     | <io_command>
                     | <control_flow>
                     | <async_command>
                     | <connector_cmd>
                     | <db_command>
                     | <http_command>
                     | <util_command>
                     | <modularity_cmd>
 <assignment>       ::= <identifier> "=" <expression>
 /* Llamada a función global (sin receptor de objeto) */
 <function_call_stmt> ::= <identifier> "(" [<argument_list>] ")"
 /* Llamada a método sobre un objeto conector (con receptor) */
 <method_call_stmt> ::= <identifier> "=" <identifier> "." <identifier> "(" [<argument_list>] ")"
 <system_command>   ::= <register_cmd> | <addvar_cmd>
 <register_cmd>     ::= "registerEndpoint(" <stringliteral> "," <stringliteral> "," <list_display> "," <stringliteral> "," <identifier> "," <identifier> ")"
 /* addVar asigna un valor a una variable. Acepta (valor, variable) o (variable, valor).
   Si ambos argumentos son identificadores, el valor del segundo se asigna al primero.
   No está permitido pasar dos literales como argumentos. */
 <addvar_cmd>       ::= "addVar(" <addvar_arg> "," <addvar_arg> ")"
 <addvar_arg>       ::= <identifier> | <literal> | "$" <identifier>
 /* Restricción semántica: al menos uno de los dos <addvar_arg> debe ser <identifier> */
 <identifier>       ::= [a-zA-Z_] [a-zA-Z0-9_]*
 /* Variables de sistema reservadas — accesibles y asignables desde cualquier scope:
   _status  — código HTTP de respuesta (ej. addVar(_status, 401) o _status = 404) */
 <system_variable>  ::= "_status"
--- a/ingestion/code/n02_BNF.txt
+++ b/ingestion/code/n02_BNF.txt
@ -0,0 +1,5 @@
 <io_command>       ::= <addparam_cmd> | <getlistlen_cmd> | <addresult_cmd> | <getparamlist_cmd>
 <addparam_cmd>     ::= "addParam(" <stringliteral> "," <identifier> ")"
 <getlistlen_cmd>   ::= "getListLen(" <identifier> "," <identifier> ")"
 <getparamlist_cmd> ::= "getQueryParamList(" <stringliteral> "," <identifier> ")"
 <addresult_cmd>    ::= "addResult(" <identifier> ")"
--- a/ingestion/code/n03_BNF.txt
+++ b/ingestion/code/n03_BNF.txt
@ -0,0 +1,28 @@
 <control_flow>    ::= <if_stmt> | <loop_stmt> | <try_stmt>
 <if_stmt>         ::= "if(" <if_condition> ")" <EOL>
                        <block>
                      [ "else()" <EOL> <block> ]
                      "end()" <EOL>
 /* if() soporta dos modos:
   Modo 1 — comparación estructurada: los dos primeros argumentos deben ser
             identificadores simples o literales, nunca expresiones de acceso.
             Si se necesita comparar un valor extraído de una estructura (ej. dict['clave']),
             debe asignarse previamente a una variable.
   Modo 2 — expresión libre: None, None, expresión compleja como string */
 <if_condition>    ::= <if_atom> "," <if_atom> "," <stringliteral>
                    | "None" "," "None" "," <stringliteral>
 <if_atom>         ::= <identifier> | <literal>
 <loop_stmt>       ::= "startLoop(" <identifier> "," <expression> "," <expression> ")" <EOL>
                        <block>
                      "endLoop()" <EOL>
 <try_stmt>        ::= "try()" <EOL>
                        <block>
                      "exception(" <identifier> ")" <EOL>
                        <block>
                      "end()" <EOL>
 <block>           ::= <line>*
--- a/ingestion/code/n04_BNF.txt
+++ b/ingestion/code/n04_BNF.txt
@ -0,0 +1,3 @@
 <async_command>   ::= <go_stmt> | <gather_stmt>
 <go_stmt>         ::= <identifier> "=" "go" <identifier> "(" [<argument_list>] ")"
 <gather_stmt>     ::= <identifier> "=" "gather(" <identifier> ["," <expression>] ")"
--- a/ingestion/code/n05_BNF.txt
+++ b/ingestion/code/n05_BNF.txt
@ -0,0 +1,25 @@
 /* Instanciación de conector de terceros y llamada a sus métodos dinámicos */
 <connector_cmd>        ::= <connector_instantiation> | <connector_method_call>
 <connector_instantiation> ::= <identifier> "=" "avapConnector(" <stringliteral> ")"
 <connector_method_call>   ::= [ <identifier> "=" ] <identifier> "." <identifier> "(" [<argument_list>] ")"
 /* Cliente HTTP con Timeout Obligatorio */
 <http_command>    ::= <req_post_cmd> | <req_get_cmd>
 <req_post_cmd>    ::= "RequestPost(" <expression> "," <expression> "," <expression> "," <expression> "," <identifier> "," <expression> ")"
 <req_get_cmd>     ::= "RequestGet(" <expression> "," <expression> "," <expression> "," <identifier> "," <expression> ")"
 /* ORM y Persistencia (Estandarizado con tableName) */
 <db_command>      ::= <orm_direct> | <orm_check> | <orm_create> | <orm_select> | <orm_insert> | <orm_update>
 <orm_direct>      ::= "ormDirect(" <expression> "," <identifier> ")"
 <orm_check>       ::= "ormCheckTable(" <expression> "," <identifier> ")"
 <orm_create>      ::= "ormCreateTable(" <expression> "," <expression> "," <expression> "," <identifier> ")"
 /* ormAccessSelect(fields, tableName, selector, varTarget) */
 <orm_select>      ::= "ormAccessSelect(" <orm_fields> "," <expression> "," [<expression>] "," <identifier> ")"
 <orm_fields>      ::= "*" | <expression>
 /* ormAccessInsert(fieldsValues, tableName, varTarget) */
 <orm_insert>      ::= "ormAccessInsert(" <expression> "," <expression> "," <identifier> ")"
 /* ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget) */
 <orm_update>      ::= "ormAccessUpdate(" <expression> "," <expression> "," <expression> "," <expression> "," <identifier> ")"
--- a/ingestion/code/n06_BNF.txt
+++ b/ingestion/code/n06_BNF.txt
@ -0,0 +1,29 @@
 /* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */
 <util_command>    ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
 /* Manipulación de listas y JSON */
 <json_list_cmd>   ::= "variableToList(" <expression> "," <identifier> ")"
                    | "itemFromList(" <identifier> "," <expression> "," <identifier> ")"
                    | "variableFromJSON(" <identifier> "," <expression> "," <identifier> ")"
                    | "AddVariableToJSON(" <expression> "," <expression> "," <identifier> ")"
 /* Criptografía */
 <crypto_cmd>      ::= "encodeSHA256(" <identifier_or_string> "," <identifier> ")"
                    | "encodeMD5(" <identifier_or_string> "," <identifier> ")"
 /* Expresiones regulares */
 <regex_cmd>       ::= "getRegex(" <identifier> "," <stringliteral> "," <identifier> ")"
 <datetime_cmd>    ::= "getDateTime(" <stringliteral> "," <expression> "," <stringliteral> "," <identifier> ")"
 /*  Argumentos: formato_salida, epoch_origen, zona_horaria, destino */
 <stamp_cmd>       ::= "stampToDatetime(" <expression> "," <stringliteral> "," <expression> "," <identifier> ")"
 /*  Argumentos: epoch_origen, formato, timedelta, destino */
                    | "getTimeStamp(" <stringliteral> "," <stringliteral> "," <expression> "," <identifier> ")"
 /*  Argumentos: fecha_string, formato_entrada, timedelta, destino */
 <string_cmd>      ::= "randomString(" <expression> "," <identifier> ")"
 /*  Argumentos: longitud, destino */
 <replace_cmd>     ::= "replace(" <identifier_or_string> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
 /*  Argumentos: origen, patron_busqueda, reemplazo, destino */
--- a/ingestion/code/n07_BNF.txt
+++ b/ingestion/code/n07_BNF.txt
@ -0,0 +1,9 @@
 /* Nota: las funciones utilizan llaves {} como delimitadores de bloque por decisión
   arquitectónica explícita, diferenciándose de las estructuras de control (if, loop, try)
   que usan palabras clave de cierre (end(), endLoop()). Ambos patrones coexisten
   en la gramática y el parser los distingue por el token de apertura. */
 <function_decl>   ::= "function" <identifier> "(" [<param_list>] ")" "{" <EOL>
                        <block>
                      "}" <EOL>
 <param_list>      ::= <identifier> ("," <identifier>)*
 <return_stmt>     ::= "return(" [<expression>] ")"
--- a/ingestion/code/n08_BNF.txt
+++ b/ingestion/code/n08_BNF.txt
@ -0,0 +1,3 @@
 <modularity_cmd>  ::= <include_stmt> | <import_stmt>
 <include_stmt>    ::= "include" " " <stringliteral>
 <import_stmt>     ::= "import" " " ( "<" <identifier> ">" | <stringliteral> )
--- a/ingestion/code/n09_BNF.txt
+++ b/ingestion/code/n09_BNF.txt
@ -0,0 +1,62 @@
 /* Jerarquía de Expresiones (Precedencia de menor a mayor) */
 <expression>       ::= <logical_or>
 <logical_or>       ::= <logical_and> ( "or" <logical_and> )*
 <logical_and>      ::= <logical_not> ( "and" <logical_not> )*
 <logical_not>      ::= "not" <logical_not> | <comparison>
 <comparison>       ::= <arithmetic> ( <comp_op> <arithmetic> )*
 <comp_op>          ::= "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"
 <arithmetic>       ::= <term> ( ( "+" | "-" ) <term> )*
 <term>             ::= <factor> ( ( "*" | "/" | "%" ) <factor> )*
 <factor>           ::= ( "+" | "-" ) <factor> | <power>
 <power>            ::= <primary> [ "**" <factor> ]
 /* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones)
   La regla <primary> cubre también el acceso a métodos de objetos conector
   (conector.metodo(...)) y el acceso por clave a sus resultados (resultado["key"]) */
 <primary>          ::= <atom>
                     | <primary> "." <identifier>
                     | <primary> "[" <expression> "]"
                     | <primary> "[" [<expression>] ":" [<expression>] [":" [<expression>]] "]"
                     | <primary> "(" [<argument_list>] ")"
 <atom>             ::= <identifier>
                     | "$" <identifier>
                     | <literal>
                     | "(" <expression> ")"
                     | <list_display>
                     | <dict_display>
 /* Estructuras de Datos, Comprensiones y Argumentos */
 <list_display>     ::= "[" [<argument_list>] "]"
                     | "[" <expression> "for" <identifier> "in" <expression> [<if_clause>] "]"
 <if_clause>        ::= "if" <expression>
 <dict_display>     ::= "{" [<key_datum_list>] "}"
 <key_datum_list>   ::= <key_datum> ( "," <key_datum> )*
 <key_datum>        ::= <expression> ":" <expression>
 <argument_list>    ::= <expression> ( "," <expression> )*
 /* Tipo numérico unificado */
 <number>           ::= <floatnumber> | <integer>
 /* Literales (Tipos de Datos Primitivos Soportados) */
 <literal>          ::= <stringliteral> | <number> | <boolean> | "None"
 <boolean>          ::= "True" | "False"
 <integer>          ::= [0-9]+
 <floatnumber>      ::= [0-9]+ "." [0-9]* | "." [0-9]+
 /* Cadenas de Texto con soporte de secuencias de escape */
 <stringliteral>    ::= "\"" <text_double> "\"" | "'" <text_single> "'"
 <escape_sequence>  ::= "\\" ( "\"" | "'" | "\\" | "n" | "t" | "r" | "0" )
 <text_double>      ::= ( [^"\\] | <escape_sequence> )*
 <text_single>      ::= ( [^'\\] | <escape_sequence> )*
 <identifier_or_string> ::= <identifier> | <stringliteral>
 /* Reglas de Comentarios para el Lexer
   El lexer aplica longest-match: /// debe evaluarse ANTES que // */
 <doc_comment>      ::= "///" <any_text>
 <line_comment>     ::= "//" <any_text>
 <block_comment>    ::= "/*" <any_content> "*/"
 <any_text>         ::= [^\r\n]*
 <any_content>      ::= /* Cualquier secuencia de caracteres que no contenga la subcadena "*/" */
--- a/scratches/pseco/ingestion/Code
+++ b/scratches/pseco/ingestion/Code
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [
@ -24,7 +24,6 @@
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, List, Optional, Tuple\n",
    "# from bnf import grammar\n",
    "import nltk\n",
    "from elasticsearch import Elasticsearch\n",
    "from langchain_core.documents import Document\n",
@ -185,7 +184,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "26ad9c81",
   "metadata": {},
   "outputs": [
@ -209,7 +208,7 @@
    }
   ],
   "source": [
-    "grammar_ = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n",
+    "grammar_ = (settings.data_path / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n",
    "    encoding=\"utf-8\"\n",
    ")\n",
    "grammar(grammar_)"
--- a/scratches/pseco/ingestion/Code
+++ b/scratches/pseco/ingestion/Code
@ -2,24 +2,55 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 1,
   "id": "5b646fb1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n",
      "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "! uv pip install bnf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "274d6d68",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n",
      "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "! uv pip install ebnf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "from dataclasses import dataclass\n",
    "\n",
    "from typing import Any, Dict, List, Optional, Tuple\n",
    "\n",
    "from lark import Tree, Lark\n",
    "\n",
    "\n",
    "from bnf import grammar as bnf_grammar, parse as bnf_parse\n",
-    "from ebnf import grammar as ebnf_grammar, parse as ebnf_parse\n",
+    "from src.config import settings\n",
    "\n",
    "from src.config import DATA_DIR"
   ]
  },
  {
--- a/scripts/pipelines/flows/bnf_files_generator.py
+++ b/scripts/pipelines/flows/bnf_files_generator.py
@ -0,0 +1,244 @@
 """
 Generator for BNF specification files from AVAP documentation.
 This script extracts BNF specifications from the AVAP Language Reference Manual (LRM)
 and generates individual text files for each BNF section.
 Output format: nXX_BNF.txt (where XX is the zero-padded, two-digit section number)
 Default output directory: ingestion/code/BNF/
 Default markdown source: docs/LRM/avap.md
 USAGE EXAMPLES:
 Use default configuration:
    python scripts/pipelines/flows/bnf_files_generator.py
 Customize input and output paths:
    python scripts/pipelines/flows/bnf_files_generator.py --markdown docs/LRM/avap.md --output ingestion/code
    python scripts/pipelines/flows/bnf_files_generator.py -m docs/LRM/avap.md -o ingestion/code
 OPTIONS:
    --markdown, -m: Path to the AVAP markdown file (relative to project root)
    --output, -o:   Output directory for BNF files (relative to project root)
 """
 import re
 import typer
 from pathlib import Path
 from typing import List, Tuple, Optional
 # Typer application object; `main` is registered on it via @app.command()
 # and it is invoked from the __main__ guard at the bottom of the module.
 app = typer.Typer()
 class BNFExtractor:
    """Extract BNF specifications from AVAP markdown documentation.

    The extractor looks for headers of the form
    ``### Especificación BNF (Sección <ROMAN>)`` and, for each header, takes
    the first fenced ``bnf`` code block located before the next such header.
    Every extracted grammar is written to ``nXX_BNF.txt`` inside
    ``output_dir``, where ``XX`` is the zero-padded section number.
    """

    # Matches: ### Especificación BNF (Sección I), (Sección II), etc.
    _HEADER_RE = re.compile(r"### Especificación BNF \(Sección ([IVXLCDM]+)\)")
    # Fenced bnf block. Trailing blanks and CRLF after the fence tag are
    # tolerated so that editor/OS differences do not drop a section.
    _CODE_BLOCK_RE = re.compile(r"```bnf[ \t]*\r?\n(.*?)```", re.DOTALL)
    _ROMAN_VALUES = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50,
        'C': 100, 'D': 500, 'M': 1000,
    }

    def __init__(self, markdown_file: Path, output_dir: Path):
        """
        Initialize BNF extractor.

        Args:
            markdown_file: Path to the AVAP markdown file
            output_dir: Directory where BNF files will be saved
        """
        self.markdown_file = markdown_file
        self.output_dir = output_dir
        # Filled by extract_bnf_sections(): (section_number, title, bnf_content)
        self.bnf_sections: List[Tuple[int, str, str]] = []

    @staticmethod
    def _roman_to_int(roman: str) -> int:
        """
        Convert Roman numerals to integers.

        The numeral is scanned right to left; a symbol smaller than the one
        to its right is subtracted (e.g. the ``I`` in ``IV``). Characters that
        are not Roman symbols contribute 0.

        Args:
            roman: Roman numeral string (e.g., 'I', 'IV', 'IX', 'XII')
        Returns:
            Integer value of the Roman numeral
        """
        total = 0
        prev_value = 0
        for char in reversed(roman):
            value = BNFExtractor._ROMAN_VALUES.get(char, 0)
            if value < prev_value:
                total -= value
            else:
                total += value
            prev_value = value
        return total

    @staticmethod
    def _filename_for(section_number: int) -> str:
        """Return the output file name for a section (e.g. 1 -> 'n01_BNF.txt')."""
        return f"n{section_number:02d}_BNF.txt"

    def read_markdown_file(self) -> str:
        """Read the markdown file content (decoded as UTF-8)."""
        return Path(self.markdown_file).read_text(encoding="utf-8")

    def extract_bnf_sections(self, content: str) -> List[Tuple[int, str, str]]:
        """
        Extract all BNF specifications from markdown content.

        Pattern: ### Especificación BNF (Sección I)
                 ```bnf
                 ... BNF content ...
                 ```

        The code block is searched only between a header and the next header
        (or the end of the content). A header without its own ``bnf`` block is
        skipped with a warning instead of capturing the following section's
        grammar.

        Args:
            content: Markdown file content
        Returns:
            List of tuples: (section_number, section_title, bnf_content)
        """
        headers = list(self._HEADER_RE.finditer(content))
        bnf_sections: List[Tuple[int, str, str]] = []
        for index, header in enumerate(headers):
            roman_numeral = header.group(1)
            # The section ends where the next BNF header starts.
            if index + 1 < len(headers):
                section_end = headers[index + 1].start()
            else:
                section_end = len(content)
            code_match = self._CODE_BLOCK_RE.search(
                content, header.end(), section_end
            )
            if code_match is None:
                print(
                    f"Warning: no bnf code block found for section "
                    f"{roman_numeral}; skipping"
                )
                continue
            bnf_sections.append((
                self._roman_to_int(roman_numeral),
                f"Especificación BNF - Sección {roman_numeral}",
                code_match.group(1).strip(),
            ))
        self.bnf_sections = bnf_sections
        return bnf_sections

    def format_bnf_file_content(self, section_number: int, title: str, bnf_content: str) -> str:
        """
        Format BNF content for file output.

        Currently a pass-through: ``section_number`` and ``title`` are accepted
        but not used.

        Args:
            section_number: Section number (1-9, etc.)
            title: Section title
            bnf_content: Raw BNF grammar content
        Returns:
            BNF content without additional formatting
        """
        return bnf_content

    def save_bnf_files(self) -> int:
        """
        Save extracted BNF sections to individual files.

        File naming convention: nXX_BNF.txt with a zero-padded section number
        (e.g., n01_BNF.txt, n02_BNF.txt, n10_BNF.txt).

        Returns:
            Number of files written
        """
        # Ensure output directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)
        files_created = 0
        for section_number, title, bnf_content in self.bnf_sections:
            filepath = self.output_dir / self._filename_for(section_number)
            formatted_content = self.format_bnf_file_content(
                section_number, title, bnf_content
            )
            filepath.write_text(formatted_content, encoding="utf-8")
            print(f"Created: {filepath}")
            files_created += 1
        return files_created

    def generate(self) -> Tuple[int, List[str]]:
        """
        Execute the complete BNF extraction and file generation process.

        Reads the markdown file, extracts the sections, writes one file per
        section and prints progress along the way.

        Returns:
            Tuple of (number_of_files_created, list_of_file_paths)
        """
        print(f"Reading markdown file: {self.markdown_file}")
        content = self.read_markdown_file()
        print("Extracting BNF specifications...")
        bnf_sections = self.extract_bnf_sections(content)
        print(f"Found {len(bnf_sections)} BNF sections:")
        for _, title, _ in bnf_sections:
            print(f"  - {title}")
        print(f"\nSaving BNF files to: {self.output_dir}")
        files_created = self.save_bnf_files()
        # Same naming helper as save_bnf_files, so the paths always agree.
        file_paths = [
            str(self.output_dir / self._filename_for(section_number))
            for section_number, _, _ in bnf_sections
        ]
        return files_created, file_paths
@app.command()
def main(
    markdown_file: str = typer.Option(
        "docs/LRM/avap.md", "--markdown", "-m",
        help="Path to AVAP markdown file (relative to project root)",
    ),
    output_dir: str = typer.Option(
        "ingestion/code/BNF/", "--output", "-o",
        help="Output directory for BNF files (relative to project root)",
    ),
):
    """Extract BNF specifications from AVAP documentation.
    Default behavior:
    - Reads from: docs/LRM/avap.md
    - Writes to: ingestion/code/BNF/
    """
    # Climb from this script's directory (scripts/pipelines/flows) up three
    # levels to reach the project root.
    project_root = Path(__file__).parent
    for _ in range(3):
        project_root = project_root.parent

    # Both CLI arguments are interpreted relative to the project root.
    source_md = project_root / markdown_file
    target_dir = project_root / output_dir

    # Abort with exit code 1 when the markdown source is missing.
    if not source_md.exists():
        typer.echo(f"Error: Markdown file not found: {source_md}", err=True)
        raise typer.Exit(code=1)

    # Run the extraction; only the file count is reported in the summary.
    created_count, _ = BNFExtractor(source_md, target_dir).generate()

    rule = "=" * 80
    print(f"\n{rule}")
    print("BNF extraction complete!")
    print(f"Total files created: {created_count}")
    print(f"Output directory: {target_dir}")
    print(rule)
 # Run the Typer application only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    app()