From 3ac432567b501132e47b5111ffcd7d8128ac945f Mon Sep 17 00:00:00 2001 From: pseco Date: Wed, 11 Mar 2026 11:29:19 +0100 Subject: [PATCH] BNF extraction pipeline from avap.md --- ingestion/code/BNF/n01_BNF.txt | 42 +++ ingestion/code/BNF/n02_BNF.txt | 5 + ingestion/code/BNF/n03_BNF.txt | 28 ++ ingestion/code/BNF/n04_BNF.txt | 3 + ingestion/code/BNF/n05_BNF.txt | 25 ++ ingestion/code/BNF/n06_BNF.txt | 29 +++ ingestion/code/BNF/n07_BNF.txt | 9 + ingestion/code/BNF/n08_BNF.txt | 3 + ingestion/code/BNF/n09_BNF.txt | 62 +++++ ingestion/code/n01_BNF.txt | 42 +++ ingestion/code/n02_BNF.txt | 5 + ingestion/code/n03_BNF.txt | 28 ++ ingestion/code/n04_BNF.txt | 3 + ingestion/code/n05_BNF.txt | 25 ++ ingestion/code/n06_BNF.txt | 29 +++ ingestion/code/n07_BNF.txt | 9 + ingestion/code/n08_BNF.txt | 3 + ingestion/code/n09_BNF.txt | 62 +++++ .../n00 Proper Lark Chunking.ipynb | 7 +- .../Code Ingestion/n01 BNF Check.ipynb | 49 +++- .../pipelines/flows/bnf_files_generator.py | 244 ++++++++++++++++++ 21 files changed, 699 insertions(+), 13 deletions(-) create mode 100644 ingestion/code/BNF/n01_BNF.txt create mode 100644 ingestion/code/BNF/n02_BNF.txt create mode 100644 ingestion/code/BNF/n03_BNF.txt create mode 100644 ingestion/code/BNF/n04_BNF.txt create mode 100644 ingestion/code/BNF/n05_BNF.txt create mode 100644 ingestion/code/BNF/n06_BNF.txt create mode 100644 ingestion/code/BNF/n07_BNF.txt create mode 100644 ingestion/code/BNF/n08_BNF.txt create mode 100644 ingestion/code/BNF/n09_BNF.txt create mode 100644 ingestion/code/n01_BNF.txt create mode 100644 ingestion/code/n02_BNF.txt create mode 100644 ingestion/code/n03_BNF.txt create mode 100644 ingestion/code/n04_BNF.txt create mode 100644 ingestion/code/n05_BNF.txt create mode 100644 ingestion/code/n06_BNF.txt create mode 100644 ingestion/code/n07_BNF.txt create mode 100644 ingestion/code/n08_BNF.txt create mode 100644 ingestion/code/n09_BNF.txt create mode 100644 scripts/pipelines/flows/bnf_files_generator.py diff --git 
a/ingestion/code/BNF/n01_BNF.txt b/ingestion/code/BNF/n01_BNF.txt new file mode 100644 index 0000000..2763ab2 --- /dev/null +++ b/ingestion/code/BNF/n01_BNF.txt @@ -0,0 +1,42 @@ + ::= ( | )* + ::= [ ] [ | ] + | ( | ) + ::= /* Retorno de carro / Salto de línea (\n o \r\n) */ + + ::= + | + | + | + | + | + | + | + | + | + | + | + | + | + + ::= "=" + +/* Llamada a función global (sin receptor de objeto) */ + ::= "(" [] ")" + +/* Llamada a método sobre un objeto conector (con receptor) */ + ::= "=" "." "(" [] ")" + + ::= | + ::= "registerEndpoint(" "," "," "," "," "," ")" +/* addVar asigna un valor a una variable. Acepta (valor, variable) o (variable, valor). + Si ambos argumentos son identificadores, el valor del segundo se asigna al primero. + No está permitido pasar dos literales como argumentos. */ + ::= "addVar(" "," ")" + ::= | | "$" +/* Restricción semántica: al menos uno de los dos debe ser */ + + ::= [a-zA-Z_] [a-zA-Z0-9_]* + +/* Variables de sistema reservadas — accesibles y asignables desde cualquier scope: + _status — código HTTP de respuesta (ej. addVar(_status, 401) o _status = 404) */ + ::= "_status" \ No newline at end of file diff --git a/ingestion/code/BNF/n02_BNF.txt b/ingestion/code/BNF/n02_BNF.txt new file mode 100644 index 0000000..dcb404a --- /dev/null +++ b/ingestion/code/BNF/n02_BNF.txt @@ -0,0 +1,5 @@ + ::= | | | + ::= "addParam(" "," ")" + ::= "getListLen(" "," ")" + ::= "getQueryParamList(" "," ")" + ::= "addResult(" ")" \ No newline at end of file diff --git a/ingestion/code/BNF/n03_BNF.txt b/ingestion/code/BNF/n03_BNF.txt new file mode 100644 index 0000000..651582d --- /dev/null +++ b/ingestion/code/BNF/n03_BNF.txt @@ -0,0 +1,28 @@ + ::= | | + + ::= "if(" ")" + + [ "else()" ] + "end()" + +/* if() soporta dos modos: + Modo 1 — comparación estructurada: los dos primeros argumentos deben ser + identificadores simples o literales, nunca expresiones de acceso. + Si se necesita comparar un valor extraído de una estructura (ej. 
dict['clave']), + debe asignarse previamente a una variable. + Modo 2 — expresión libre: None, None, expresión compleja como string */ + ::= "," "," + | "None" "," "None" "," + ::= | + + ::= "startLoop(" "," "," ")" + + "endLoop()" + + ::= "try()" + + "exception(" ")" + + "end()" + + ::= * \ No newline at end of file diff --git a/ingestion/code/BNF/n04_BNF.txt b/ingestion/code/BNF/n04_BNF.txt new file mode 100644 index 0000000..253a0a8 --- /dev/null +++ b/ingestion/code/BNF/n04_BNF.txt @@ -0,0 +1,3 @@ + ::= | + ::= "=" "go" "(" [] ")" + ::= "=" "gather(" ["," ] ")" \ No newline at end of file diff --git a/ingestion/code/BNF/n05_BNF.txt b/ingestion/code/BNF/n05_BNF.txt new file mode 100644 index 0000000..2563bca --- /dev/null +++ b/ingestion/code/BNF/n05_BNF.txt @@ -0,0 +1,25 @@ +/* Instanciación de conector de terceros y llamada a sus métodos dinámicos */ + ::= | + ::= "=" "avapConnector(" ")" + ::= [ "=" ] "." "(" [] ")" + +/* Cliente HTTP con Timeout Obligatorio */ + ::= | + ::= "RequestPost(" "," "," "," "," "," ")" + ::= "RequestGet(" "," "," "," "," ")" + +/* ORM y Persistencia (Estandarizado con tableName) */ + ::= | | | | | + ::= "ormDirect(" "," ")" + ::= "ormCheckTable(" "," ")" + ::= "ormCreateTable(" "," "," "," ")" + +/* ormAccessSelect(fields, tableName, selector, varTarget) */ + ::= "ormAccessSelect(" "," "," [] "," ")" + ::= "*" | + +/* ormAccessInsert(fieldsValues, tableName, varTarget) */ + ::= "ormAccessInsert(" "," "," ")" + +/* ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget) */ + ::= "ormAccessUpdate(" "," "," "," "," ")" \ No newline at end of file diff --git a/ingestion/code/BNF/n06_BNF.txt b/ingestion/code/BNF/n06_BNF.txt new file mode 100644 index 0000000..bbb341b --- /dev/null +++ b/ingestion/code/BNF/n06_BNF.txt @@ -0,0 +1,29 @@ +/* [CORRECCIÓN] Todas las subreglas de están ahora completamente expandidas. 
*/ + ::= | | | | | | + +/* Manipulación de listas y JSON */ + ::= "variableToList(" "," ")" + | "itemFromList(" "," "," ")" + | "variableFromJSON(" "," "," ")" + | "AddVariableToJSON(" "," "," ")" + +/* Criptografía */ + ::= "encodeSHA256(" "," ")" + | "encodeMD5(" "," ")" + +/* Expresiones regulares */ + ::= "getRegex(" "," "," ")" + + ::= "getDateTime(" "," "," "," ")" +/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */ + + ::= "stampToDatetime(" "," "," "," ")" +/* Argumentos: epoch_origen, formato, timedelta, destino */ + | "getTimeStamp(" "," "," "," ")" +/* Argumentos: fecha_string, formato_entrada, timedelta, destino */ + + ::= "randomString(" "," ")" +/* Argumentos: longitud, destino */ + + ::= "replace(" "," "," "," ")" +/* Argumentos: origen, patron_busqueda, reemplazo, destino */ \ No newline at end of file diff --git a/ingestion/code/BNF/n07_BNF.txt b/ingestion/code/BNF/n07_BNF.txt new file mode 100644 index 0000000..630a99f --- /dev/null +++ b/ingestion/code/BNF/n07_BNF.txt @@ -0,0 +1,9 @@ +/* Nota: las funciones utilizan llaves {} como delimitadores de bloque por decisión + arquitectónica explícita, diferenciándose de las estructuras de control (if, loop, try) + que usan palabras clave de cierre (end(), endLoop()). Ambos patrones coexisten + en la gramática y el parser los distingue por el token de apertura. 
*/ + ::= "function" "(" [] ")" "{" + + "}" + ::= ("," )* + ::= "return(" [] ")" \ No newline at end of file diff --git a/ingestion/code/BNF/n08_BNF.txt b/ingestion/code/BNF/n08_BNF.txt new file mode 100644 index 0000000..e42159f --- /dev/null +++ b/ingestion/code/BNF/n08_BNF.txt @@ -0,0 +1,3 @@ + ::= | + ::= "include" " " + ::= "import" " " ( "<" ">" | ) \ No newline at end of file diff --git a/ingestion/code/BNF/n09_BNF.txt b/ingestion/code/BNF/n09_BNF.txt new file mode 100644 index 0000000..90c683e --- /dev/null +++ b/ingestion/code/BNF/n09_BNF.txt @@ -0,0 +1,62 @@ +/* Jerarquía de Expresiones (Precedencia de menor a mayor) */ + ::= + ::= ( "or" )* + ::= ( "and" )* + ::= "not" | + + ::= ( )* + ::= "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is" + + ::= ( ( "+" | "-" ) )* + ::= ( ( "*" | "/" | "%" ) )* + ::= ( "+" | "-" ) | + ::= [ "**" ] + +/* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones) + La regla cubre también el acceso a métodos de objetos conector + (conector.metodo(...)) y el acceso por clave a sus resultados (resultado["key"]) */ + ::= + | "." + | "[" "]" + | "[" [] ":" [] [":" []] "]" + | "(" [] ")" + + ::= + | "$" + | + | "(" ")" + | + | + +/* Estructuras de Datos, Comprensiones y Argumentos */ + ::= "[" [] "]" + | "[" "for" "in" [] "]" + ::= "if" + ::= "{" [] "}" + ::= ( "," )* + ::= ":" + ::= ( "," )* + +/* Tipo numérico unificado */ + ::= | + +/* Literales (Tipos de Datos Primitivos Soportados) */ + ::= | | | "None" + ::= "True" | "False" + ::= [0-9]+ + ::= [0-9]+ "." [0-9]* | "." 
[0-9]+ + +/* Cadenas de Texto con soporte de secuencias de escape */ + ::= "\"" "\"" | "'" "'" + ::= "\\" ( "\"" | "'" | "\\" | "n" | "t" | "r" | "0" ) + ::= ( [^"\\] | )* + ::= ( [^'\\] | )* + ::= | + +/* Reglas de Comentarios para el Lexer + El lexer aplica longest-match: /// debe evaluarse ANTES que // */ + ::= "///" + ::= "//" + ::= "/*" "*/" + ::= [^\r\n]* + ::= /* Cualquier secuencia de caracteres que no contenga la subcadena "*/" */ \ No newline at end of file diff --git a/ingestion/code/n01_BNF.txt b/ingestion/code/n01_BNF.txt new file mode 100644 index 0000000..2763ab2 --- /dev/null +++ b/ingestion/code/n01_BNF.txt @@ -0,0 +1,42 @@ + ::= ( | )* + ::= [ ] [ | ] + | ( | ) + ::= /* Retorno de carro / Salto de línea (\n o \r\n) */ + + ::= + | + | + | + | + | + | + | + | + | + | + | + | + | + + ::= "=" + +/* Llamada a función global (sin receptor de objeto) */ + ::= "(" [] ")" + +/* Llamada a método sobre un objeto conector (con receptor) */ + ::= "=" "." "(" [] ")" + + ::= | + ::= "registerEndpoint(" "," "," "," "," "," ")" +/* addVar asigna un valor a una variable. Acepta (valor, variable) o (variable, valor). + Si ambos argumentos son identificadores, el valor del segundo se asigna al primero. + No está permitido pasar dos literales como argumentos. */ + ::= "addVar(" "," ")" + ::= | | "$" +/* Restricción semántica: al menos uno de los dos debe ser */ + + ::= [a-zA-Z_] [a-zA-Z0-9_]* + +/* Variables de sistema reservadas — accesibles y asignables desde cualquier scope: + _status — código HTTP de respuesta (ej. 
addVar(_status, 401) o _status = 404) */ + ::= "_status" \ No newline at end of file diff --git a/ingestion/code/n02_BNF.txt b/ingestion/code/n02_BNF.txt new file mode 100644 index 0000000..dcb404a --- /dev/null +++ b/ingestion/code/n02_BNF.txt @@ -0,0 +1,5 @@ + ::= | | | + ::= "addParam(" "," ")" + ::= "getListLen(" "," ")" + ::= "getQueryParamList(" "," ")" + ::= "addResult(" ")" \ No newline at end of file diff --git a/ingestion/code/n03_BNF.txt b/ingestion/code/n03_BNF.txt new file mode 100644 index 0000000..651582d --- /dev/null +++ b/ingestion/code/n03_BNF.txt @@ -0,0 +1,28 @@ + ::= | | + + ::= "if(" ")" + + [ "else()" ] + "end()" + +/* if() soporta dos modos: + Modo 1 — comparación estructurada: los dos primeros argumentos deben ser + identificadores simples o literales, nunca expresiones de acceso. + Si se necesita comparar un valor extraído de una estructura (ej. dict['clave']), + debe asignarse previamente a una variable. + Modo 2 — expresión libre: None, None, expresión compleja como string */ + ::= "," "," + | "None" "," "None" "," + ::= | + + ::= "startLoop(" "," "," ")" + + "endLoop()" + + ::= "try()" + + "exception(" ")" + + "end()" + + ::= * \ No newline at end of file diff --git a/ingestion/code/n04_BNF.txt b/ingestion/code/n04_BNF.txt new file mode 100644 index 0000000..253a0a8 --- /dev/null +++ b/ingestion/code/n04_BNF.txt @@ -0,0 +1,3 @@ + ::= | + ::= "=" "go" "(" [] ")" + ::= "=" "gather(" ["," ] ")" \ No newline at end of file diff --git a/ingestion/code/n05_BNF.txt b/ingestion/code/n05_BNF.txt new file mode 100644 index 0000000..2563bca --- /dev/null +++ b/ingestion/code/n05_BNF.txt @@ -0,0 +1,25 @@ +/* Instanciación de conector de terceros y llamada a sus métodos dinámicos */ + ::= | + ::= "=" "avapConnector(" ")" + ::= [ "=" ] "." 
"(" [] ")" + +/* Cliente HTTP con Timeout Obligatorio */ + ::= | + ::= "RequestPost(" "," "," "," "," "," ")" + ::= "RequestGet(" "," "," "," "," ")" + +/* ORM y Persistencia (Estandarizado con tableName) */ + ::= | | | | | + ::= "ormDirect(" "," ")" + ::= "ormCheckTable(" "," ")" + ::= "ormCreateTable(" "," "," "," ")" + +/* ormAccessSelect(fields, tableName, selector, varTarget) */ + ::= "ormAccessSelect(" "," "," [] "," ")" + ::= "*" | + +/* ormAccessInsert(fieldsValues, tableName, varTarget) */ + ::= "ormAccessInsert(" "," "," ")" + +/* ormAccessUpdate(fields, fieldsValues, tableName, selector, varTarget) */ + ::= "ormAccessUpdate(" "," "," "," "," ")" \ No newline at end of file diff --git a/ingestion/code/n06_BNF.txt b/ingestion/code/n06_BNF.txt new file mode 100644 index 0000000..bbb341b --- /dev/null +++ b/ingestion/code/n06_BNF.txt @@ -0,0 +1,29 @@ +/* [CORRECCIÓN] Todas las subreglas de están ahora completamente expandidas. */ + ::= | | | | | | + +/* Manipulación de listas y JSON */ + ::= "variableToList(" "," ")" + | "itemFromList(" "," "," ")" + | "variableFromJSON(" "," "," ")" + | "AddVariableToJSON(" "," "," ")" + +/* Criptografía */ + ::= "encodeSHA256(" "," ")" + | "encodeMD5(" "," ")" + +/* Expresiones regulares */ + ::= "getRegex(" "," "," ")" + + ::= "getDateTime(" "," "," "," ")" +/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */ + + ::= "stampToDatetime(" "," "," "," ")" +/* Argumentos: epoch_origen, formato, timedelta, destino */ + | "getTimeStamp(" "," "," "," ")" +/* Argumentos: fecha_string, formato_entrada, timedelta, destino */ + + ::= "randomString(" "," ")" +/* Argumentos: longitud, destino */ + + ::= "replace(" "," "," "," ")" +/* Argumentos: origen, patron_busqueda, reemplazo, destino */ \ No newline at end of file diff --git a/ingestion/code/n07_BNF.txt b/ingestion/code/n07_BNF.txt new file mode 100644 index 0000000..630a99f --- /dev/null +++ b/ingestion/code/n07_BNF.txt @@ -0,0 +1,9 @@ +/* Nota: las funciones 
utilizan llaves {} como delimitadores de bloque por decisión + arquitectónica explícita, diferenciándose de las estructuras de control (if, loop, try) + que usan palabras clave de cierre (end(), endLoop()). Ambos patrones coexisten + en la gramática y el parser los distingue por el token de apertura. */ + ::= "function" "(" [] ")" "{" + + "}" + ::= ("," )* + ::= "return(" [] ")" \ No newline at end of file diff --git a/ingestion/code/n08_BNF.txt b/ingestion/code/n08_BNF.txt new file mode 100644 index 0000000..e42159f --- /dev/null +++ b/ingestion/code/n08_BNF.txt @@ -0,0 +1,3 @@ + ::= | + ::= "include" " " + ::= "import" " " ( "<" ">" | ) \ No newline at end of file diff --git a/ingestion/code/n09_BNF.txt b/ingestion/code/n09_BNF.txt new file mode 100644 index 0000000..90c683e --- /dev/null +++ b/ingestion/code/n09_BNF.txt @@ -0,0 +1,62 @@ +/* Jerarquía de Expresiones (Precedencia de menor a mayor) */ + ::= + ::= ( "or" )* + ::= ( "and" )* + ::= "not" | + + ::= ( )* + ::= "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is" + + ::= ( ( "+" | "-" ) )* + ::= ( ( "*" | "/" | "%" ) )* + ::= ( "+" | "-" ) | + ::= [ "**" ] + +/* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones) + La regla cubre también el acceso a métodos de objetos conector + (conector.metodo(...)) y el acceso por clave a sus resultados (resultado["key"]) */ + ::= + | "." + | "[" "]" + | "[" [] ":" [] [":" []] "]" + | "(" [] ")" + + ::= + | "$" + | + | "(" ")" + | + | + +/* Estructuras de Datos, Comprensiones y Argumentos */ + ::= "[" [] "]" + | "[" "for" "in" [] "]" + ::= "if" + ::= "{" [] "}" + ::= ( "," )* + ::= ":" + ::= ( "," )* + +/* Tipo numérico unificado */ + ::= | + +/* Literales (Tipos de Datos Primitivos Soportados) */ + ::= | | | "None" + ::= "True" | "False" + ::= [0-9]+ + ::= [0-9]+ "." [0-9]* | "." 
[0-9]+ + +/* Cadenas de Texto con soporte de secuencias de escape */ + ::= "\"" "\"" | "'" "'" + ::= "\\" ( "\"" | "'" | "\\" | "n" | "t" | "r" | "0" ) + ::= ( [^"\\] | )* + ::= ( [^'\\] | )* + ::= | + +/* Reglas de Comentarios para el Lexer + El lexer aplica longest-match: /// debe evaluarse ANTES que // */ + ::= "///" + ::= "//" + ::= "/*" "*/" + ::= [^\r\n]* + ::= /* Cualquier secuencia de caracteres que no contenga la subcadena "*/" */ \ No newline at end of file diff --git a/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb index 01de0c6..3897027 100644 --- a/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb +++ b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "0a8abbfa", "metadata": {}, "outputs": [ @@ -24,7 +24,6 @@ "from dataclasses import dataclass\n", "from pathlib import Path\n", "from typing import Any, Dict, List, Optional, Tuple\n", - "# from bnf import grammar\n", "import nltk\n", "from elasticsearch import Elasticsearch\n", "from langchain_core.documents import Document\n", @@ -185,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "26ad9c81", "metadata": {}, "outputs": [ @@ -209,7 +208,7 @@ } ], "source": [ - "grammar_ = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n", + "grammar_ = (settings.data_path / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n", " encoding=\"utf-8\"\n", ")\n", "grammar(grammar_)" diff --git a/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb b/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb index 785549e..faf385f 100644 --- a/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb +++ b/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb @@ -2,24 +2,55 @@ "cells": [ { "cell_type": 
"code", - "execution_count": 51, + "execution_count": 1, + "id": "5b646fb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "! uv pip install bnf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "274d6d68", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "! uv pip install ebnf" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "0a8abbfa", "metadata": {}, "outputs": [], "source": [ "import re\n", - "\n", "from dataclasses import dataclass\n", - "\n", "from typing import Any, Dict, List, Optional, Tuple\n", - "\n", "from lark import Tree, Lark\n", - "\n", - "\n", "from bnf import grammar as bnf_grammar, parse as bnf_parse\n", - "from ebnf import grammar as ebnf_grammar, parse as ebnf_parse\n", - "\n", - "from src.config import DATA_DIR" + "from src.config import settings" ] }, { diff --git a/scripts/pipelines/flows/bnf_files_generator.py b/scripts/pipelines/flows/bnf_files_generator.py new file mode 100644 index 0000000..81e14e1 --- /dev/null +++ b/scripts/pipelines/flows/bnf_files_generator.py @@ -0,0 +1,244 @@ +""" +Generator for BNF specification files from AVAP documentation. + +This script extracts BNF specifications from the AVAP Language Reference Manual (LRM) +and generates individual text files for each BNF section. 
class BNFExtractor:
    """Extract BNF specifications from AVAP markdown documentation.

    The markdown is scanned for headers of the form
    ``### Especificación BNF (Sección <ROMAN>)``. For each header, the first
    fenced ``bnf`` code block that appears *before the next such header* is
    taken as that section's grammar and can be written to ``nXX_BNF.txt``.
    """

    # Section header; the Roman numeral is captured in group 1.
    _HEADER_RE = re.compile(r"### Especificación BNF \(Sección ([IVXLCDM]+)\)")
    # Fenced ```bnf block; group 1 is the raw grammar text. ``\r?`` lets the
    # pattern also match content that still carries CRLF line endings.
    _CODE_BLOCK_RE = re.compile(r"```bnf\r?\n(.*?)```", re.DOTALL)
    # Value of each Roman digit.
    _ROMAN_VALUES = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50,
        'C': 100, 'D': 500, 'M': 1000,
    }

    def __init__(self, markdown_file: Path, output_dir: Path):
        """
        Initialize BNF extractor.

        Args:
            markdown_file: Path to the AVAP markdown file
            output_dir: Directory where BNF files will be saved
        """
        self.markdown_file = markdown_file
        self.output_dir = output_dir
        # Filled by extract_bnf_sections(): (section_number, title, bnf_content).
        self.bnf_sections: List[Tuple[int, str, str]] = []

    @staticmethod
    def _roman_to_int(roman: str) -> int:
        """
        Convert Roman numerals to integers.

        Characters that are not Roman digits contribute 0 to the result.

        Args:
            roman: Roman numeral string (e.g., 'I', 'IV', 'IX', 'XII')

        Returns:
            Integer value of the Roman numeral
        """
        total = 0
        prev_value = 0

        # Walk right to left: a digit smaller than its right-hand neighbour
        # is subtractive (the I in IV), otherwise it is additive.
        for char in reversed(roman):
            value = BNFExtractor._ROMAN_VALUES.get(char, 0)
            if value < prev_value:
                total -= value
            else:
                total += value
            prev_value = value

        return total

    @staticmethod
    def _filename(section_number: int) -> str:
        """Return the output file name for a section, e.g. ``n03_BNF.txt``."""
        return f"n{section_number:02d}_BNF.txt"

    def read_markdown_file(self) -> str:
        """Read the markdown file content as UTF-8 text."""
        return Path(self.markdown_file).read_text(encoding="utf-8")

    def extract_bnf_sections(self, content: str) -> List[Tuple[int, str, str]]:
        """
        Extract all BNF specifications from markdown content.

        Pattern: ### Especificación BNF (Sección I)
                 ```bnf
                 ... BNF content ...
                 ```

        The search for a section's code block stops at the next BNF header,
        so a header without its own ``bnf`` block is skipped instead of
        being paired with the block of a later section.

        Args:
            content: Markdown file content

        Returns:
            List of tuples: (section_number, section_title, bnf_content),
            in document order. The same list is stored in ``self.bnf_sections``.
        """
        headers = list(self._HEADER_RE.finditer(content))
        bnf_sections: List[Tuple[int, str, str]] = []

        for index, header in enumerate(headers):
            # Upper bound of this section: start of the next header, or EOF.
            if index + 1 < len(headers):
                section_end = headers[index + 1].start()
            else:
                section_end = len(content)

            code_match = self._CODE_BLOCK_RE.search(
                content, header.end(), section_end
            )
            if code_match is None:
                continue

            roman_numeral = header.group(1)
            bnf_sections.append((
                self._roman_to_int(roman_numeral),
                f"Especificación BNF - Sección {roman_numeral}",
                code_match.group(1).strip(),
            ))

        self.bnf_sections = bnf_sections
        return bnf_sections

    def format_bnf_file_content(self, section_number: int, title: str, bnf_content: str) -> str:
        """
        Format BNF content for file output.

        Currently a pass-through: ``section_number`` and ``title`` are accepted
        but not used, and the grammar text is returned unchanged.

        Args:
            section_number: Section number (1-9, etc.)
            title: Section title
            bnf_content: Raw BNF grammar content

        Returns:
            BNF content without additional formatting
        """
        return bnf_content

    def save_bnf_files(self) -> int:
        """
        Save extracted BNF sections to individual files.

        File naming convention: n0X_BNF.txt (e.g., n01_BNF.txt, n02_BNF.txt, etc.)
        The output directory is created if it does not exist, and existing
        files with the same name are overwritten.

        Returns:
            Number of files written (one per entry in ``self.bnf_sections``)
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        files_created = 0
        for section_number, title, bnf_content in self.bnf_sections:
            filepath = self.output_dir / self._filename(section_number)
            filepath.write_text(
                self.format_bnf_file_content(section_number, title, bnf_content),
                encoding="utf-8",
            )
            print(f"Created: {filepath}")
            files_created += 1

        return files_created

    def generate(self) -> Tuple[int, List[str]]:
        """
        Execute the complete BNF extraction and file generation process.

        Reads the markdown file, extracts the sections, writes one file per
        section and prints progress to stdout along the way.

        Returns:
            Tuple of (number_of_files_created, list_of_file_paths)
        """
        print(f"Reading markdown file: {self.markdown_file}")
        content = self.read_markdown_file()

        print("Extracting BNF specifications...")
        bnf_sections = self.extract_bnf_sections(content)

        print(f"Found {len(bnf_sections)} BNF sections:")
        for _, title, _ in bnf_sections:
            print(f"  - {title}")

        print(f"\nSaving BNF files to: {self.output_dir}")
        files_created = self.save_bnf_files()

        # Paths are rebuilt with the same naming helper save_bnf_files() uses.
        file_paths = [
            str(self.output_dir / self._filename(section_number))
            for section_number, _, _ in bnf_sections
        ]

        return files_created, file_paths
@app.command()
def main(
    markdown_file: str = typer.Option(
        "docs/LRM/avap.md",
        "--markdown",
        "-m",
        help="Path to AVAP markdown file (relative to project root)"
    ),
    output_dir: str = typer.Option(
        "ingestion/code/BNF/",
        "--output",
        "-o",
        help="Output directory for BNF files (relative to project root)"
    )
):
    """Extract BNF specifications from AVAP documentation.

    Default behavior:
    - Reads from: docs/LRM/avap.md
    - Writes to: ingestion/code/BNF/
    """
    # This file lives in scripts/pipelines/flows/, so the project root is
    # three directories above it. Resolve first so the result does not
    # depend on how the script path was spelled on the command line.
    project_root = Path(__file__).resolve().parents[3]

    # Relative arguments are anchored at the project root.
    markdown_path = project_root / markdown_file
    output_path = project_root / output_dir

    # is_file() rather than exists(): a directory at this path would pass an
    # existence check and then fail when opened for reading.
    if not markdown_path.is_file():
        typer.echo(f"Error: Markdown file not found: {markdown_path}", err=True)
        raise typer.Exit(code=1)

    extractor = BNFExtractor(markdown_path, output_path)
    files_created, _ = extractor.generate()

    if files_created == 0:
        # Nothing matched the section-header pattern; make that visible.
        typer.echo(
            f"Warning: no BNF sections found in {markdown_path}", err=True
        )

    typer.echo("\n" + "=" * 80)
    typer.echo("BNF extraction complete!")
    typer.echo(f"Total files created: {files_created}")
    typer.echo(f"Output directory: {output_path}")
    typer.echo("=" * 80)


if __name__ == "__main__":
    app()