508 lines
15 KiB
Plaintext
508 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 67,
|
|
"id": "5b646fb1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n",
|
|
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"! uv pip install bnf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 68,
|
|
"id": "274d6d68",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n",
|
|
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"! uv pip install ebnf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 69,
|
|
"id": "0a8abbfa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"from dataclasses import dataclass\n",
|
|
"import pprint\n",
|
|
"from typing import Any, Dict, List, Optional, Tuple\n",
|
|
"from lark import Tree, Lark\n",
|
|
"from bnf import grammar as bnf_grammar, parse as bnf_parse\n",
|
|
"from src.config import settings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "baa779f3",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Functions"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"id": "26927d0c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import re\n",
|
|
"\n",
|
|
"def bnf_to_lark(bnf_text: str) -> str:\n",
"    \"\"\"Convert BNF grammar text into Lark grammar text.\n",
"\n",
"    The conversion is purely textual: ``/* ... */`` comments are removed,\n",
"    ``<name>`` becomes ``name``, ``::=`` becomes ``:``, a fixed set of\n",
"    lexer-level names is upper-cased, the BNF definitions of those names are\n",
"    dropped, and a fixed footer with Lark terminals and stub rules is appended.\n",
"\n",
"    Args:\n",
"        bnf_text: Raw BNF source.\n",
"\n",
"    Returns:\n",
"        The generated Lark grammar as one string.\n",
"    \"\"\"\n",
"    # 1. Strip /* ... */ comments. Double-quoted literals are matched first and\n",
"    #    kept, so a rule like `\"/*\" <any_content> \"*/\"` is not eaten as a comment.\n",
"    comment_or_string = re.compile(r'\"(?:[^\"\\\\\\n]|\\\\.)*\"|/\\*[\\s\\S]*?\\*/')\n",
"    text = comment_or_string.sub(\n",
"        lambda m: m.group(0) if m.group(0).startswith('\"') else \"\",\n",
"        bnf_text,\n",
"    )\n",
"\n",
"    # 2. Structural rewrite: <name> -> name, ::= -> :\n",
"    text = re.sub(r\"<([^>]+)>\", r\"\\1\", text)\n",
"    text = text.replace(\"::=\", \":\")\n",
"\n",
"    # 3. Keep blank lines and lines that look like (part of) a rule, i.e. that\n",
"    #    contain ':' or '|'; purely descriptive lines are dropped.\n",
"    stripped_lines = (raw.strip() for raw in text.split(\"\\n\"))\n",
"    text = \"\\n\".join(\n",
"        ln for ln in stripped_lines if ln == \"\" or \":\" in ln or \"|\" in ln\n",
"    )\n",
"\n",
"    # 4. Lark expects terminal names in upper case.\n",
"    terminals = [\n",
"        \"identifier\", \"stringliteral\", \"eol\", \"doc_comment\",\n",
"        \"line_comment\", \"block_comment\", \"any_text\", \"any_content\", \"number\",\n",
"    ]\n",
"    for name in terminals:\n",
"        text = re.sub(rf\"\\b{name}\\b\", name.upper(), text)\n",
"\n",
"    # 5. Drop the BNF definition of every one of those names, together with\n",
"    #    its `|` continuation lines. The footer below is the only definition;\n",
"    #    a leftover second one (e.g. DOC_COMMENT) would define the same name twice.\n",
"    for name in terminals:\n",
"        text = re.sub(\n",
"            rf\"^{name.upper()}[ \\t]*:.*(?:\\n\\|.*)*\", \"\", text, flags=re.MULTILINE\n",
"        )\n",
"    text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text)\n",
"\n",
"    # The footer is a raw string, so every regex escape is written exactly once.\n",
"    footer = r\"\"\"\n",
"// --- TERMINALES DEFINITIVOS ---\n",
"IDENTIFIER: /[a-zA-Z_][a-zA-Z0-9_]*/\n",
"STRINGLITERAL: /\"([^\"\\\\]|\\\\.)*\"/\n",
"NUMBER: /\\d+(\\.\\d+)?/\n",
"EOL: /\\r?\\n/\n",
"DOC_COMMENT.2: \"///\" /[^\\r\\n]*/\n",
"LINE_COMMENT: \"//\" /[^\\r\\n]*/\n",
"BLOCK_COMMENT: \"/*\" /(.|\\n)*?/ \"*/\"\n",
"\n",
"// --- REGLAS DE APOYO ---\n",
"expression: IDENTIFIER | STRINGLITERAL | NUMBER | list_display\n",
"literal: STRINGLITERAL | NUMBER\n",
"argument_list: expression ( \",\" expression )*\n",
"list_display: \"[\" [argument_list] \"]\"\n",
"\n",
"// --- STUBS PARA REGLAS MENCIONADAS ---\n",
"function_decl: \"def\" IDENTIFIER \"(\" [argument_list] \")\" \":\" EOL\n",
"return_stmt: \"return\" [expression]\n",
"io_command: \"print\" \"(\" expression \")\"\n",
"control_flow: \"if\" expression \":\" EOL\n",
"async_command: \"async\" statement\n",
"connector_cmd: \"connect\" \"(\" \")\"\n",
"db_command: \"db\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n",
"http_command: \"http\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n",
"util_command: \"util\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n",
"modularity_cmd: \"import\" IDENTIFIER\n",
"\n",
"%import common.WS_INLINE\n",
"%ignore WS_INLINE\n",
"\"\"\"\n",
"    return text.strip() + \"\\n\" + footer.strip()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 71,
|
|
"id": "89be8bf6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"@dataclass\n",
"class Chunk:\n",
"    \"\"\"One piece of source text produced by the chunking functions.\"\"\"\n",
"\n",
"    # Stripped slice of the parsed code covered by the originating node.\n",
"    text: str\n",
"    # \"line\" for atomic statement chunks, otherwise the block node's rule name.\n",
"    kind: str\n",
"    # Free-form details; the chunkers store \"granularity\" (and \"command\" for lines).\n",
"    metadata: Dict[str, Any]\n",
|
|
"\n",
|
|
"def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
"    \"\"\"Return ``(start_pos, end_pos)`` read from ``node.meta``.\n",
"\n",
"    ``None`` is returned when either attribute is absent or ``None``.\n",
"    \"\"\"\n",
"    meta = node.meta\n",
"    start = getattr(meta, \"start_pos\", None)\n",
"    end = getattr(meta, \"end_pos\", None)\n",
"    return None if start is None or end is None else (start, end)\n",
|
|
"\n",
|
|
"def _iter_trees(t: Tree):\n",
"    \"\"\"Yield ``t`` and every nested ``Tree`` in depth-first pre-order.\n",
"\n",
"    Children that are not ``Tree`` instances (tokens) are skipped.\n",
"    \"\"\"\n",
"    pending = [t]\n",
"    while pending:\n",
"        current = pending.pop()\n",
"        yield current\n",
"        # Push in reverse so the left-most subtree is visited first.\n",
"        pending.extend(\n",
"            child for child in reversed(current.children) if isinstance(child, Tree)\n",
"        )\n",
|
|
"\n",
|
|
"def _cmd_name(line: str) -> Optional[str]:\n",
"    \"\"\"Return the identifier that opens a call at the start of ``line``.\n",
"\n",
"    Leading whitespace is ignored; ``None`` is returned when ``line`` does not\n",
"    begin with ``identifier(``.\n",
"    \"\"\"\n",
"    found = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
"    if not found:\n",
"        return None\n",
"    return found.group(1)\n",
|
|
"\n",
|
|
"_BLOCK_KINDS = (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\")\n",
"\n",
"\n",
"def _matching_texts(code: str, tree: Tree, wanted: Tuple[str, ...]):\n",
"    \"\"\"Yield ``(node, text)`` for every subtree whose ``data`` is in ``wanted``.\n",
"\n",
"    ``text`` is the stripped slice of ``code`` covered by the node; nodes without\n",
"    position info or with an empty slice are skipped.\n",
"    \"\"\"\n",
"    for node in _iter_trees(tree):\n",
"        if node.data not in wanted:\n",
"            continue\n",
"        span = _span(node)\n",
"        if not span:\n",
"            continue\n",
"        start, end = span\n",
"        text = code[start:end].strip()\n",
"        if text:\n",
"            yield node, text\n",
"\n",
"\n",
"def chunk_atomic_lines(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
"    \"\"\"Return one ``Chunk`` of kind ``\"line\"`` per ``stmt_line`` node.\n",
"\n",
"    Args:\n",
"        code: Source text; node offsets are applied to this exact string.\n",
"        tree: Tree already parsed from ``code``. When omitted, ``code`` is parsed\n",
"            with the notebook-level ``parser``.\n",
"\n",
"    Returns:\n",
"        Chunks whose metadata holds ``granularity=\"atomic\"`` and the ``command``\n",
"        name extracted by ``_cmd_name`` (``None`` if the line is not a call).\n",
"    \"\"\"\n",
"    if tree is None:\n",
"        tree = parser.parse(code)\n",
"    return [\n",
"        Chunk(\n",
"            text=text,\n",
"            kind=\"line\",\n",
"            metadata={\"granularity\": \"atomic\", \"command\": _cmd_name(text)},\n",
"        )\n",
"        for _node, text in _matching_texts(code, tree, (\"stmt_line\",))\n",
"    ]\n",
"\n",
"\n",
"def chunk_blocks(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
"    \"\"\"Return one ``Chunk`` per block node, with ``kind`` set to the node's rule name.\n",
"\n",
"    Args:\n",
"        code: Source text; node offsets are applied to this exact string.\n",
"        tree: Tree already parsed from ``code``. When omitted, ``code`` is parsed\n",
"            with the notebook-level ``parser``.\n",
"\n",
"    Returns:\n",
"        Chunks for the rule names in ``_BLOCK_KINDS``, each with metadata\n",
"        ``{\"granularity\": \"block\"}``.\n",
"    \"\"\"\n",
"    if tree is None:\n",
"        tree = parser.parse(code)\n",
"    return [\n",
"        Chunk(text=text, kind=node.data, metadata={\"granularity\": \"block\"})\n",
"        for node, text in _matching_texts(code, tree, _BLOCK_KINDS)\n",
"    ]\n",
"\n",
"\n",
"def chunk_avap_code(code: str) -> List[Chunk]:\n",
"    \"\"\"Return the block chunks followed by the atomic line chunks of ``code``.\"\"\"\n",
"    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
"    # Parse once and share the tree between both chunkers.\n",
"    tree = parser.parse(code)\n",
"    return chunk_blocks(code, tree) + chunk_atomic_lines(code, tree)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "23a92e13",
|
|
"metadata": {},
|
|
"source": [
|
|
"# BNF to Lark"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 77,
|
|
"id": "c66842c7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"bnf_path = Path(settings.proj_root / \"ingestion/code/BNF/n01_BNF.txt\")\n",
|
|
"if not bnf_path.exists():\n",
|
|
" raise FileNotFoundError(f\"BNF file not found: {bnf_path}\")\n",
|
|
"\n",
|
|
"bnf_grammar: str = bnf_path.read_text(encoding=\"utf-8\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 78,
|
|
"id": "ebf4aaac",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lark_bnf = bnf_to_lark(bnf_grammar)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 79,
|
|
"id": "8122b603",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"('program : ( line | BLOCK_COMMENT )*\\n'\n",
|
|
" 'line : [ statement ] [ LINE_COMMENT | DOC_COMMENT ] EOL\\n'\n",
|
|
" ' | ( LINE_COMMENT | DOC_COMMENT ) EOL\\n'\n",
|
|
" 'EOL\\n'\n",
|
|
" '\\n'\n",
|
|
" 'DOC_COMMENT : \"///\" ANY_TEXT\\n'\n",
|
|
" 'LINE_COMMENT : \"//\" ANY_TEXT\\n'\n",
|
|
" 'BLOCK_COMMENT : \"/*\" ANY_CONTENT \"*/\"\\n'\n",
|
|
" 'ANY_TEXT\\n'\n",
|
|
" 'ANY_CONTENT\\n'\n",
|
|
" '\\n'\n",
|
|
" 'statement : assignment\\n'\n",
|
|
" ' | method_call_stmt\\n'\n",
|
|
" ' | function_call_stmt\\n'\n",
|
|
" ' | function_decl\\n'\n",
|
|
" ' | return_stmt\\n'\n",
|
|
" ' | system_command\\n'\n",
|
|
" ' | io_command\\n'\n",
|
|
" ' | control_flow\\n'\n",
|
|
" ' | async_command\\n'\n",
|
|
" ' | connector_cmd\\n'\n",
|
|
" ' | db_command\\n'\n",
|
|
" ' | http_command\\n'\n",
|
|
" ' | util_command\\n'\n",
|
|
" ' | modularity_cmd\\n'\n",
|
|
" '\\n'\n",
|
|
" 'assignment : IDENTIFIER \"=\" expression\\n'\n",
|
|
" '\\n'\n",
|
|
" 'function_call_stmt : IDENTIFIER \"(\" [argument_list] \")\"\\n'\n",
|
|
" '\\n'\n",
|
|
" 'method_call_stmt : IDENTIFIER \"=\" IDENTIFIER \".\" IDENTIFIER \"(\" '\n",
|
|
" '[argument_list] \")\"\\n'\n",
|
|
" '\\n'\n",
|
|
" 'system_command : register_cmd | addvar_cmd\\n'\n",
|
|
" 'register_cmd : \"registerEndpoint(\" STRINGLITERAL \",\" STRINGLITERAL \",\" '\n",
|
|
" 'list_display \",\" STRINGLITERAL \",\" IDENTIFIER \",\" IDENTIFIER \")\"\\n'\n",
|
|
" 'addvar_cmd : \"addVar(\" addvar_arg \",\" addvar_arg \")\"\\n'\n",
|
|
" 'addvar_arg : IDENTIFIER | literal | \"$\" IDENTIFIER\\n'\n",
|
|
" '\\n'\n",
|
|
" 'IDENTIFIER\\n'\n",
|
|
" '\\n'\n",
|
|
" 'system_variable : \"_status\"\\n'\n",
|
|
" '// --- TERMINALES (LEXER) ---\\n'\n",
|
|
" 'IDENTIFIER: /[a-zA-Z_][a-zA-Z0-9_]*/\\n'\n",
|
|
" 'STRINGLITERAL: /\"[^\"\\\\r\\\\n]*\"/\\n'\n",
|
|
" 'NUMBER: /\\\\d+(\\\\.\\\\d+)?/\\n'\n",
|
|
" 'EOL: /\\\\r?\\\\n/\\n'\n",
|
|
" 'DOC_COMMENT.2: \"///\" ANY_TEXT\\n'\n",
|
|
" 'LINE_COMMENT: \"//\" ANY_TEXT\\n'\n",
|
|
" 'BLOCK_COMMENT: \"/*\" ANY_CONTENT \"*/\"\\n'\n",
|
|
" 'ANY_TEXT: /[^\\\\r\\\\n]*/\\n'\n",
|
|
" 'ANY_CONTENT: /(.|\\\\n)*?/(?=\\\\*/|$)\\n'\n",
|
|
" '\\n'\n",
|
|
" '// --- REGLAS DE SOPORTE (STUBS) ---\\n'\n",
|
|
" '// Estas reglas deben existir para que el parser no dé error de \"Undefined '\n",
|
|
" 'Rule\"\\n'\n",
|
|
" 'expression: IDENTIFIER | STRINGLITERAL | NUMBER | list_display\\n'\n",
|
|
" 'literal: STRINGLITERAL | NUMBER\\n'\n",
|
|
" 'argument_list: expression ( \",\" expression )*\\n'\n",
|
|
" 'list_display: \"[\" [argument_list] \"]\"\\n'\n",
|
|
" '\\n'\n",
|
|
" '// Stubs para los comandos que mencionas pero no defines en el BNF\\n'\n",
|
|
" 'function_decl: \"def\" IDENTIFIER \"(\" [argument_list] \")\" \":\" EOL\\n'\n",
|
|
" 'return_stmt: \"return\" [expression]\\n'\n",
|
|
" 'io_command: \"print\" \"(\" expression \")\"\\n'\n",
|
|
" 'control_flow: \"if\" expression \":\" EOL\\n'\n",
|
|
" 'async_command: \"async\" statement\\n'\n",
|
|
" 'connector_cmd: \"connect\" \"(\" \")\"\\n'\n",
|
|
" 'db_command: \"db\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\\n'\n",
|
|
" 'http_command: \"http\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\\n'\n",
|
|
" 'util_command: \"util\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\\n'\n",
|
|
" 'modularity_cmd: \"import\" IDENTIFIER\\n'\n",
|
|
" '\\n'\n",
|
|
" '%import common.WS_INLINE\\n'\n",
|
|
" '%ignore WS_INLINE')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"pprint.PrettyPrinter().pprint(lark_bnf)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"id": "993a3d63",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Unexpected token Token('_NL', '\\n\\n') at line 4, column 4.\n",
|
|
"Expected one of: \n",
|
|
"\t* _COLON\n",
|
|
"\t* _DOT\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"try:\n",
"    # The converted grammar's top rule is `program`; it defines no `start` rule.\n",
"    # propagate_positions=True fills node.meta.start_pos / end_pos, which _span() reads.\n",
"    parser = Lark(lark_bnf, parser=\"lalr\", start=\"program\", propagate_positions=True)\n",
"except Exception as e:\n",
"    # Do not leave a parser from an earlier run bound to this name.\n",
"    parser = None\n",
"    print(e)  # shows which symbol is missing and where"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "49953efd",
|
|
"metadata": {},
|
|
"source": [
|
|
"# BNF conversion to EBNF"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "32dbc2c5",
|
|
"metadata": {},
|
|
"source": [
|
|
"# EBNF Check"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 63,
|
|
"id": "37968906",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ebnf_text = r\"\"\"\n",
|
|
"assign ::= name '=' num ;\n",
|
|
"name ::= 'a' | 'b' | 'c' ;\n",
|
|
"num ::= [0-9] ;\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 64,
|
|
"id": "b234f2c4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"BNF: True\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Generating LALR tables\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"ebnf_grammar(ebnf_text)\n",
|
|
"print(\"BNF:\", ebnf_parse(\"a=7\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "66fb8fee",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Lark check EBNF Style"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 54,
|
|
"id": "08e53ccb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ebnf_text = r\"\"\"\n",
|
|
"start: assign\n",
|
|
"\n",
|
|
"assign: name \"=\" num\n",
|
|
"name: \"a\" | \"b\" | \"c\"\n",
|
|
"num: DIGIT\n",
|
|
"\n",
|
|
"DIGIT: /[0-9]/\n",
|
|
"\n",
|
|
"%ignore \" \"\n",
|
|
"\"\"\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 55,
|
|
"id": "52935608",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Tree(Token('RULE', 'start'), [Tree(Token('RULE', 'assign'), [Tree(Token('RULE', 'name'), []), Tree(Token('RULE', 'num'), [Token('DIGIT', '7')])])])\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Use a dedicated name: rebinding the notebook-level `parser` here would make\n",
"# chunk_atomic_lines() / chunk_blocks() parse with this toy grammar instead.\n",
"ebnf_style_parser = Lark(ebnf_text)\n",
"\n",
"print(ebnf_style_parser.parse(\"a=7\"))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "assistance-engine",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|