{ "cells": [ { "cell_type": "code", "execution_count": 67, "id": "5b646fb1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" ] } ], "source": [ "! uv pip install bnf" ] }, { "cell_type": "code", "execution_count": 68, "id": "274d6d68", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" ] } ], "source": [ "! uv pip install ebnf" ] }, { "cell_type": "code", "execution_count": 69, "id": "0a8abbfa", "metadata": {}, "outputs": [], "source": [ "import re\n", "from dataclasses import dataclass\n", "import pprint\n", "from typing import Any, Dict, List, Optional, Tuple\n", "from lark import Tree, Lark\n", "from bnf import grammar as bnf_grammar, parse as bnf_parse\n", "from src.config import settings" ] }, { "cell_type": "markdown", "id": "baa779f3", "metadata": {}, "source": [ "# Functions" ] }, { "cell_type": "code", "execution_count": 81, "id": "26927d0c", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def bnf_to_lark(bnf_text):\n", " # 1. ELIMINAR COMENTARIOS HUMANOS (/* ... */) COMPLETAMENTE\n", " # Limpiamos cualquier texto entre /* y */ antes de procesar el BNF\n", " text = re.sub(r\"/\\*.*?\\*/\", \"\", bnf_text, flags=re.DOTALL)\n", "\n", " # 2. TRANSFORMACIÓN ESTRUCTURAL\n", " text = re.sub(r\"<([^>]+)>\", r\"\\1\", text) # Quitar < >\n", " text = text.replace(\"::=\", \":\") # Cambiar ::= por :\n", "\n", " # 3. 
LIMPIEZA DE LÍNEAS RESIDUALES\n", " # Eliminamos líneas que quedaron vacías o solo con texto descriptivo\n", " lines = []\n", " for line in text.split('\\n'):\n", " line = line.strip()\n", " # Solo conservamos líneas que parezcan reglas (tengan :) o sean parte de una definición\n", " if \":\" in line or \"|\" in line or line.startswith(\" \") or line == \"\":\n", " lines.append(line)\n", " text = \"\\n\".join(lines)\n", "\n", " # 4. FORZAR MAYÚSCULAS PARA TERMINALES (LEXER)\n", " # Lark LALR necesita tokens en MAYÚSCULAS\n", " terminals = [\n", " 'identifier', 'stringliteral', 'eol', 'doc_comment', \n", " 'line_comment', 'block_comment', 'any_text', 'any_content', 'number'\n", " ]\n", " for t in terminals:\n", " text = re.sub(rf'\\b{t}\\b', t.upper(), text)\n", "\n", " # 5. INYECTAR DEFINICIONES LARK VÁLIDAS\n", " # Sustituimos definiciones rotas por las correctas al final\n", " text = re.sub(r\"EOL\\s*:.*\", \"\", text)\n", " text = re.sub(r\"ANY_TEXT\\s*:.*\", \"\", text)\n", " text = re.sub(r\"ANY_CONTENT\\s*:.*\", \"\", text)\n", " text = re.sub(r\"IDENTIFIER\\s*:.*\", \"\", text)\n", "\n", " footer = r\"\"\"\n", "// --- TERMINALES DEFINITIVOS ---\n", "IDENTIFIER: /[a-zA-Z_][a-zA-Z0-9_]*/\n", "STRINGLITERAL: /\"([^\"\\\\\\\\]|\\\\\\\\.)*\"/\n", "NUMBER: /\\d+(\\.\\d+)?/\n", "EOL: /\\r?\\n/\n", "DOC_COMMENT.2: \"///\" /[^\\r\\n]*/\n", "LINE_COMMENT: \"//\" /[^\\r\\n]*/\n", "BLOCK_COMMENT: \"/*\" /(.|\\n)*?/ \"*/\"\n", "\n", "// --- REGLAS DE APOYO ---\n", "expression: IDENTIFIER | STRINGLITERAL | NUMBER | list_display\n", "literal: STRINGLITERAL | NUMBER\n", "argument_list: expression ( \",\" expression )*\n", "list_display: \"[\" [argument_list] \"]\"\n", "\n", "// --- STUBS PARA REGLAS MENCIONADAS ---\n", "function_decl: \"def\" IDENTIFIER \"(\" [argument_list] \")\" \":\" EOL\n", "return_stmt: \"return\" [expression]\n", "io_command: \"print\" \"(\" expression \")\"\n", "control_flow: \"if\" expression \":\" EOL\n", "async_command: \"async\" statement\n", 
"connector_cmd: \"connect\" \"(\" \")\"\n", "db_command: \"db\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n", "http_command: \"http\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n", "util_command: \"util\" \".\" IDENTIFIER \"(\" [argument_list] \")\"\n", "modularity_cmd: \"import\" IDENTIFIER\n", "\n", "%import common.WS_INLINE\n", "%ignore WS_INLINE\n", "\"\"\"\n", " return text.strip() + \"\\n\" + footer.strip()\n" ] }, { "cell_type": "code", "execution_count": 71, "id": "89be8bf6", "metadata": {}, "outputs": [], "source": [ "@dataclass\n", "class Chunk:\n", " text: str\n", " kind: str\n", " metadata: Dict[str, Any]\n", "\n", "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n", " m = node.meta\n", " s = getattr(m, \"start_pos\", None)\n", " e = getattr(m, \"end_pos\", None)\n", " if s is None or e is None:\n", " return None\n", " return s, e\n", "\n", "def _iter_trees(t: Tree):\n", " yield t\n", " for c in t.children:\n", " if isinstance(c, Tree):\n", " yield from _iter_trees(c)\n", "\n", "def _cmd_name(line: str) -> Optional[str]:\n", " m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n", " return m.group(1) if m else None\n", "\n", "def chunk_atomic_lines(code: str) -> List[Chunk]:\n", " tree = parser.parse(code)\n", " chunks: List[Chunk] = []\n", "\n", " for node in _iter_trees(tree):\n", " if node.data == \"stmt_line\":\n", " sp = _span(node)\n", " if not sp:\n", " continue\n", " s, e = sp\n", " text = code[s:e].strip()\n", " if not text:\n", " continue\n", "\n", " chunks.append(\n", " Chunk(\n", " text=text,\n", " kind=\"line\",\n", " metadata={\n", " \"granularity\": \"atomic\",\n", " \"command\": _cmd_name(text)\n", " }\n", " )\n", " )\n", " return chunks\n", "\n", "def chunk_blocks(code: str) -> List[Chunk]:\n", " tree = parser.parse(code)\n", " chunks: List[Chunk] = []\n", "\n", " for node in _iter_trees(tree):\n", " if node.data in (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\"):\n", " sp = 
_span(node)\n", " if not sp:\n", " continue\n", " s, e = sp\n", " text = code[s:e].strip()\n", " if not text:\n", " continue\n", "\n", " chunks.append(\n", " Chunk(\n", " text=text,\n", " kind=node.data,\n", " metadata={\"granularity\": \"block\"}\n", " )\n", " )\n", " return chunks\n", "\n", "def chunk_avap_code(code: str) -> List[Chunk]:\n", " # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n", " blocks = chunk_blocks(code)\n", " lines = chunk_atomic_lines(code)\n", " return blocks + lines" ] }, { "cell_type": "markdown", "id": "23a92e13", "metadata": {}, "source": [ "# BNF to Lark" ] }, { "cell_type": "code", "execution_count": 77, "id": "c66842c7", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "bnf_path = Path(settings.proj_root / \"ingestion/code/BNF/n01_BNF.txt\")\n", "if not bnf_path.exists():\n", " raise FileNotFoundError(f\"BNF file not found: {bnf_path}\")\n", "\n", "bnf_grammar: str = bnf_path.read_text(encoding=\"utf-8\")" ] }, { "cell_type": "code", "execution_count": 78, "id": "ebf4aaac", "metadata": {}, "outputs": [], "source": [ "lark_bnf = bnf_to_lark(bnf_grammar)" ] }, { "cell_type": "code", "execution_count": 79, "id": "8122b603", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('program : ( line | BLOCK_COMMENT )*\\n'\n", " 'line : [ statement ] [ LINE_COMMENT | DOC_COMMENT ] EOL\\n'\n", " ' | ( LINE_COMMENT | DOC_COMMENT ) EOL\\n'\n", " 'EOL\\n'\n", " '\\n'\n", " 'DOC_COMMENT : \"///\" ANY_TEXT\\n'\n", " 'LINE_COMMENT : \"//\" ANY_TEXT\\n'\n", " 'BLOCK_COMMENT : \"/*\" ANY_CONTENT \"*/\"\\n'\n", " 'ANY_TEXT\\n'\n", " 'ANY_CONTENT\\n'\n", " '\\n'\n", " 'statement : assignment\\n'\n", " ' | method_call_stmt\\n'\n", " ' | function_call_stmt\\n'\n", " ' | function_decl\\n'\n", " ' | return_stmt\\n'\n", " ' | system_command\\n'\n", " ' | io_command\\n'\n", " ' | control_flow\\n'\n", " ' | async_command\\n'\n", " ' | connector_cmd\\n'\n", " ' | 
db_command\\n'\n", " ' | http_command\\n'\n", " ' | util_command\\n'\n", " ' | modularity_cmd\\n'\n", " '\\n'\n", " 'assignment : IDENTIFIER \"=\" expression\\n'\n", " '\\n'\n", " 'function_call_stmt : IDENTIFIER \"(\" [argument_list] \")\"\\n'\n", " '\\n'\n", " 'method_call_stmt : IDENTIFIER \"=\" IDENTIFIER \".\" IDENTIFIER \"(\" '\n", " '[argument_list] \")\"\\n'\n", " '\\n'\n", " 'system_command : register_cmd | addvar_cmd\\n'\n", " 'register_cmd : \"registerEndpoint(\" STRINGLITERAL \",\" STRINGLITERAL \",\" '\n", " 'list_display \",\" STRINGLITERAL \",\" IDENTIFIER \",\" IDENTIFIER \")\"\\n'\n", " 'addvar_cmd : \"addVar(\" addvar_arg \",\" addvar_arg \")\"\\n'\n", " 'addvar_arg : IDENTIFIER | literal | \"$\" IDENTIFIER\\n'\n", " '\\n'\n", " 'IDENTIFIER\\n'\n", " '\\n'\n", " 'system_variable : \"_status\"\\n'\n", " '// --- TERMINALES (LEXER) ---\\n'\n", " 'IDENTIFIER: /[a-zA-Z_][a-zA-Z0-9_]*/\\n'\n", " 'STRINGLITERAL: /\"[^\"\\\\r\\\\n]*\"/\\n'\n", " 'NUMBER: /\\\\d+(\\\\.\\\\d+)?/\\n'\n", " 'EOL: /\\\\r?\\\\n/\\n'\n", " 'DOC_COMMENT.2: \"///\" ANY_TEXT\\n'\n", " 'LINE_COMMENT: \"//\" ANY_TEXT\\n'\n", " 'BLOCK_COMMENT: \"/*\" ANY_CONTENT \"*/\"\\n'\n", " 'ANY_TEXT: /[^\\\\r\\\\n]*/\\n'\n", " 'ANY_CONTENT: /(.|\\\\n)*?/(?=\\\\*/|$)\\n'\n", " '\\n'\n", " '// --- REGLAS DE SOPORTE (STUBS) ---\\n'\n", " '// Estas reglas deben existir para que el parser no dé error de \"Undefined '\n", " 'Rule\"\\n'\n", " 'expression: IDENTIFIER | STRINGLITERAL | NUMBER | list_display\\n'\n", " 'literal: STRINGLITERAL | NUMBER\\n'\n", " 'argument_list: expression ( \",\" expression )*\\n'\n", " 'list_display: \"[\" [argument_list] \"]\"\\n'\n", " '\\n'\n", " '// Stubs para los comandos que mencionas pero no defines en el BNF\\n'\n", " 'function_decl: \"def\" IDENTIFIER \"(\" [argument_list] \")\" \":\" EOL\\n'\n", " 'return_stmt: \"return\" [expression]\\n'\n", " 'io_command: \"print\" \"(\" expression \")\"\\n'\n", " 'control_flow: \"if\" expression \":\" EOL\\n'\n", " 
# Pretty-print the generated Lark grammar for inspection.
pprint.PrettyPrinter().pprint(lark_bnf)

# Build the parser from the converted grammar.
# BUGFIX: the generated grammar's entry rule is `program` (see the printed
# grammar above), not `start`; `start="start"` could never succeed.
try:
    parser = Lark(lark_bnf, parser="lalr", start="program")
except Exception as e:
    # Surface which symbol is missing and where, without killing the run.
    # NOTE(review): consider re-raising once the grammar conversion is stable,
    # since later cells depend on `parser` existing.
    print(e)

# --- `bnf` package sanity check ---
ebnf_text = r"""
assign ::= name '=' num ;
name ::= 'a' | 'b' | 'c' ;
num ::= [0-9] ;
"""

# BUGFIX: `ebnf_grammar` / `ebnf_parse` were never defined anywhere in the
# notebook — the imports cell binds `bnf.grammar` to `bnf_grammar` (later
# shadowed by the grammar text string) and `bnf.parse` to `bnf_parse`.
# Import them here under the names this cell actually uses.
from bnf import grammar as ebnf_grammar, parse as ebnf_parse

ebnf_grammar(ebnf_text)
print("BNF:", ebnf_parse("a=7"))
"assign: name \"=\" num\n", "name: \"a\" | \"b\" | \"c\"\n", "num: DIGIT\n", "\n", "DIGIT: /[0-9]/\n", "\n", "%ignore \" \"\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 55, "id": "52935608", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tree(Token('RULE', 'start'), [Tree(Token('RULE', 'assign'), [Tree(Token('RULE', 'name'), []), Tree(Token('RULE', 'num'), [Token('DIGIT', '7')])])])\n" ] } ], "source": [ "parser = Lark(ebnf_text)\n", "\n", "print(parser.parse(\"a=7\"))" ] } ], "metadata": { "kernelspec": { "display_name": "assistance-engine", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }