{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from dataclasses import dataclass\n",
    "from typing import Any, Dict, List, Optional, Tuple\n",
    "\n",
    "from lark import Lark, Tree\n",
    "\n",
    "from bnf import grammar as bnf_grammar, parse as bnf_parse\n",
    "from ebnf import grammar as ebnf_grammar, parse as ebnf_parse\n",
    "from src.config import DATA_DIR"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "baa779f3",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "26927d0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def bnf_to_lark(bnf_text: str) -> str:\n",
    "    \"\"\"Rewrite BNF rule syntax into Lark rule syntax.\n",
    "\n",
    "    ``<rule-name>`` references lose their angle brackets and are normalised to\n",
    "    Lark rule names (lower case, any run of other characters becomes ``_``);\n",
    "    every ``::=`` becomes ``:``. Everything else is left untouched.\n",
    "    \"\"\"\n",
    "\n",
    "    def _rule_name(match: re.Match) -> str:\n",
    "        # Lark only accepts lower-case [a-z0-9_] rule names, so names such as\n",
    "        # <digit-sequence> or <Name> must be normalised, not just unwrapped.\n",
    "        return re.sub(r\"[^a-z0-9_]+\", \"_\", match.group(1).strip().lower())\n",
    "\n",
    "    text = re.sub(r\"<([^>]+)>\", _rule_name, bnf_text)\n",
    "    return text.replace(\"::=\", \":\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "89be8bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class Chunk:\n",
    "    \"\"\"A piece of source text cut out of a parse tree.\n",
    "\n",
    "    Attributes:\n",
    "        text: Stripped source text covered by the tree node.\n",
    "        kind: ``\"line\"`` for statement chunks, otherwise the node's rule name.\n",
    "        metadata: Extra labels; always carries a ``\"granularity\"`` key.\n",
    "    \"\"\"\n",
    "\n",
    "    text: str\n",
    "    kind: str\n",
    "    metadata: Dict[str, Any]\n",
    "\n",
    "\n",
    "# Rule names that `chunk_blocks` reports as block-level chunks.\n",
    "_BLOCK_RULES = (\n",
    "    \"if_block\",\n",
    "    \"loop_block\",\n",
    "    \"try_block\",\n",
    "    \"go_async_block\",\n",
    "    \"function_block\",\n",
    ")\n",
    "\n",
    "\n",
    "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
    "    \"\"\"Return ``(start_pos, end_pos)`` for ``node``, or ``None`` if either is missing.\n",
    "\n",
    "    Lark only fills these fields when the parser was created with\n",
    "    ``propagate_positions=True``.\n",
    "    \"\"\"\n",
    "    meta = node.meta\n",
    "    start = getattr(meta, \"start_pos\", None)\n",
    "    end = getattr(meta, \"end_pos\", None)\n",
    "    if start is None or end is None:\n",
    "        return None\n",
    "    return start, end\n",
    "\n",
    "\n",
    "def _iter_trees(t: Tree):\n",
    "    \"\"\"Yield ``t`` and every nested ``Tree`` below it, parents before children.\"\"\"\n",
    "    yield t\n",
    "    for child in t.children:\n",
    "        if isinstance(child, Tree):\n",
    "            yield from _iter_trees(child)\n",
    "\n",
    "\n",
    "def _cmd_name(line: str) -> Optional[str]:\n",
    "    \"\"\"Return the identifier that precedes ``(`` at the start of ``line``.\n",
    "\n",
    "    Returns ``None`` when the line does not begin with ``name(``.\n",
    "    \"\"\"\n",
    "    match = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
    "    return match.group(1) if match else None\n",
    "\n",
    "\n",
    "def _node_text(node: Tree, code: str) -> Optional[str]:\n",
    "    \"\"\"Return the stripped slice of ``code`` covered by ``node``.\n",
    "\n",
    "    ``None`` means the node has no position info or covers only whitespace.\n",
    "    \"\"\"\n",
    "    span = _span(node)\n",
    "    if span is None:\n",
    "        return None\n",
    "    start, end = span\n",
    "    return code[start:end].strip() or None\n",
    "\n",
    "\n",
    "def _resolve_parser(lark_parser: Optional[Lark]) -> Lark:\n",
    "    \"\"\"Return ``lark_parser`` if given, otherwise the notebook-level ``parser``.\n",
    "\n",
    "    Raises:\n",
    "        NameError: if no parser was passed and no global ``parser`` exists yet.\n",
    "    \"\"\"\n",
    "    if lark_parser is not None:\n",
    "        return lark_parser\n",
    "    try:\n",
    "        # Backward compatible fallback: earlier versions always read this global.\n",
    "        return parser\n",
    "    except NameError:\n",
    "        raise NameError(\n",
    "            \"no Lark parser available: pass `lark_parser=` or define a global \"\n",
    "            \"`parser` (built with propagate_positions=True) before calling\"\n",
    "        ) from None\n",
    "\n",
    "\n",
    "def _line_chunks(tree: Tree, code: str) -> List[Chunk]:\n",
    "    \"\"\"Build one ``\"line\"`` chunk per ``stmt_line`` node that has source text.\"\"\"\n",
    "    chunks: List[Chunk] = []\n",
    "    for node in _iter_trees(tree):\n",
    "        if node.data != \"stmt_line\":\n",
    "            continue\n",
    "        text = _node_text(node, code)\n",
    "        if text is None:\n",
    "            continue\n",
    "        chunks.append(\n",
    "            Chunk(\n",
    "                text=text,\n",
    "                kind=\"line\",\n",
    "                metadata={\n",
    "                    \"granularity\": \"atomic\",\n",
    "                    \"command\": _cmd_name(text),\n",
    "                },\n",
    "            )\n",
    "        )\n",
    "    return chunks\n",
    "\n",
    "\n",
    "def _block_chunks(tree: Tree, code: str) -> List[Chunk]:\n",
    "    \"\"\"Build one chunk per node whose rule name is listed in ``_BLOCK_RULES``.\"\"\"\n",
    "    chunks: List[Chunk] = []\n",
    "    for node in _iter_trees(tree):\n",
    "        if node.data not in _BLOCK_RULES:\n",
    "            continue\n",
    "        text = _node_text(node, code)\n",
    "        if text is None:\n",
    "            continue\n",
    "        chunks.append(\n",
    "            Chunk(text=text, kind=node.data, metadata={\"granularity\": \"block\"})\n",
    "        )\n",
    "    return chunks\n",
    "\n",
    "\n",
    "def chunk_atomic_lines(code: str, lark_parser: Optional[Lark] = None) -> List[Chunk]:\n",
    "    \"\"\"Parse ``code`` and return one chunk per ``stmt_line`` node.\n",
    "\n",
    "    Args:\n",
    "        code: Source text to parse.\n",
    "        lark_parser: Parser to use; defaults to the notebook-level ``parser``.\n",
    "\n",
    "    Returns:\n",
    "        Chunks of kind ``\"line\"`` in tree traversal order. Nodes without\n",
    "        position info or without text are skipped.\n",
    "    \"\"\"\n",
    "    tree = _resolve_parser(lark_parser).parse(code)\n",
    "    return _line_chunks(tree, code)\n",
    "\n",
    "\n",
    "def chunk_blocks(code: str, lark_parser: Optional[Lark] = None) -> List[Chunk]:\n",
    "    \"\"\"Parse ``code`` and return one chunk per block-level node.\n",
    "\n",
    "    Args:\n",
    "        code: Source text to parse.\n",
    "        lark_parser: Parser to use; defaults to the notebook-level ``parser``.\n",
    "\n",
    "    Returns:\n",
    "        Chunks whose ``kind`` is the rule name of the block, in tree traversal\n",
    "        order. Nodes without position info or without text are skipped.\n",
    "    \"\"\"\n",
    "    tree = _resolve_parser(lark_parser).parse(code)\n",
    "    return _block_chunks(tree, code)\n",
    "\n",
    "\n",
    "def chunk_avap_code(code: str, lark_parser: Optional[Lark] = None) -> List[Chunk]:\n",
    "    \"\"\"Return the block chunks of ``code`` followed by its line chunks.\n",
    "\n",
    "    Args:\n",
    "        code: Source text to parse.\n",
    "        lark_parser: Parser to use; defaults to the notebook-level ``parser``.\n",
    "    \"\"\"\n",
    "    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
    "    # Parse once and reuse the tree for both granularities.\n",
    "    tree = _resolve_parser(lark_parser).parse(code)\n",
    "    return _block_chunks(tree, code) + _line_chunks(tree, code)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23a92e13",
   "metadata": {},
   "source": [
    "# BNF check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a10a1017",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the rule names left of each ::= look missing from this text, and\n",
    "# the recorded output of the next cell reports a syntax error -- confirm the\n",
    "# intended grammar before relying on this check.\n",
    "bnf_text = r\"\"\"\n",
    " ::= \n",
    " ::= a | b | c\n",
    " ::= [0-9]\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "4790023e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "line 1 : syntax error at or before EOL = \n",
      "\n",
      "line 2 : illegal character '='\n",
      "BNF: False\n"
     ]
    }
   ],
   "source": [
    "bnf_grammar(bnf_text)\n",
    "print(\"BNF:\", bnf_parse(\"a=7\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49953efd",
   "metadata": {},
   "source": [
    "# BNF conversion to EBNF"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32dbc2c5",
   "metadata": {},
   "source": [
    "# EBNF check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "37968906",
   "metadata": {},
   "outputs": [],
   "source": [
    "ebnf_text = r\"\"\"\n",
    "assign ::= name '=' num ;\n",
    "name ::= 'a' | 'b' | 'c' ;\n",
    "num ::= [0-9] ;\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "b234f2c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EBNF: True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating LALR tables\n"
     ]
    }
   ],
   "source": [
    "ebnf_grammar(ebnf_text)\n",
    "print(\"EBNF:\", ebnf_parse(\"a=7\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66fb8fee",
   "metadata": {},
   "source": [
    "# Lark check (EBNF-style grammar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "08e53ccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "lark_grammar_text = r\"\"\"\n",
    "start: assign\n",
    "\n",
    "assign: name \"=\" num\n",
    "name: \"a\" | \"b\" | \"c\"\n",
    "num: DIGIT\n",
    "\n",
    "DIGIT: /[0-9]/\n",
    "\n",
    "%ignore \" \"\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "52935608",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tree(Token('RULE', 'start'), [Tree(Token('RULE', 'assign'), [Tree(Token('RULE', 'name'), []), Tree(Token('RULE', 'num'), [Token('DIGIT', '7')])])])\n"
     ]
    }
   ],
   "source": [
    "# propagate_positions=True makes Lark record start_pos/end_pos on tree.meta,\n",
    "# which `_span` (and therefore the chunk_* helpers) depend on.\n",
    "parser = Lark(lark_grammar_text, propagate_positions=True)\n",
    "\n",
    "print(parser.parse(\"a=7\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}