diff --git a/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb b/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb new file mode 100644 index 0000000..3bc266a --- /dev/null +++ b/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5b646fb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "! uv pip install bnf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "274d6d68", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "! 
uv pip install ebnf" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "0a8abbfa", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from dataclasses import dataclass\n", + "import pprint\n", + "from pathlib import Path\n", + "from typing import Any, Dict, List, Optional, Tuple\n", + "from lark import Tree, Lark\n", + "from bnf import grammar as bnf_grammar, parse as bnf_parse\n", + "from src.config import settings\n", + "from lark import Lark" + ] + }, + { + "cell_type": "markdown", + "id": "baa779f3", + "metadata": {}, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "markdown", + "id": "23a92e13", + "metadata": {}, + "source": [ + "# BNF to Lark" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "93d2db25", + "metadata": {}, + "outputs": [], + "source": [ + "grammar = r\"\"\"\n", + "start: program\n", + " \n", + "program: (line | BLOCK_COMMENT)*\n", + " \n", + "line: statement comment? EOL\n", + "\n", + " | comment EOL\n", + "\n", + " | EOL\n", + " \n", + "comment: DOC_COMMENT | LINE_COMMENT\n", + " \n", + "EOL: /\\r?\\n/\n", + " \n", + "DOC_COMMENT.2: /\\/\\/\\/[^\\r\\n]*/\n", + "\n", + "LINE_COMMENT.1: /\\/\\/[^\\r\\n]*/\n", + "\n", + "BLOCK_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//\n", + " \n", + "statement: assignment\n", + "\n", + " | function_decl\n", + "\n", + " | return_stmt\n", + "\n", + " | system_command\n", + "\n", + " | io_command\n", + "\n", + " | control_flow\n", + "\n", + " | async_command\n", + "\n", + " | connector_cmd\n", + "\n", + " | db_command\n", + "\n", + " | http_command\n", + "\n", + " | util_command\n", + "\n", + " | modularity_cmd\n", + "\n", + " | call_stmt\n", + " \n", + "assignment: identifier \"=\" expression\n", + " \n", + "call_stmt: identifier \"(\" argument_list? \")\"\n", + "\n", + " | identifier \"=\" identifier \".\" identifier \"(\" argument_list? \")\"\n", + "\n", + " | identifier \".\" identifier \"(\" argument_list? 
\")\"\n", + " \n", + "system_command: register_cmd\n", + "\n", + " | addvar_cmd\n", + " \n", + "register_cmd: \"registerEndpoint\" \"(\" stringliteral \",\" stringliteral \",\" list_display \",\" stringliteral \",\" identifier \",\" identifier \")\"\n", + " \n", + "addvar_cmd: \"addVar\" \"(\" addvar_arg \",\" addvar_arg \")\"\n", + "\n", + "addvar_arg: identifier\n", + "\n", + " | literal\n", + "\n", + " | \"$\" identifier\n", + " \n", + "identifier: IDENTIFIER\n", + "\n", + "system_variable: \"_status\"\n", + " \n", + "io_command: addparam_cmd\n", + "\n", + " | getlistlen_cmd\n", + "\n", + " | addresult_cmd\n", + "\n", + " | getparamlist_cmd\n", + " \n", + "addparam_cmd: \"addParam\" \"(\" stringliteral \",\" identifier \")\"\n", + "\n", + "getlistlen_cmd: \"getListLen\" \"(\" identifier \",\" identifier \")\"\n", + "\n", + "getparamlist_cmd: \"getQueryParamList\" \"(\" stringliteral \",\" identifier \")\"\n", + "\n", + "addresult_cmd: \"addResult\" \"(\" identifier \")\"\n", + " \n", + "control_flow: if_stmt\n", + "\n", + " | loop_stmt\n", + "\n", + " | try_stmt\n", + " \n", + "if_stmt: \"if\" \"(\" if_condition \")\" EOL block (\"else\" \"(\" \")\" EOL block)? \"end\" \"(\" \")\" EOL\n", + " \n", + "if_condition: if_atom \",\" if_atom \",\" stringliteral\n", + "\n", + " | \"None\" \",\" \"None\" \",\" stringliteral\n", + " \n", + "if_atom: identifier\n", + "\n", + " | literal\n", + " \n", + "loop_stmt: \"startLoop\" \"(\" identifier \",\" expression \",\" expression \")\" EOL block \"endLoop\" \"(\" \")\" EOL\n", + " \n", + "try_stmt: \"try\" \"(\" \")\" EOL block \"exception\" \"(\" identifier \")\" EOL block \"end\" \"(\" \")\" EOL\n", + " \n", + "block: line*\n", + " \n", + "async_command: go_stmt\n", + "\n", + " | gather_stmt\n", + " \n", + "go_stmt: identifier \"=\" \"go\" identifier \"(\" argument_list? \")\"\n", + "\n", + "gather_stmt: identifier \"=\" \"gather\" \"(\" identifier (\",\" expression)? 
\")\"\n", + " \n", + "connector_cmd: connector_instantiation\n", + " \n", + "connector_instantiation: identifier \"=\" \"avapConnector\" \"(\" stringliteral \")\"\n", + " \n", + "http_command: req_post_cmd\n", + "\n", + " | req_get_cmd\n", + " \n", + "req_post_cmd: \"RequestPost\" \"(\" expression \",\" expression \",\" expression \",\" expression \",\" identifier \",\" expression \")\"\n", + "\n", + "req_get_cmd: \"RequestGet\" \"(\" expression \",\" expression \",\" expression \",\" identifier \",\" expression \")\"\n", + " \n", + "db_command: orm_direct\n", + "\n", + " | orm_check\n", + "\n", + " | orm_create\n", + "\n", + " | orm_select\n", + "\n", + " | orm_insert\n", + "\n", + " | orm_update\n", + " \n", + "orm_direct: \"ormDirect\" \"(\" expression \",\" identifier \")\"\n", + "\n", + "orm_check: \"ormCheckTable\" \"(\" expression \",\" identifier \")\"\n", + "\n", + "orm_create: \"ormCreateTable\" \"(\" expression \",\" expression \",\" expression \",\" identifier \")\"\n", + " \n", + "orm_select: \"ormAccessSelect\" \"(\" orm_fields \",\" expression (\",\" expression)? 
\",\" identifier \")\"\n", + "\n", + "orm_fields: \"*\"\n", + "\n", + " | expression\n", + " \n", + "orm_insert: \"ormAccessInsert\" \"(\" expression \",\" expression \",\" identifier \")\"\n", + " \n", + "orm_update: \"ormAccessUpdate\" \"(\" expression \",\" expression \",\" expression \",\" expression \",\" identifier \")\"\n", + " \n", + "util_command: json_list_cmd\n", + "\n", + " | crypto_cmd\n", + "\n", + " | regex_cmd\n", + "\n", + " | datetime_cmd\n", + "\n", + " | stamp_cmd\n", + "\n", + " | string_cmd\n", + "\n", + " | replace_cmd\n", + " \n", + "json_list_cmd: \"variableToList\" \"(\" expression \",\" identifier \")\"\n", + "\n", + " | \"itemFromList\" \"(\" identifier \",\" expression \",\" identifier \")\"\n", + "\n", + " | \"variableFromJSON\" \"(\" identifier \",\" expression \",\" identifier \")\"\n", + "\n", + " | \"AddVariableToJSON\" \"(\" expression \",\" expression \",\" identifier \")\"\n", + " \n", + "crypto_cmd: \"encodeSHA256\" \"(\" identifier_or_string \",\" identifier \")\"\n", + "\n", + " | \"encodeMD5\" \"(\" identifier_or_string \",\" identifier \")\"\n", + " \n", + "regex_cmd: \"getRegex\" \"(\" identifier \",\" stringliteral \",\" identifier \")\"\n", + " \n", + "datetime_cmd: \"getDateTime\" \"(\" stringliteral \",\" expression \",\" stringliteral \",\" identifier \")\"\n", + " \n", + "stamp_cmd: \"stampToDatetime\" \"(\" expression \",\" stringliteral \",\" expression \",\" identifier \")\"\n", + "\n", + " | \"getTimeStamp\" \"(\" stringliteral \",\" stringliteral \",\" expression \",\" identifier \")\"\n", + " \n", + "string_cmd: \"randomString\" \"(\" expression \",\" identifier \")\"\n", + " \n", + "replace_cmd: \"replace\" \"(\" identifier_or_string \",\" stringliteral \",\" stringliteral \",\" identifier \")\"\n", + " \n", + "function_decl: \"function\" identifier \"(\" param_list? 
\")\" \"{\" EOL block \"}\" EOL\n", + " \n", + "param_list: identifier (\",\" identifier)*\n", + " \n", + "return_stmt: \"return\" \"(\" expression? \")\"\n", + " \n", + "modularity_cmd: include_stmt\n", + "\n", + " | import_stmt\n", + " \n", + "include_stmt: \"include\" stringliteral\n", + "\n", + "import_stmt: \"import\" (\"<\" identifier \">\" | stringliteral)\n", + " \n", + "?expression: logical_or\n", + " \n", + "?logical_or: logical_and (\"or\" logical_and)*\n", + "\n", + "?logical_and: logical_not (\"and\" logical_not)*\n", + "\n", + "?logical_not: \"not\" logical_not\n", + "\n", + " | comparison\n", + " \n", + "?comparison: arithmetic (comp_op arithmetic)*\n", + "\n", + "comp_op: \"==\" | \"!=\" | \"<\" | \">\" | \"<=\" | \">=\" | \"in\" | \"is\"\n", + " \n", + "?arithmetic: term ((\"+\" | \"-\") term)*\n", + "\n", + "?term: factor ((\"*\" | \"/\" | \"%\") factor)*\n", + "\n", + "?factor: (\"+\" | \"-\") factor\n", + "\n", + " | power\n", + " \n", + "?power: primary (\"**\" factor)?\n", + " \n", + "?primary: atom postfix*\n", + " \n", + "postfix: \".\" identifier\n", + "\n", + " | \"[\" expression \"]\"\n", + "\n", + " | \"[\" expression? \":\" expression? (\":\" expression?)? \"]\"\n", + "\n", + " | \"(\" argument_list? \")\"\n", + " \n", + "?atom: identifier\n", + "\n", + " | \"$\" identifier\n", + "\n", + " | literal\n", + "\n", + " | \"(\" expression \")\"\n", + "\n", + " | list_display\n", + "\n", + " | dict_display\n", + " \n", + "list_display: \"[\" argument_list? \"]\"\n", + "\n", + " | \"[\" expression \"for\" identifier \"in\" expression if_clause? \"]\"\n", + " \n", + "if_clause: \"if\" expression\n", + " \n", + "dict_display: \"{\" key_datum_list? 
\"}\"\n", + "\n", + "key_datum_list: key_datum (\",\" key_datum)*\n", + "\n", + "key_datum: expression \":\" expression\n", + " \n", + "argument_list: expression (\",\" expression)*\n", + " \n", + "number: FLOATNUMBER\n", + "\n", + " | INTEGER\n", + " \n", + "literal: stringliteral\n", + "\n", + " | number\n", + "\n", + " | boolean\n", + "\n", + " | \"None\"\n", + " \n", + "boolean: \"True\" | \"False\"\n", + " \n", + "INTEGER: /[0-9]+/\n", + "\n", + "FLOATNUMBER: /(?:[0-9]+\\.[0-9]*|\\.[0-9]+)/\n", + " \n", + "stringliteral: STRING_DOUBLE\n", + "\n", + " | STRING_SINGLE\n", + " \n", + "STRING_DOUBLE: /\"([^\"\\\\]|\\\\[\"'\\\\ntr0])*\"/\n", + "\n", + "STRING_SINGLE: /'([^'\\\\]|\\\\[\"'\\\\ntr0])*'/\n", + " \n", + "identifier_or_string: identifier\n", + "\n", + " | stringliteral\n", + " \n", + "IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/\n", + " \n", + "%ignore /[ \\t]+/\n", + " \n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cab2125", + "metadata": {}, + "outputs": [], + "source": [ + "code = \"\"\"\n", + "addVar(mensaje, \"Hola mundo desde AVAP\")\n", + "addResult(mensaje)\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6a266b2a", + "metadata": {}, + "outputs": [], + "source": [ + "folder = \"/home/pseco/VsCodeProjects/assistance-engine/docs/samples/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "522bdb3b", + "metadata": {}, + "outputs": [], + "source": [ + "parser = Lark(grammar, parser=\"lalr\", propagate_positions=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d3aa8026", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsed 0 files successfully\n", + "Failed to parse 33 files\n", + "\n", + "Files processed:\n", + " - calculo_de_expiracion.avap: error\n", + " - fecha_para_base_de_datos.avap: error\n", + " - comparacion_simple.avap: error\n", + " - ormAccessCreate.avap: error\n", + " - 
if_desigualdad.avap: error\n", + " - conversion_timestamp_legible.avap: error\n", + " - validacion_de_nulo.avap: error\n", + " - concatenacion_dinamica.avap: error\n", + " - obtencion_timestamp.avap: error\n", + " - manejo_error_sql_critico.avap: error\n", + " - construccion_dinamica_de_objeto.avap: error\n", + " - respuesta_multiple.avap: error\n", + " - limpieza_de_strings.avap: error\n", + " - captura_de_listas_multiples.avap: error\n", + " - contador_de_parametros.avap: error\n", + " - captura_de_id.avap: error\n", + " - hello_world.avap: error\n", + " - bucle_1_10.avap: error\n", + " - hash_SHA256_para_integridad.avap: error\n", + " - funcion_de_suma.avap: error\n", + " - expresion_compleja.avap: error\n", + " - hola_mundo.avap: error\n", + " - generador_de_tokens_aleatorios.avap: error\n", + " - asignacion_matematica.avap: error\n", + " - salida_bucle_correcta.avap: error\n", + " - bucle_longitud_de_datos.avap: error\n", + " - asignacion_booleana.avap: error\n", + " - validacion_in_pertenece_a_lista.avap: error\n", + " - try_catch_request.avap: error\n", + " - funcion_validacion_acceso.avap: error\n", + " - referencia_por_valor.avap: error\n", + " - paginacion_dinamica_recursos.avap: error\n", + " - else_estandar.avap: error\n" + ] + } + ], + "source": [ + 
"# Parse every .avap sample in `folder` and record the outcome per file.\n", + "# parsed_files maps file name -> {'ast', 'tree', 'status'} on success,\n", + "# or {'error', 'status'} on failure.\n", + "parsed_files = {}\n", + "folder_path = Path(folder)\n", + "\n", + "\n", + "def _read_avap_source(path):\n", + "    \"\"\"Return the text of path with LF-only line endings and a final newline.\"\"\"\n", + "    try:\n", + "        text = path.read_text(encoding='utf-8')\n", + "    except UnicodeDecodeError:\n", + "        # latin-1 maps every byte to a character, so this fallback always decodes\n", + "        text = path.read_text(encoding='latin-1')\n", + "    # Normalize all line endings to \\n\n", + "    text = text.replace('\\r\\n', '\\n').replace('\\r', '\\n')\n", + "    # The grammar ends every `line` with EOL, so an unterminated last line\n", + "    # would make an otherwise valid file fail to parse\n", + "    if not text.endswith('\\n'):\n", + "        text += '\\n'\n", + "    return text\n", + "\n", + "\n", + "# Parse each .avap file\n", + "for file_path in folder_path.glob(\"*.avap\"):\n", + "    try:\n", + "        ast = parser.parse(_read_avap_source(file_path))\n", + "    except Exception as e:\n", + "        # Exploratory batch run: record the failure and continue with the next file\n", + "        parsed_files[file_path.name] = {\n", + "            \"error\": str(e),\n", + "            \"status\": \"error\"\n", + "        }\n", + "    else:\n", + "        # Store AST with metadata\n", + "        parsed_files[file_path.name] = {\n", + "            \"ast\": ast,\n", + "            \"tree\": ast.pretty(),  # Pretty-printed Lark parse tree\n", + "            \"status\": \"success\"\n", + "        }\n", + "\n", + "# Display results summary\n", + "n_ok = sum(1 for r in parsed_files.values() if r['status'] == 'success')\n", + "n_failed = sum(1 for r in parsed_files.values() if r['status'] == 'error')\n", + "print(f\"Parsed {n_ok} files successfully\")\n", + "print(f\"Failed to parse {n_failed} files\")\n", + "print(\"\\nFiles processed:\")\n", + "for filename, result in parsed_files.items():\n", + "    print(f\" - {filename}: {result['status']}\")\n", + "    if result['status'] == 'error':\n", + "        # The first line of the message is enough to tell failures apart\n", + "        reason = result['error'].splitlines()[0] if result['error'] else '(no message)'\n", + "        print(f\"     {reason}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "35b34377", + "metadata": {}, + "outputs": [], + "source": [ + "tree = parser.parse(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "025b46a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "start\n", + " program\n", + " line\t\n", + "\n", + " line\n", + " statement\n", + " system_command\n", + " addvar_cmd\n", + " addvar_arg\n", + " identifier\tmensaje\n", + " addvar_arg\n", + " literal\n", + " stringliteral\t\"Hola mundo desde AVAP\"\n", + " \n", + "\n", + " line\n", + " statement\n", + " io_command\n", + " addresult_cmd\n", + " identifier\tmensaje\n", + " \n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(tree.pretty())" + ] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "assistance-engine", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}