diff --git a/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb b/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb index 3bc266a..2f3c07d 100644 --- a/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb +++ b/scratches/pseco/ingestion/Code Ingestion/n02 BNF Lark example.ipynb @@ -3,44 +3,6 @@ { "cell_type": "code", "execution_count": 1, - "id": "5b646fb1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", - "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" - ] - } - ], - "source": [ - "! uv pip install bnf" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "274d6d68", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2mUsing Python 3.12.11 environment at: /home/pseco/VsCodeProjects/assistance-engine/.venv\u001b[0m\n", - "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 2ms\u001b[0m\u001b[0m\n" - ] - } - ], - "source": [ - "! uv pip install ebnf" - ] - }, - { - "cell_type": "code", - "execution_count": 49, "id": "0a8abbfa", "metadata": {}, "outputs": [], @@ -51,7 +13,6 @@ "from pathlib import Path\n", "from typing import Any, Dict, List, Optional, Tuple\n", "from lark import Tree, Lark\n", - "from bnf import grammar as bnf_grammar, parse as bnf_parse\n", "from src.config import settings\n", "from lark import Lark" ] @@ -74,322 +35,255 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 29, "id": "93d2db25", "metadata": {}, "outputs": [], "source": [ "grammar = r\"\"\"\n", "start: program\n", - " \n", - "program: (line | BLOCK_COMMENT)*\n", - " \n", - "line: statement comment? EOL\n", "\n", - " | comment EOL\n", + "program: separator* line_or_comment (separator+ line_or_comment)* separator*\n", + "\n", + "?line_or_comment: simple_stmt comment?\n", + " | compound_stmt\n", + " | comment\n", + " | BLOCK_COMMENT\n", + "\n", + "?separator: EOL+\n", "\n", - " | EOL\n", - " \n", "comment: DOC_COMMENT | LINE_COMMENT\n", - " \n", + "\n", "EOL: /\\r?\\n/\n", - " \n", + "\n", "DOC_COMMENT.2: /\\/\\/\\/[^\\r\\n]*/\n", - "\n", "LINE_COMMENT.1: /\\/\\/[^\\r\\n]*/\n", - "\n", "BLOCK_COMMENT: /\\/\\*[\\s\\S]*?\\*\\//\n", - " \n", - "statement: assignment\n", "\n", - " | function_decl\n", + "?simple_stmt: assignment\n", + " | return_stmt\n", + " | system_command\n", + " | io_command\n", + " | async_command\n", + " | connector_cmd\n", + " | db_command\n", + " | http_command\n", + " | util_command\n", + " | modularity_cmd\n", + " | call_stmt\n", "\n", - " | return_stmt\n", + "?compound_stmt: function_decl\n", + " | if_stmt\n", + " | loop_stmt\n", + " | try_stmt\n", "\n", - " | system_command\n", - "\n", - " | io_command\n", - "\n", - " | control_flow\n", - "\n", - " | async_command\n", - "\n", - " | connector_cmd\n", - "\n", - " | db_command\n", - "\n", - " | http_command\n", - "\n", - " | util_command\n", - "\n", - " | modularity_cmd\n", - "\n", - " | call_stmt\n", - " \n", "assignment: identifier \"=\" expression\n", - " \n", + "\n", "call_stmt: identifier \"(\" argument_list? \")\"\n", - "\n", " | identifier \"=\" identifier \".\" identifier \"(\" argument_list? \")\"\n", - "\n", " | identifier \".\" identifier \"(\" argument_list? \")\"\n", - " \n", - "system_command: register_cmd\n", "\n", + "system_command: register_cmd\n", " | addvar_cmd\n", - " \n", + "\n", "register_cmd: \"registerEndpoint\" \"(\" stringliteral \",\" stringliteral \",\" list_display \",\" stringliteral \",\" identifier \",\" identifier \")\"\n", - " \n", + "\n", "addvar_cmd: \"addVar\" \"(\" addvar_arg \",\" addvar_arg \")\"\n", "\n", "addvar_arg: identifier\n", - "\n", " | literal\n", - "\n", " | \"$\" identifier\n", - " \n", + "\n", "identifier: IDENTIFIER\n", "\n", "system_variable: \"_status\"\n", - " \n", + "\n", "io_command: addparam_cmd\n", - "\n", " | getlistlen_cmd\n", - "\n", - " | addresult_cmd\n", - "\n", + " | addresult\n", " | getparamlist_cmd\n", - " \n", + "\n", "addparam_cmd: \"addParam\" \"(\" stringliteral \",\" identifier \")\"\n", - "\n", "getlistlen_cmd: \"getListLen\" \"(\" identifier \",\" identifier \")\"\n", - "\n", "getparamlist_cmd: \"getQueryParamList\" \"(\" stringliteral \",\" identifier \")\"\n", + "addresult: \"addResult\" \"(\" identifier \")\"\n", "\n", - "addresult_cmd: \"addResult\" \"(\" identifier \")\"\n", - " \n", - "control_flow: if_stmt\n", + "if_stmt: \"if\" \"(\" if_condition \")\" separator block (\"else\" \"(\" \")\" separator block)? \"end\" \"(\" \")\"\n", "\n", - " | loop_stmt\n", - "\n", - " | try_stmt\n", - " \n", - "if_stmt: \"if\" \"(\" if_condition \")\" EOL block (\"else\" \"(\" \")\" EOL block)? \"end\" \"(\" \")\" EOL\n", - " \n", "if_condition: if_atom \",\" if_atom \",\" stringliteral\n", - "\n", " | \"None\" \",\" \"None\" \",\" stringliteral\n", - " \n", + "\n", "if_atom: identifier\n", - "\n", " | literal\n", - " \n", - "loop_stmt: \"startLoop\" \"(\" identifier \",\" expression \",\" expression \")\" EOL block \"endLoop\" \"(\" \")\" EOL\n", - " \n", - "try_stmt: \"try\" \"(\" \")\" EOL block \"exception\" \"(\" identifier \")\" EOL block \"end\" \"(\" \")\" EOL\n", - " \n", - "block: line*\n", - " \n", + "\n", + "loop_stmt: \"startLoop\" \"(\" identifier \",\" expression \",\" expression \")\" separator block \"endLoop\" \"(\" \")\"\n", + "\n", + "try_stmt: \"try\" \"(\" \")\" separator block \"exception\" \"(\" identifier \")\" separator block \"end\" \"(\" \")\"\n", + "\n", + "block: separator* line_or_comment (separator+ line_or_comment)* separator*\n", + "\n", "async_command: go_stmt\n", - "\n", " | gather_stmt\n", - " \n", + "\n", "go_stmt: identifier \"=\" \"go\" identifier \"(\" argument_list? \")\"\n", - "\n", "gather_stmt: identifier \"=\" \"gather\" \"(\" identifier (\",\" expression)? \")\"\n", - " \n", + "\n", "connector_cmd: connector_instantiation\n", - " \n", + "\n", "connector_instantiation: identifier \"=\" \"avapConnector\" \"(\" stringliteral \")\"\n", - " \n", + "\n", "http_command: req_post_cmd\n", - "\n", " | req_get_cmd\n", - " \n", + "\n", "req_post_cmd: \"RequestPost\" \"(\" expression \",\" expression \",\" expression \",\" expression \",\" identifier \",\" expression \")\"\n", - "\n", "req_get_cmd: \"RequestGet\" \"(\" expression \",\" expression \",\" expression \",\" identifier \",\" expression \")\"\n", - " \n", + "\n", "db_command: orm_direct\n", - "\n", " | orm_check\n", - "\n", " | orm_create\n", - "\n", " | orm_select\n", - "\n", " | orm_insert\n", - "\n", " | orm_update\n", - " \n", + "\n", "orm_direct: \"ormDirect\" \"(\" expression \",\" identifier \")\"\n", - "\n", "orm_check: \"ormCheckTable\" \"(\" expression \",\" identifier \")\"\n", - "\n", "orm_create: \"ormCreateTable\" \"(\" expression \",\" expression \",\" expression \",\" identifier \")\"\n", - " \n", + "\n", "orm_select: \"ormAccessSelect\" \"(\" orm_fields \",\" expression (\",\" expression)? \",\" identifier \")\"\n", "\n", "orm_fields: \"*\"\n", - "\n", " | expression\n", - " \n", + "\n", "orm_insert: \"ormAccessInsert\" \"(\" expression \",\" expression \",\" identifier \")\"\n", - " \n", "orm_update: \"ormAccessUpdate\" \"(\" expression \",\" expression \",\" expression \",\" expression \",\" identifier \")\"\n", - " \n", + "\n", "util_command: json_list_cmd\n", - "\n", " | crypto_cmd\n", - "\n", " | regex_cmd\n", - "\n", " | datetime_cmd\n", - "\n", " | stamp_cmd\n", - "\n", " | string_cmd\n", - "\n", " | replace_cmd\n", - " \n", + "\n", "json_list_cmd: \"variableToList\" \"(\" expression \",\" identifier \")\"\n", - "\n", " | \"itemFromList\" \"(\" identifier \",\" expression \",\" identifier \")\"\n", - "\n", " | \"variableFromJSON\" \"(\" identifier \",\" expression \",\" identifier \")\"\n", - "\n", " | \"AddVariableToJSON\" \"(\" expression \",\" expression \",\" identifier \")\"\n", - " \n", + "\n", "crypto_cmd: \"encodeSHA256\" \"(\" identifier_or_string \",\" identifier \")\"\n", - "\n", " | \"encodeMD5\" \"(\" identifier_or_string \",\" identifier \")\"\n", - " \n", + "\n", "regex_cmd: \"getRegex\" \"(\" identifier \",\" stringliteral \",\" identifier \")\"\n", - " \n", + "\n", "datetime_cmd: \"getDateTime\" \"(\" stringliteral \",\" expression \",\" stringliteral \",\" identifier \")\"\n", - " \n", + "\n", "stamp_cmd: \"stampToDatetime\" \"(\" expression \",\" stringliteral \",\" expression \",\" identifier \")\"\n", - "\n", " | \"getTimeStamp\" \"(\" stringliteral \",\" stringliteral \",\" expression \",\" identifier \")\"\n", - " \n", - "string_cmd: \"randomString\" \"(\" expression \",\" identifier \")\"\n", - " \n", + "\n", + "string_cmd: \"randomString\" \"(\" expression \",\" expression \",\" identifier \")\"\n", + "\n", "replace_cmd: \"replace\" \"(\" identifier_or_string \",\" stringliteral \",\" stringliteral \",\" identifier \")\"\n", - " \n", - "function_decl: \"function\" identifier \"(\" param_list? \")\" \"{\" EOL block \"}\" EOL\n", - " \n", + "\n", + "function_decl: \"function\" identifier \"(\" param_list? \")\" \"{\" separator block \"}\"\n", + "\n", "param_list: identifier (\",\" identifier)*\n", - " \n", + "\n", "return_stmt: \"return\" \"(\" expression? \")\"\n", - " \n", + "\n", "modularity_cmd: include_stmt\n", - "\n", " | import_stmt\n", - " \n", + "\n", "include_stmt: \"include\" stringliteral\n", - "\n", "import_stmt: \"import\" (\"<\" identifier \">\" | stringliteral)\n", - " \n", - "?expression: logical_or\n", - " \n", - "?logical_or: logical_and (\"or\" logical_and)*\n", "\n", + "?expression: logical_or\n", + "\n", + "?logical_or: logical_and (\"or\" logical_and)*\n", "?logical_and: logical_not (\"and\" logical_not)*\n", "\n", "?logical_not: \"not\" logical_not\n", - "\n", " | comparison\n", - " \n", + "\n", "?comparison: arithmetic (comp_op arithmetic)*\n", "\n", "comp_op: \"==\" | \"!=\" | \"<\" | \">\" | \"<=\" | \">=\" | \"in\" | \"is\"\n", - " \n", - "?arithmetic: term ((\"+\" | \"-\") term)*\n", "\n", + "?arithmetic: term ((\"+\" | \"-\") term)*\n", "?term: factor ((\"*\" | \"/\" | \"%\") factor)*\n", "\n", "?factor: (\"+\" | \"-\") factor\n", - "\n", " | power\n", - " \n", + "\n", "?power: primary (\"**\" factor)?\n", - " \n", + "\n", "?primary: atom postfix*\n", - " \n", + "\n", "postfix: \".\" identifier\n", - "\n", " | \"[\" expression \"]\"\n", - "\n", " | \"[\" expression? \":\" expression? (\":\" expression?)? \"]\"\n", - "\n", " | \"(\" argument_list? \")\"\n", - " \n", + "\n", "?atom: identifier\n", - "\n", " | \"$\" identifier\n", - "\n", " | literal\n", - "\n", " | \"(\" expression \")\"\n", - "\n", " | list_display\n", - "\n", " | dict_display\n", - " \n", - "list_display: \"[\" argument_list? \"]\"\n", "\n", + "list_display: \"[\" argument_list? \"]\"\n", " | \"[\" expression \"for\" identifier \"in\" expression if_clause? \"]\"\n", - " \n", + "\n", "if_clause: \"if\" expression\n", - " \n", + "\n", "dict_display: \"{\" key_datum_list? \"}\"\n", "\n", "key_datum_list: key_datum (\",\" key_datum)*\n", - "\n", "key_datum: expression \":\" expression\n", - " \n", + "\n", "argument_list: expression (\",\" expression)*\n", - " \n", + "\n", "number: FLOATNUMBER\n", - "\n", " | INTEGER\n", - " \n", + "\n", "literal: stringliteral\n", - "\n", " | number\n", - "\n", " | boolean\n", - "\n", " | \"None\"\n", - " \n", + "\n", "boolean: \"True\" | \"False\"\n", - " \n", + "\n", "INTEGER: /[0-9]+/\n", - "\n", "FLOATNUMBER: /(?:[0-9]+\\.[0-9]*|\\.[0-9]+)/\n", - " \n", + "\n", "stringliteral: STRING_DOUBLE\n", - "\n", " | STRING_SINGLE\n", - " \n", - "STRING_DOUBLE: /\"([^\"\\\\]|\\\\[\"'\\\\ntr0])*\"/\n", "\n", - "STRING_SINGLE: /'([^'\\\\]|\\\\[\"'\\\\ntr0])*'/\n", - " \n", + "STRING_DOUBLE: /\"([^\"\\\\]|\\\\.)*\"/\n", + "STRING_SINGLE: /'([^'\\\\]|\\\\.)*'/\n", + "\n", "identifier_or_string: identifier\n", - "\n", " | stringliteral\n", - " \n", + "\n", "IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/\n", - " \n", + "\n", "%ignore /[ \\t]+/\n", - " \n", "\"\"\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, + "id": "95267b2a", + "metadata": {}, + "outputs": [], + "source": [ + "# STRING_DOUBLE: /\"([^\"\\\\]|\\\\[\"'\\\\ntr0])*\"/\n", + "# STRING_SINGLE: /'([^'\\\\]|\\\\[\"'\\\\ntr0])*'/" + ] + }, + { + "cell_type": "code", + "execution_count": 31, "id": "0cab2125", "metadata": {}, "outputs": [], @@ -402,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 32, "id": "6a266b2a", "metadata": {}, "outputs": [], @@ -412,17 +306,17 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 33, "id": "522bdb3b", "metadata": {}, "outputs": [], "source": [ - "parser = Lark(grammar, parser=\"lalr\", propagate_positions=True)" + "parser = Lark(grammar, parser=\"lalr\", propagate_positions=True, start=\"program\")" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 23, "id": "d3aa8026", "metadata": {}, "outputs": [ @@ -430,42 +324,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Parsed 0 files successfully\n", - "Failed to parse 33 files\n", + "Parsed 26 files successfully\n", + "Failed to parse 7 files\n", "\n", "Files processed:\n", - " - calculo_de_expiracion.avap: error\n", - " - fecha_para_base_de_datos.avap: error\n", - " - comparacion_simple.avap: error\n", - " - ormAccessCreate.avap: error\n", " - if_desigualdad.avap: error\n", - " - conversion_timestamp_legible.avap: error\n", - " - validacion_de_nulo.avap: error\n", - " - concatenacion_dinamica.avap: error\n", - " - obtencion_timestamp.avap: error\n", " - manejo_error_sql_critico.avap: error\n", - " - construccion_dinamica_de_objeto.avap: error\n", - " - respuesta_multiple.avap: error\n", - " - limpieza_de_strings.avap: error\n", " - captura_de_listas_multiples.avap: error\n", - " - contador_de_parametros.avap: error\n", - " - captura_de_id.avap: error\n", - " - hello_world.avap: error\n", - " - bucle_1_10.avap: error\n", - " - hash_SHA256_para_integridad.avap: error\n", - " - funcion_de_suma.avap: error\n", " - expresion_compleja.avap: error\n", - " - hola_mundo.avap: error\n", - " - generador_de_tokens_aleatorios.avap: error\n", - " - asignacion_matematica.avap: error\n", - " - salida_bucle_correcta.avap: error\n", - " - bucle_longitud_de_datos.avap: error\n", - " - asignacion_booleana.avap: error\n", " - validacion_in_pertenece_a_lista.avap: error\n", " - try_catch_request.avap: error\n", - " - funcion_validacion_acceso.avap: error\n", - " - referencia_por_valor.avap: error\n", - " - paginacion_dinamica_recursos.avap: error\n", " - else_estandar.avap: error\n" ] } @@ -518,57 +386,39 @@ "print(f\"Failed to parse {len([f for f in parsed_files.values() if f['status'] == 'error'])} files\")\n", "print(\"\\nFiles processed:\")\n", "for filename, result in parsed_files.items():\n", - " print(f\" - {filename}: {result['status']}\")" + " if result[\"status\"] == \"error\":\n", + " print(f\" - {filename}: {result['status']}\")" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "35b34377", "metadata": {}, "outputs": [], "source": [ - "tree = parser.parse(code)" + "parsed_files" ] }, { "cell_type": "code", - "execution_count": 71, - "id": "025b46a7", + "execution_count": 34, + "id": "e9e6c0fb", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "start\n", - " program\n", - " line\t\n", - "\n", - " line\n", - " statement\n", - " system_command\n", - " addvar_cmd\n", - " addvar_arg\n", - " identifier\tmensaje\n", - " addvar_arg\n", - " literal\n", - " stringliteral\t\"Hola mundo desde AVAP\"\n", - " \n", - "\n", - " line\n", - " statement\n", - " io_command\n", - " addresult_cmd\n", - " identifier\tmensaje\n", - " \n", - "\n", - "\n" - ] + "data": { + "text/plain": [ + "Tree('program', [Token('EOL', '\\n'), Tree('system_command', [Tree('addvar_cmd', [Tree('addvar_arg', [Tree('identifier', [Token('IDENTIFIER', 'mensaje')])]), Tree('addvar_arg', [Tree('literal', [Tree('stringliteral', [Token('STRING_DOUBLE', '\"Hola mundo desde AVAP\"')])])])])]), Token('EOL', '\\n'), Tree('io_command', [Tree('addresult', [Tree('identifier', [Token('IDENTIFIER', 'mensaje')])])]), Token('EOL', '\\n')])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(tree.pretty())" + "parser.parse(code)" ] } ],