diff --git a/scratches/pseco/ingestion/n00 Dual Index v1.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v1.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 Dual Index v1.ipynb rename to scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v1.ipynb diff --git a/scratches/pseco/ingestion/n00 Dual Index v2.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v2.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 Dual Index v2.ipynb rename to scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v2.ipynb diff --git a/scratches/pseco/ingestion/n00 Dual Index v3.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v3.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 Dual Index v3.ipynb rename to scratches/pseco/ingestion/Code Ingestion/n00 Dual Index v3.ipynb diff --git a/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb similarity index 95% rename from scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb rename to scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb index 76ef77a..20981a1 100644 --- a/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb +++ b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "id": "0a8abbfa", "metadata": {}, "outputs": [ @@ -12,7 +12,7 @@ "True" ] }, - "execution_count": 9, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -24,7 +24,7 @@ "from dataclasses import dataclass\n", "from pathlib import Path\n", "from typing import Any, Dict, List, Optional, Tuple\n", - "\n", + "from bnf import grammar\n", "import nltk\n", "from elasticsearch import Elasticsearch\n", "from langchain_core.documents import Document\n", @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "id": "5c9d292b", "metadata": {}, "outputs": [], @@ -53,28 +53,6 @@ "embedding_dim = config.hidden_size" ] }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0e1cd9b9", - "metadata": {}, - "outputs": [], - "source": [ - "grammar = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.lark\").read_text(\n", - " encoding=\"utf-8\"\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95e21900", - "metadata": {}, - "outputs": [], - "source": [ - "parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)" - ] - }, { "cell_type": "markdown", "id": "baa779f3", @@ -85,7 +63,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, + "id": "26927d0c", + "metadata": {}, + "outputs": [], + "source": [ + "def bnf_to_lark(bnf_text):\n", + " text = re.sub(r\"<([^>]+)>\", r\"\\1\", bnf_text) # remove <>\n", + " text = text.replace(\"::=\", \":\")\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "89be8bf6", "metadata": {}, "outputs": [], @@ -178,6 +169,48 @@ "# BNF " ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "26ad9c81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "line 9 : syntax error at or before | = |\n", + "syntax error at end of file (missing ; ?)\n" + ] + }, + { + "data": { + "text/plain": [ + "({}, None)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grammar_ = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n", + " encoding=\"utf-8\"\n", + ")\n", + "grammar(grammar_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cdf69c4", + "metadata": {}, + "outputs": [], + "source": [ + "parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)" + ] + }, { "cell_type": "code", "execution_count": 18, diff --git a/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb b/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb new file mode 100644 index 0000000..785549e --- /dev/null +++ b/scratches/pseco/ingestion/Code Ingestion/n01 BNF Check.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 51, + "id": "0a8abbfa", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "from dataclasses import dataclass\n", + "\n", + "from typing import Any, Dict, List, Optional, Tuple\n", + "\n", + "from lark import Tree, Lark\n", + "\n", + "\n", + "from bnf import grammar as bnf_grammar, parse as bnf_parse\n", + "from ebnf import grammar as ebnf_grammar, parse as ebnf_parse\n", + "\n", + "from src.config import DATA_DIR" + ] + }, + { + "cell_type": "markdown", + "id": "baa779f3", + "metadata": {}, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "26927d0c", + "metadata": {}, + "outputs": [], + "source": [ + "def bnf_to_lark(bnf_text):\n", + " text = re.sub(r\"<([^>]+)>\", r\"\\1\", bnf_text) # remove <>\n", + " text = text.replace(\"::=\", \":\")\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "89be8bf6", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class Chunk:\n", + " text: str\n", + " kind: str\n", + " metadata: Dict[str, Any]\n", + "\n", + "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n", + " m = node.meta\n", + " s = getattr(m, \"start_pos\", None)\n", + " e = getattr(m, \"end_pos\", None)\n", + " if s is None or e is None:\n", + " return None\n", + " return s, e\n", + "\n", + "def _iter_trees(t: Tree):\n", + " yield t\n", + " for c in t.children:\n", + " if isinstance(c, Tree):\n", + " yield from _iter_trees(c)\n", + "\n", + "def _cmd_name(line: str) -> Optional[str]:\n", + " m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n", + " return m.group(1) if m else None\n", + "\n", + "def chunk_atomic_lines(code: str) -> List[Chunk]:\n", + " tree = parser.parse(code)\n", + " chunks: List[Chunk] = []\n", + "\n", + " for node in _iter_trees(tree):\n", + " if node.data == \"stmt_line\":\n", + " sp = _span(node)\n", + " if not sp:\n", + " continue\n", + " s, e = sp\n", + " text = code[s:e].strip()\n", + " if not text:\n", + " continue\n", + "\n", + " chunks.append(\n", + " Chunk(\n", + " text=text,\n", + " kind=\"line\",\n", + " metadata={\n", + " \"granularity\": \"atomic\",\n", + " \"command\": _cmd_name(text)\n", + " }\n", + " )\n", + " )\n", + " return chunks\n", + "\n", + "def chunk_blocks(code: str) -> List[Chunk]:\n", + " tree = parser.parse(code)\n", + " chunks: List[Chunk] = []\n", + "\n", + " for node in _iter_trees(tree):\n", + " if node.data in (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\"):\n", + " sp = _span(node)\n", + " if not sp:\n", + " continue\n", + " s, e = sp\n", + " text = code[s:e].strip()\n", + " if not text:\n", + " continue\n", + "\n", + " chunks.append(\n", + " Chunk(\n", + " text=text,\n", + " kind=node.data,\n", + " metadata={\"granularity\": \"block\"}\n", + " )\n", + " )\n", + " return chunks\n", + "\n", + "def chunk_avap_code(code: str) -> List[Chunk]:\n", + " # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n", + " blocks = chunk_blocks(code)\n", + " lines = chunk_atomic_lines(code)\n", + " return blocks + lines" + ] + }, + { + "cell_type": "markdown", + "id": "23a92e13", + "metadata": {}, + "source": [ + "# BNF check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a10a1017", + "metadata": {}, + "outputs": [], + "source": [ + "bnf_text = r\"\"\"\n", + " ::= \n", + " ::= a | b | c\n", + " ::= [0-9]\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "4790023e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "line 1 : syntax error at or before EOL = \n", + "\n", + "line 2 : illegal character '='\n", + "BNF: False\n" + ] + } + ], + "source": [ + "bnf_grammar(bnf_text)\n", + "print(\"BNF:\", bnf_parse(\"a=7\"))" + ] + }, + { + "cell_type": "markdown", + "id": "49953efd", + "metadata": {}, + "source": [ + "# BNF conversion to EBNF" + ] + }, + { + "cell_type": "markdown", + "id": "32dbc2c5", + "metadata": {}, + "source": [ + "# EBNF Check" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "37968906", + "metadata": {}, + "outputs": [], + "source": [ + "ebnf_text = r\"\"\"\n", + "assign ::= name '=' num ;\n", + "name ::= 'a' | 'b' | 'c' ;\n", + "num ::= [0-9] ;\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "b234f2c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BNF: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating LALR tables\n" + ] + } + ], + "source": [ + "ebnf_grammar(ebnf_text)\n", + "print(\"BNF:\", ebnf_parse(\"a=7\"))" + ] + }, + { + "cell_type": "markdown", + "id": "66fb8fee", + "metadata": {}, + "source": [ + "# Lark check EBNF Style" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "08e53ccb", + "metadata": {}, + "outputs": [], + "source": [ + "ebnf_text = r\"\"\"\n", + "start: assign\n", + "\n", + "assign: name \"=\" num\n", + "name: \"a\" | \"b\" | \"c\"\n", + "num: DIGIT\n", + "\n", + "DIGIT: /[0-9]/\n", + "\n", + "%ignore \" \"\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "52935608", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tree(Token('RULE', 'start'), [Tree(Token('RULE', 'assign'), [Tree(Token('RULE', 'name'), []), Tree(Token('RULE', 'num'), [Token('DIGIT', '7')])])])\n" + ] + } + ], + "source": [ + "parser = Lark(ebnf_text)\n", + "\n", + "print(parser.parse(\"a=7\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "assistance-engine", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scratches/pseco/ingestion/n00 Count tokens.ipynb b/scratches/pseco/ingestion/Doc Ingestion/n00 Count tokens.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 Count tokens.ipynb rename to scratches/pseco/ingestion/Doc Ingestion/n00 Count tokens.ipynb diff --git a/scratches/pseco/ingestion/n00 ingestion notebook langgraph.ipynb b/scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook langgraph.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 ingestion notebook langgraph.ipynb rename to scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook langgraph.ipynb diff --git a/scratches/pseco/ingestion/n00 ingestion notebook v1.ipynb b/scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook v1.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 ingestion notebook v1.ipynb rename to scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook v1.ipynb diff --git a/scratches/pseco/ingestion/n00 ingestion notebook v2.ipynb b/scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook v2.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 ingestion notebook v2.ipynb rename to scratches/pseco/ingestion/Doc Ingestion/n00 ingestion notebook v2.ipynb diff --git a/scratches/pseco/ingestion/n00 test.ipynb b/scratches/pseco/ingestion/Doc Ingestion/n00 test.ipynb similarity index 100% rename from scratches/pseco/ingestion/n00 test.ipynb rename to scratches/pseco/ingestion/Doc Ingestion/n00 test.ipynb