From 183c04829c5c05b119be12606452c3e0b7314e76 Mon Sep 17 00:00:00 2001
From: pseco <pseco@mrhouston.net>
Date: Thu, 5 Mar 2026 11:00:30 +0100
Subject: [PATCH] Update changelog for version 1.2.0: add new modules, refactor
 server integration, and enhance dependency management

---
 changelog                                     |  24 +
 .../ingestion/n01 Proper Lark Chunking.ipynb  | 412 ++++++++++++++++++
 src/config.py                                 |   2 +
 3 files changed, 438 insertions(+)
 create mode 100644 scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb

diff --git a/changelog b/changelog
index 87fa691..061cf3e 100644
--- a/changelog
+++ b/changelog
@@ -4,6 +4,30 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
 
 ---
 
+## [1.2.0] - 2026-03-04
+
+### Added
+- IMPLEMENTED: 
+    - `utils/`: factory modules created for embedding model and LLM generation.
+    - `graph.py`: workflow graph orchestration module added.
+    - `prompts.py`: centralized prompt definitions added.
+    - `state.py`: shared state management module added.
+
+### Changed
+- REFACTORED: `server.py` updated to integrate the new graph/state/prompt and utils-based architecture.
+- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
+- BUILD/OPS: `Makefile` updated with commands:
+    - `ollama_local`
+    - `tunnels_down`
+    - `sync_data_down`
+    - `sync_data_up`
+
+### Fixed
+- RESOLVED: Gaps in command coverage and dependency inconsistencies affecting local execution and data sync workflows.
+
+
+
+
 ## [1.1.0] - 2026-02-16
 
 ### Added
diff --git a/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb b/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb
new file mode 100644
index 0000000..183cb7f
--- /dev/null
+++ b/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb	
@@ -0,0 +1,412 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a8abbfa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import re\n",
+    "import uuid\n",
+    "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "from typing import Any, Dict, List, Optional, Tuple\n",
+    "\n",
+    "import nltk\n",
+    "from elasticsearch import Elasticsearch\n",
+    "from langchain_core.documents import Document\n",
+    "from langchain_elasticsearch import ElasticsearchStore\n",
+    "from langchain_ollama import OllamaEmbeddings\n",
+    "from lark import Lark, Token, Transformer, Tree\n",
+    "from transformers import AutoConfig\n",
+    "\n",
+    "from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n",
+    "                        ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n",
+    "                        ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n",
+    "                        OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n",
+    "                        OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n",
+    "\n",
+    "nltk.download(\"punkt\", quiet=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "5c9d292b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n",
+    "embedding_dim = config.hidden_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "0e1cd9b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grammar = (DATA_DIR / \"raw\" / \"code\" / \"EBNF_v2.txt\").read_text(\n",
+    "    encoding=\"utf-8\"\n",
+    ")\n",
+    "code = (DATA_DIR / \"raw\" / \"code\" / \"Code_Snippets_v1.txt\").read_text(\n",
+    "    encoding=\"utf-8\"\n",
+    ")\n",
+    "parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "baa779f3",
+   "metadata": {},
+   "source": [
+    "# Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89be8bf6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@dataclass\n",
+    "class Chunk:\n",
+    "    text: str\n",
+    "    kind: str\n",
+    "    metadata: Dict[str, Any]\n",
+    "\n",
+    "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
+    "    m = node.meta\n",
+    "    s = getattr(m, \"start_pos\", None)\n",
+    "    e = getattr(m, \"end_pos\", None)\n",
+    "    if s is None or e is None:\n",
+    "        return None\n",
+    "    return s, e\n",
+    "\n",
+    "def _iter_trees(t: Tree):\n",
+    "    yield t\n",
+    "    for c in t.children:\n",
+    "        if isinstance(c, Tree):\n",
+    "            yield from _iter_trees(c)\n",
+    "\n",
+    "def _cmd_name(line: str) -> Optional[str]:\n",
+    "    m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
+    "    return m.group(1) if m else None\n",
+    "\n",
+    "def chunk_atomic_lines(code: str) -> List[Chunk]:\n",
+    "    tree = parser.parse(code)\n",
+    "    chunks: List[Chunk] = []\n",
+    "\n",
+    "    for node in _iter_trees(tree):\n",
+    "        if node.data == \"stmt_line\":\n",
+    "            sp = _span(node)\n",
+    "            if not sp:\n",
+    "                continue\n",
+    "            s, e = sp\n",
+    "            text = code[s:e].strip()\n",
+    "            if not text:\n",
+    "                continue\n",
+    "\n",
+    "            chunks.append(\n",
+    "                Chunk(\n",
+    "                    text=text,\n",
+    "                    kind=\"line\",\n",
+    "                    metadata={\n",
+    "                        \"granularity\": \"atomic\",\n",
+    "                        \"command\": _cmd_name(text)\n",
+    "                    }\n",
+    "                )\n",
+    "            )\n",
+    "    return chunks\n",
+    "\n",
+    "def chunk_blocks(code: str) -> List[Chunk]:\n",
+    "    tree = parser.parse(code)\n",
+    "    chunks: List[Chunk] = []\n",
+    "\n",
+    "    for node in _iter_trees(tree):\n",
+    "        if node.data in (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\"):\n",
+    "            sp = _span(node)\n",
+    "            if not sp:\n",
+    "                continue\n",
+    "            s, e = sp\n",
+    "            text = code[s:e].strip()\n",
+    "            if not text:\n",
+    "                continue\n",
+    "\n",
+    "            chunks.append(\n",
+    "                Chunk(\n",
+    "                    text=text,\n",
+    "                    kind=node.data,\n",
+    "                    metadata={\"granularity\": \"block\"}\n",
+    "                )\n",
+    "            )\n",
+    "    return chunks\n",
+    "\n",
+    "def chunk_avap_code(code: str) -> List[Chunk]:\n",
+    "    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
+    "    blocks = chunk_blocks(code)\n",
+    "    lines = chunk_atomic_lines(code)\n",
+    "    return blocks + lines"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23a92e13",
+   "metadata": {},
+   "source": [
+    "# BNF "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "19253100",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "code = \"\"\"\n",
+    "    addVar(base, 1000)\n",
+    "    addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\n",
+    "    addResult(copia)\n",
+    "\"\"\"\n",
+    "tree = parser.parse(code)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "04bf9223",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'program\\n  simple_stmt\\t    addVar(base, 1000)\\n  simple_stmt\\t    addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\\n  simple_stmt\\t    addResult(copia)\\n'"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tree.pretty()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "b2999a98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunks = chunk_avap_code(code)\n",
+    "\n",
+    "for c in chunks:\n",
+    "    print(\"----\")\n",
+    "    print(\"TYPE:\", c.kind)\n",
+    "    print(\"TEXT:\\n\", c.text)\n",
+    "    print(\"META:\", c.metadata)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77f6c552",
+   "metadata": {},
+   "source": [
+    "## Elastic Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "09ce3e29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "es = Elasticsearch(\n",
+    "    ELASTICSEARCH_URL,\n",
+    "    request_timeout=120,\n",
+    "    max_retries=5,\n",
+    "    retry_on_timeout=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "d575c386",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
+    "    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "40ea0af8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "avap-code\n",
+      "avap-docs-test\n"
+     ]
+    }
+   ],
+   "source": [
+    "for index in es.indices.get(index=\"*\"):\n",
+    "    print(index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "4e091b39",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
+    "embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "5aff21c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# index into Elasticsearch\n",
+    "db = ElasticsearchStore.from_documents(\n",
+    "    code_chunks,\n",
+    "    embeddings,\n",
+    "    client=es,\n",
+    "    index_name=ELASTICSEARCH_CODE_INDEX,\n",
+    "    distance_strategy=\"COSINE\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74c0a377",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = es.search(\n",
+    "    index=ELASTICSEARCH_CODE_INDEX,\n",
+    "    body={\n",
+    "        \"query\": {\"match_all\": {}},\n",
+    "        \"size\": 10 \n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "for hit in response[\"hits\"][\"hits\"]:\n",
+    "    print(\"ID:\", hit[\"_id\"])\n",
+    "    print(\"Source:\", hit[\"_source\"])\n",
+    "    print(\"-\" * 40)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d823650e",
+   "metadata": {},
+   "source": [
+    "# Retrieve"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5732a27d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_retriever = db.as_retriever(\n",
+    "    search_type=\"similarity\",\n",
+    "    search_kwargs={\"k\": 5}\n",
+    "    ) \n",
+    "\n",
+    "docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8706506f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
+    "\n",
+    "vector_store = ElasticsearchStore(\n",
+    "    client=es,\n",
+    "    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
+    "    embedding=embeddings,\n",
+    "    query_field=\"text\",\n",
+    "    vector_query_field=\"vector\",\n",
+    ")\n",
+    "\n",
+    "results = vector_store.similarity_search_with_score(\n",
+    "    query=\"What data types does AVAP have?\",\n",
+    "    k=50\n",
+    ")\n",
+    "\n",
+    "results"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "assistance-engine",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/config.py b/src/config.py
index 952b935..d2e70dc 100644
--- a/src/config.py
+++ b/src/config.py
@@ -10,6 +10,8 @@ OLLAMA_URL=os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 OLLAMA_LOCAL_URL=os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434")
 OLLAMA_MODEL_NAME=os.getenv("OLLAMA_MODEL_NAME", "qwen3-0.6B:latest")
 OLLAMA_EMB_MODEL_NAME=os.getenv("OLLAMA_EMB_MODEL_NAME", "qwen3-0.6B-emb:latest")
+ELASTICSEARCH_DOCS_INDEX = os.getenv("ELASTICSEARCH_DOCS_INDEX")
+ELASTICSEARCH_CODE_INDEX = os.getenv("ELASTICSEARCH_CODE_INDEX")
  
 LANGFUSE_HOST=os.getenv("LANGFUSE_HOST", "http://45.77.119.180")
 LANGFUSE_PUBLIC_KEY=os.getenv("LANGFUSE_PUBLIC_KEY", "pk-lf-0e6db694-3e95-4dd4-aedf-5a2694267058")