From 183c04829c5c05b119be12606452c3e0b7314e76 Mon Sep 17 00:00:00 2001 From: pseco Date: Thu, 5 Mar 2026 11:00:30 +0100 Subject: [PATCH] Update changelog for version 1.2.0: add new modules, refactor server integration, and enhance dependency management --- changelog | 24 + .../ingestion/n01 Proper Lark Chunking.ipynb | 412 ++++++++++++++++++ src/config.py | 2 + 3 files changed, 438 insertions(+) create mode 100644 scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb diff --git a/changelog b/changelog index 87fa691..061cf3e 100644 --- a/changelog +++ b/changelog @@ -4,6 +4,30 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th --- +## [1.2.0] - 2026-03-04 + +### Added +- IMPLEMENTED: + - `utils/`: factory modules created for embedding model and LLM generation. + - `graph.py`: workflow graph orchestration module added. + - `prompts.py`: centralized prompt definitions added. + - `state.py`: shared state management module added. + +### Changed +- REFACTORED: `server.py` updated to integrate the new graph/state/prompt and utils-based architecture. +- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules. +- BUILD/OPS: `Makefile` updated with commands: + - `ollama_local` + - `tunnels_down` + - `sync_data_down` + - `sync_data_up` + +### Fixed +- RESOLVED: Command coverage and dependency consistency for local execution and data sync workflows. 
+ + + + ## [1.1.0] - 2026-02-16 ### Added diff --git a/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb b/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb new file mode 100644 index 0000000..183cb7f --- /dev/null +++ b/scratches/pseco/ingestion/n01 Proper Lark Chunking.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0a8abbfa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import re\n", + "import uuid\n", + "from dataclasses import dataclass\n", + "from pathlib import Path\n", + "from typing import Any, Dict, List, Optional, Tuple\n", + "\n", + "import nltk\n", + "from elasticsearch import Elasticsearch\n", + "from langchain_core.documents import Document\n", + "from langchain_elasticsearch import ElasticsearchStore\n", + "from langchain_ollama import OllamaEmbeddings\n", + "from lark import Lark, Token, Transformer, Tree\n", + "from transformers import AutoConfig\n", + "\n", + "from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n", + " ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n", + " ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n", + " OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n", + " OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n", + "\n", + "nltk.download(\"punkt\", quiet=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5c9d292b", + "metadata": {}, + "outputs": [], + "source": [ + "config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n", + "embedding_dim = config.hidden_size" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0e1cd9b9", + "metadata": {}, + "outputs": [], + "source": [ + "grammar = (DATA_DIR / \"raw\" / \"code\" / \"EBNF_v2.txt\").read_text(\n", + " encoding=\"utf-8\"\n", + ")\n", + "code = (DATA_DIR / \"raw\" / \"code\" / \"Code_Snippets_v1.txt\").read_text(\n", + " 
encoding=\"utf-8\"\n", + ")\n", + "parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)" + ] + }, + { + "cell_type": "markdown", + "id": "baa779f3", + "metadata": {}, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89be8bf6", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class Chunk:\n", + " text: str\n", + " kind: str\n", + " metadata: Dict[str, Any]\n", + "\n", + "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n", + " m = node.meta\n", + " s = getattr(m, \"start_pos\", None)\n", + " e = getattr(m, \"end_pos\", None)\n", + " if s is None or e is None:\n", + " return None\n", + " return s, e\n", + "\n", + "def _iter_trees(t: Tree):\n", + " yield t\n", + " for c in t.children:\n", + " if isinstance(c, Tree):\n", + " yield from _iter_trees(c)\n", + "\n", + "def _cmd_name(line: str) -> Optional[str]:\n", + " m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n", + " return m.group(1) if m else None\n", + "\n", + "def chunk_atomic_lines(code: str) -> List[Chunk]:\n", + " tree = parser.parse(code)\n", + " chunks: List[Chunk] = []\n", + "\n", + " for node in _iter_trees(tree):\n", + " if node.data == \"stmt_line\":\n", + " sp = _span(node)\n", + " if not sp:\n", + " continue\n", + " s, e = sp\n", + " text = code[s:e].strip()\n", + " if not text:\n", + " continue\n", + "\n", + " chunks.append(\n", + " Chunk(\n", + " text=text,\n", + " kind=\"line\",\n", + " metadata={\n", + " \"granularity\": \"atomic\",\n", + " \"command\": _cmd_name(text)\n", + " }\n", + " )\n", + " )\n", + " return chunks\n", + "\n", + "def chunk_blocks(code: str) -> List[Chunk]:\n", + " tree = parser.parse(code)\n", + " chunks: List[Chunk] = []\n", + "\n", + " for node in _iter_trees(tree):\n", + " if node.data in (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\"):\n", + " sp = _span(node)\n", + " if not sp:\n", + " continue\n", + " s, e = sp\n", + " 
text = code[s:e].strip()\n", + " if not text:\n", + " continue\n", + "\n", + " chunks.append(\n", + " Chunk(\n", + " text=text,\n", + " kind=node.data,\n", + " metadata={\"granularity\": \"block\"}\n", + " )\n", + " )\n", + " return chunks\n", + "\n", + "def chunk_avap_code(code: str) -> List[Chunk]:\n", + " # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n", + " blocks = chunk_blocks(code)\n", + " lines = chunk_atomic_lines(code)\n", + " return blocks + lines" + ] + }, + { + "cell_type": "markdown", + "id": "23a92e13", + "metadata": {}, + "source": [ + "# BNF " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "19253100", + "metadata": {}, + "outputs": [], + "source": [ + "code = \"\"\"\n", + " addVar(base, 1000)\n", + " addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\n", + " addResult(copia)\n", + "\"\"\"\n", + "tree = parser.parse(code)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "04bf9223", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'program\\n simple_stmt\\t addVar(base, 1000)\\n simple_stmt\\t addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\\n simple_stmt\\t addResult(copia)\\n'" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree.pretty()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b2999a98", + "metadata": {}, + "outputs": [], + "source": [ + "chunks = chunk_avap_code(code)\n", + "\n", + "for c in chunks:\n", + " print(\"----\")\n", + " print(\"TYPE:\", c.kind)\n", + " print(\"TEXT:\\n\", c.text)\n", + " print(\"META:\", c.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "77f6c552", + "metadata": {}, + "source": [ + "## Elastic Search" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "09ce3e29", + "metadata": {}, + "outputs": [], + "source": [ + "es = Elasticsearch(\n", + " 
ELASTICSEARCH_URL,\n", + " request_timeout=120,\n", + " max_retries=5,\n", + " retry_on_timeout=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "d575c386", + "metadata": {}, + "outputs": [], + "source": [ + "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n", + " es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "40ea0af8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "avap-code\n", + "avap-docs-test\n" + ] + } + ], + "source": [ + "for index in es.indices.get(index=\"*\"):\n", + " print(index)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "4e091b39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n", + "embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "5aff21c0", + "metadata": {}, + "outputs": [], + "source": [ + "# index into Elasticsearch\n", + "db = ElasticsearchStore.from_documents(\n", + " code_chunks,\n", + " embeddings,\n", + " client=es,\n", + " index_name=ELASTICSEARCH_CODE_INDEX,\n", + " distance_strategy=\"COSINE\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74c0a377", + "metadata": {}, + "outputs": [], + "source": [ + "response = es.search(\n", + " index=ELASTICSEARCH_CODE_INDEX,\n", + " 
body={\n", + " \"query\": {\"match_all\": {}},\n", + " \"size\": 10 \n", + " }\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " print(\"ID:\", hit[\"_id\"])\n", + " print(\"Source:\", hit[\"_source\"])\n", + " print(\"-\" * 40)" + ] + }, + { + "cell_type": "markdown", + "id": "d823650e", + "metadata": {}, + "source": [ + "# Retrieve" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5732a27d", + "metadata": {}, + "outputs": [], + "source": [ + "base_retriever = db.as_retriever(\n", + " search_type=\"similarity\",\n", + " search_kwargs={\"k\": 5}\n", + " ) \n", + "\n", + "docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n", + "docs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8706506f", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n", + "\n", + "vector_store = ElasticsearchStore(\n", + " client=es,\n", + " index_name=ELASTICSEARCH_DOCS_INDEX,\n", + " embedding=embeddings,\n", + " query_field=\"text\",\n", + " vector_query_field=\"vector\",\n", + ")\n", + "\n", + "results = vector_store.similarity_search_with_score(\n", + " query=\"What data types does AVAP have?\",\n", + " k=50\n", + ")\n", + "\n", + "results" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "assistance-engine", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/config.py b/src/config.py index 952b935..d2e70dc 100644 --- a/src/config.py +++ b/src/config.py @@ -10,6 +10,8 @@ OLLAMA_URL=os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") OLLAMA_LOCAL_URL=os.getenv("OLLAMA_LOCAL_URL", 
"http://localhost:11434") OLLAMA_MODEL_NAME=os.getenv("OLLAMA_MODEL_NAME", "qwen3-0.6B:latest") OLLAMA_EMB_MODEL_NAME=os.getenv("OLLAMA_EMB_MODEL_NAME", "qwen3-0.6B-emb:latest") +ELASTICSEARCH_DOCS_INDEX = os.getenv("ELASTICSEARCH_DOCS_INDEX") +ELASTICSEARCH_CODE_INDEX = os.getenv("ELASTICSEARCH_CODE_INDEX") LANGFUSE_HOST=os.getenv("LANGFUSE_HOST", "http://45.77.119.180") LANGFUSE_PUBLIC_KEY=os.getenv("LANGFUSE_PUBLIC_KEY", "pk-lf-0e6db694-3e95-4dd4-aedf-5a2694267058")