Update changelog for version 1.2.0: add new modules, refactor server integration, and enhance dependency management

This commit is contained in:
pseco 2026-03-05 11:00:30 +01:00
parent f15266f345
commit 183c04829c
3 changed files with 438 additions and 0 deletions

View File

@ -4,6 +4,30 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
---
## [1.2.0] - 2026-03-04
### Added
- IMPLEMENTED:
- `utils/`: factory modules created for embedding model and LLM generation.
- `graph.py`: workflow graph orchestration module added.
- `prompts.py`: centralized prompt definitions added.
- `state.py`: shared state management module added.
### Changed
- REFACTORED: `server.py` updated to integrate the new graph/state/prompt and utils-based architecture.
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
- BUILD/OPS: `Makefile` updated with commands:
- `ollama_local`
- `tunnels_down`
- `sync_data_down`
- `sync_data_up`
### Fixed
- RESOLVED: Command coverage and dependency consistency for local execution and data sync workflows.
## [1.1.0] - 2026-02-16
### Added

View File

@ -0,0 +1,412 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0a8abbfa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import re\n",
"import uuid\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Any, Dict, List, Optional, Tuple\n",
"\n",
"import nltk\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_ollama import OllamaEmbeddings\n",
"from lark import Lark, Token, Transformer, Tree\n",
"from transformers import AutoConfig\n",
"\n",
"from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n",
" ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n",
" ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n",
" OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n",
" OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n",
"\n",
"nltk.download(\"punkt\", quiet=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5c9d292b",
"metadata": {},
"outputs": [],
"source": [
"config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n",
"embedding_dim = config.hidden_size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "0e1cd9b9",
"metadata": {},
"outputs": [],
"source": [
"grammar = (DATA_DIR / \"raw\" / \"code\" / \"EBNF_v2.txt\").read_text(\n",
" encoding=\"utf-8\"\n",
")\n",
"code = (DATA_DIR / \"raw\" / \"code\" / \"Code_Snippets_v1.txt\").read_text(\n",
" encoding=\"utf-8\"\n",
")\n",
"parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)"
]
},
{
"cell_type": "markdown",
"id": "baa779f3",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89be8bf6",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class Chunk:\n",
" text: str\n",
" kind: str\n",
" metadata: Dict[str, Any]\n",
"\n",
"def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
" m = node.meta\n",
" s = getattr(m, \"start_pos\", None)\n",
" e = getattr(m, \"end_pos\", None)\n",
" if s is None or e is None:\n",
" return None\n",
" return s, e\n",
"\n",
"def _iter_trees(t: Tree):\n",
" yield t\n",
" for c in t.children:\n",
" if isinstance(c, Tree):\n",
" yield from _iter_trees(c)\n",
"\n",
"def _cmd_name(line: str) -> Optional[str]:\n",
" m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
" return m.group(1) if m else None\n",
"\n",
"def chunk_atomic_lines(code: str) -> List[Chunk]:\n",
" tree = parser.parse(code)\n",
" chunks: List[Chunk] = []\n",
"\n",
" for node in _iter_trees(tree):\n",
" if node.data == \"stmt_line\":\n",
" sp = _span(node)\n",
" if not sp:\n",
" continue\n",
" s, e = sp\n",
" text = code[s:e].strip()\n",
" if not text:\n",
" continue\n",
"\n",
" chunks.append(\n",
" Chunk(\n",
" text=text,\n",
" kind=\"line\",\n",
" metadata={\n",
" \"granularity\": \"atomic\",\n",
" \"command\": _cmd_name(text)\n",
" }\n",
" )\n",
" )\n",
" return chunks\n",
"\n",
"def chunk_blocks(code: str) -> List[Chunk]:\n",
" tree = parser.parse(code)\n",
" chunks: List[Chunk] = []\n",
"\n",
" for node in _iter_trees(tree):\n",
" if node.data in (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\"):\n",
" sp = _span(node)\n",
" if not sp:\n",
" continue\n",
" s, e = sp\n",
" text = code[s:e].strip()\n",
" if not text:\n",
" continue\n",
"\n",
" chunks.append(\n",
" Chunk(\n",
" text=text,\n",
" kind=node.data,\n",
" metadata={\"granularity\": \"block\"}\n",
" )\n",
" )\n",
" return chunks\n",
"\n",
"def chunk_avap_code(code: str) -> List[Chunk]:\n",
" # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
" blocks = chunk_blocks(code)\n",
" lines = chunk_atomic_lines(code)\n",
" return blocks + lines"
]
},
{
"cell_type": "markdown",
"id": "23a92e13",
"metadata": {},
"source": [
"# BNF "
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "19253100",
"metadata": {},
"outputs": [],
"source": [
"code = \"\"\"\n",
" addVar(base, 1000)\n",
" addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\n",
" addResult(copia)\n",
"\"\"\"\n",
"tree = parser.parse(code)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "04bf9223",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'program\\n simple_stmt\\t addVar(base, 1000)\\n simple_stmt\\t addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\\n simple_stmt\\t addResult(copia)\\n'"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tree.pretty()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b2999a98",
"metadata": {},
"outputs": [],
"source": [
"chunks = chunk_avap_code(code)\n",
"\n",
"for c in chunks:\n",
" print(\"----\")\n",
" print(\"TYPE:\", c.kind)\n",
" print(\"TEXT:\\n\", c.text)\n",
" print(\"META:\", c.metadata)"
]
},
{
"cell_type": "markdown",
"id": "77f6c552",
"metadata": {},
"source": [
"## Elastic Search"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "09ce3e29",
"metadata": {},
"outputs": [],
"source": [
"es = Elasticsearch(\n",
" ELASTICSEARCH_URL,\n",
" request_timeout=120,\n",
" max_retries=5,\n",
" retry_on_timeout=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "d575c386",
"metadata": {},
"outputs": [],
"source": [
"if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
" es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "40ea0af8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"avap-code\n",
"avap-docs-test\n"
]
}
],
"source": [
"for index in es.indices.get(index=\"*\"):\n",
" print(index)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4e091b39",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"embeddings"
]
},
{
 "cell_type": "code",
 "execution_count": 55,
 "id": "5aff21c0",
 "metadata": {},
 "outputs": [],
 "source": [
  "# `code_chunks` was undefined in the saved notebook (it only existed in the\n",
  "# live session), so a top-to-bottom run failed with NameError here.\n",
  "# Rebuild it from `chunks` (the Chunk dataclasses produced by\n",
  "# chunk_avap_code above), converting each to a LangChain Document,\n",
  "# which is what ElasticsearchStore.from_documents expects.\n",
  "code_chunks = [\n",
  "    Document(page_content=c.text, metadata={**c.metadata, \"kind\": c.kind})\n",
  "    for c in chunks\n",
  "]\n",
  "\n",
  "# index into Elasticsearch\n",
  "db = ElasticsearchStore.from_documents(\n",
  "    code_chunks,\n",
  "    embeddings,\n",
  "    client=es,\n",
  "    index_name=ELASTICSEARCH_CODE_INDEX,\n",
  "    distance_strategy=\"COSINE\",\n",
  ")"
 ]
},
},
{
"cell_type": "code",
"execution_count": null,
"id": "74c0a377",
"metadata": {},
"outputs": [],
"source": [
"response = es.search(\n",
" index=ELASTICSEARCH_CODE_INDEX,\n",
" body={\n",
" \"query\": {\"match_all\": {}},\n",
" \"size\": 10 \n",
" }\n",
")\n",
"\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
" print(\"ID:\", hit[\"_id\"])\n",
" print(\"Source:\", hit[\"_source\"])\n",
" print(\"-\" * 40)"
]
},
{
"cell_type": "markdown",
"id": "d823650e",
"metadata": {},
"source": [
"# Retrieve"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5732a27d",
"metadata": {},
"outputs": [],
"source": [
"base_retriever = db.as_retriever(\n",
" search_type=\"similarity\",\n",
" search_kwargs={\"k\": 5}\n",
" ) \n",
"\n",
"docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8706506f",
"metadata": {},
"outputs": [],
"source": [
"embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"\n",
"vector_store = ElasticsearchStore(\n",
" client=es,\n",
" index_name=ELASTICSEARCH_DOCS_INDEX,\n",
" embedding=embeddings,\n",
" query_field=\"text\",\n",
" vector_query_field=\"vector\",\n",
")\n",
"\n",
"results = vector_store.similarity_search_with_score(\n",
" query=\"What data types does AVAP have?\",\n",
" k=50\n",
")\n",
"\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -10,6 +10,8 @@ OLLAMA_URL=os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_LOCAL_URL=os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434")
OLLAMA_MODEL_NAME=os.getenv("OLLAMA_MODEL_NAME", "qwen3-0.6B:latest")
OLLAMA_EMB_MODEL_NAME=os.getenv("OLLAMA_EMB_MODEL_NAME", "qwen3-0.6B-emb:latest")
ELASTICSEARCH_DOCS_INDEX = os.getenv("ELASTICSEARCH_DOCS_INDEX")
ELASTICSEARCH_CODE_INDEX = os.getenv("ELASTICSEARCH_CODE_INDEX")
LANGFUSE_HOST=os.getenv("LANGFUSE_HOST", "http://45.77.119.180")
LANGFUSE_PUBLIC_KEY=os.getenv("LANGFUSE_PUBLIC_KEY", "pk-lf-0e6db694-3e95-4dd4-aedf-5a2694267058")