{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import uuid\n",
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, List, Optional, Tuple\n",
    "\n",
    "import nltk\n",
    "from elasticsearch import Elasticsearch\n",
    "from langchain_core.documents import Document\n",
    "from langchain_elasticsearch import ElasticsearchStore\n",
    "from langchain_ollama import OllamaEmbeddings\n",
    "from lark import Lark, Token, Transformer, Tree\n",
    "from transformers import AutoConfig\n",
    "\n",
    "from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n",
    "                        ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n",
    "                        ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n",
    "                        OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n",
    "                        OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n",
    "\n",
    "nltk.download(\"punkt\", quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5c9d292b",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n",
    "embedding_dim = config.hidden_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "0e1cd9b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "grammar = (DATA_DIR / \"raw\" / \"code\" / \"EBNF_v2.txt\").read_text(\n",
    "    encoding=\"utf-8\"\n",
    ")\n",
    "code = (DATA_DIR / \"raw\" / \"code\" / \"Code_Snippets_v1.txt\").read_text(\n",
    "    encoding=\"utf-8\"\n",
    ")\n",
    "parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "baa779f3",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89be8bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grammar rule names that are turned into chunks.\n",
    "# NOTE(review): the grammar file is not shown in this notebook. `simple_stmt`\n",
    "# is the rule name visible in the parse tree printed in the BNF section;\n",
    "# `stmt_line` is kept for backward compatibility. Confirm against EBNF_v2.txt.\n",
    "ATOMIC_RULES: Tuple[str, ...] = (\"stmt_line\", \"simple_stmt\")\n",
    "BLOCK_RULES: Tuple[str, ...] = (\n",
    "    \"if_block\",\n",
    "    \"loop_block\",\n",
    "    \"try_block\",\n",
    "    \"go_async_block\",\n",
    "    \"function_block\",\n",
    ")\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class Chunk:\n",
    "    \"\"\"A fragment of source code cut out of the parse tree.\"\"\"\n",
    "\n",
    "    text: str  # stripped source text covered by the node\n",
    "    kind: str  # `line` for atomic statements, else the grammar rule name\n",
    "    metadata: Dict[str, Any]  # extra attributes attached to the chunk\n",
    "\n",
    "\n",
    "def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
    "    \"\"\"Return the node's (start_pos, end_pos), or None if either is missing.\"\"\"\n",
    "    meta = node.meta\n",
    "    start = getattr(meta, \"start_pos\", None)\n",
    "    end = getattr(meta, \"end_pos\", None)\n",
    "    if start is None or end is None:\n",
    "        return None\n",
    "    return start, end\n",
    "\n",
    "\n",
    "def _iter_trees(t: Tree):\n",
    "    \"\"\"Yield `t` and every nested Tree depth-first; non-Tree children are skipped.\"\"\"\n",
    "    yield t\n",
    "    for child in t.children:\n",
    "        if isinstance(child, Tree):\n",
    "            yield from _iter_trees(child)\n",
    "\n",
    "\n",
    "def _cmd_name(line: str) -> Optional[str]:\n",
    "    \"\"\"Return the identifier that opens a call at the start of `line`, else None.\"\"\"\n",
    "    match = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
    "    return match.group(1) if match else None\n",
    "\n",
    "\n",
    "def _stripped_source(node: Tree, code: str) -> Optional[Tuple[int, str]]:\n",
    "    \"\"\"Return (offset, text) for the node's source with outer whitespace removed.\n",
    "\n",
    "    `offset` is the index in `code` where `text` starts. Returns None when the\n",
    "    node carries no position information or covers only whitespace.\n",
    "    \"\"\"\n",
    "    span = _span(node)\n",
    "    if span is None:\n",
    "        return None\n",
    "    start, end = span\n",
    "    raw = code[start:end]\n",
    "    text = raw.strip()\n",
    "    if not text:\n",
    "        return None\n",
    "    return start + len(raw) - len(raw.lstrip()), text\n",
    "\n",
    "\n",
    "def chunk_atomic_lines(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
    "    \"\"\"Return one `line` chunk per atomic statement found in `code`.\n",
    "\n",
    "    Args:\n",
    "        code: Source text; node offsets are used to slice it.\n",
    "        tree: Optional pre-parsed tree for `code`. Parsed with the\n",
    "            module-level `parser` when omitted.\n",
    "\n",
    "    Returns:\n",
    "        Chunks with kind `line`; metadata holds the granularity and the\n",
    "        command name extracted by `_cmd_name` (None when there is none).\n",
    "    \"\"\"\n",
    "    if tree is None:\n",
    "        tree = parser.parse(code)\n",
    "\n",
    "    chunks: List[Chunk] = []\n",
    "    seen: set = set()\n",
    "    for node in _iter_trees(tree):\n",
    "        if node.data not in ATOMIC_RULES:\n",
    "            continue\n",
    "        located = _stripped_source(node, code)\n",
    "        if located is None:\n",
    "            continue\n",
    "        offset, text = located\n",
    "        # Nodes of different rules can cover the same statement; keep the first.\n",
    "        key = (offset, len(text))\n",
    "        if key in seen:\n",
    "            continue\n",
    "        seen.add(key)\n",
    "        chunks.append(\n",
    "            Chunk(\n",
    "                text=text,\n",
    "                kind=\"line\",\n",
    "                metadata={\n",
    "                    \"granularity\": \"atomic\",\n",
    "                    \"command\": _cmd_name(text),\n",
    "                },\n",
    "            )\n",
    "        )\n",
    "    return chunks\n",
    "\n",
    "\n",
    "def chunk_blocks(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
    "    \"\"\"Return one chunk per block node (see BLOCK_RULES) found in `code`.\n",
    "\n",
    "    Args:\n",
    "        code: Source text; node offsets are used to slice it.\n",
    "        tree: Optional pre-parsed tree for `code`. Parsed with the\n",
    "            module-level `parser` when omitted.\n",
    "\n",
    "    Returns:\n",
    "        Chunks whose kind is the matching grammar rule name. Nested blocks\n",
    "        are emitted as well as the blocks that contain them.\n",
    "    \"\"\"\n",
    "    if tree is None:\n",
    "        tree = parser.parse(code)\n",
    "\n",
    "    chunks: List[Chunk] = []\n",
    "    for node in _iter_trees(tree):\n",
    "        if node.data not in BLOCK_RULES:\n",
    "            continue\n",
    "        located = _stripped_source(node, code)\n",
    "        if located is None:\n",
    "            continue\n",
    "        _, text = located\n",
    "        chunks.append(\n",
    "            Chunk(\n",
    "                text=text,\n",
    "                kind=str(node.data),\n",
    "                metadata={\"granularity\": \"block\"},\n",
    "            )\n",
    "        )\n",
    "    return chunks\n",
    "\n",
    "\n",
    "def chunk_avap_code(code: str) -> List[Chunk]:\n",
    "    \"\"\"Return the block chunks followed by the atomic-line chunks of `code`.\"\"\"\n",
    "    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
    "    tree = parser.parse(code)  # parse once and share the tree with both passes\n",
    "    blocks = chunk_blocks(code, tree)\n",
    "    lines = chunk_atomic_lines(code, tree)\n",
    "    return blocks + lines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23a92e13",
   "metadata": {},
   "source": [
    "# BNF\n",
    "\n",
    "Parse a small AVAP sample with the grammar and inspect the resulting tree and chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19253100",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate name so the snippets file loaded into `code` is not overwritten.\n",
    "sample_code = \"\"\"\n",
    " addVar(base, 1000)\n",
    " addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\n",
    " addResult(copia)\n",
    "\"\"\"\n",
    "tree = parser.parse(sample_code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "04bf9223",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'program\\n simple_stmt\\t addVar(base, 1000)\\n simple_stmt\\t addVar(copia, $base) // copia toma el valor 1000, no la cadena \"$base\"\\n simple_stmt\\t addResult(copia)\\n'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tree.pretty()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2999a98",
   "metadata": {},
   "outputs": [],
   "source": [
    "chunks = chunk_avap_code(sample_code)\n",
    "\n",
    "for c in chunks:\n",
    "    print(\"----\")\n",
    "    print(\"TYPE:\", c.kind)\n",
    "    print(\"TEXT:\\n\", c.text)\n",
    "    print(\"META:\", c.metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77f6c552",
   "metadata": {},
   "source": [
    "## Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "09ce3e29",
   "metadata": {},
   "outputs": [],
   "source": [
    "es = Elasticsearch(\n",
    "    ELASTICSEARCH_URL,\n",
    "    request_timeout=120,\n",
    "    max_retries=5,\n",
    "    retry_on_timeout=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "d575c386",
   "metadata": {},
   "outputs": [],
   "source": [
    "if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
    "    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "40ea0af8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "avap-code\n",
      "avap-docs-test\n"
     ]
    }
   ],
   "source": [
    "for index in es.indices.get(index=\"*\"):\n",
    "    print(index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "4e091b39",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
    "embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5aff21c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap every Chunk in a LangChain Document so the vector store can embed it.\n",
    "code_chunks = [\n",
    "    Document(\n",
    "        page_content=chunk.text,\n",
    "        metadata={\"kind\": chunk.kind, **chunk.metadata},\n",
    "    )\n",
    "    for chunk in chunks\n",
    "]\n",
    "if not code_chunks:\n",
    "    raise ValueError(\"No chunks were produced; nothing to index.\")\n",
    "\n",
    "# index into Elasticsearch\n",
    "db = ElasticsearchStore.from_documents(\n",
    "    code_chunks,\n",
    "    embeddings,\n",
    "    client=es,\n",
    "    index_name=ELASTICSEARCH_CODE_INDEX,\n",
    "    distance_strategy=\"COSINE\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74c0a377",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top-level `query`/`size` replace the deprecated `body` argument.\n",
    "response = es.search(\n",
    "    index=ELASTICSEARCH_CODE_INDEX,\n",
    "    query={\"match_all\": {}},\n",
    "    size=10,\n",
    ")\n",
    "\n",
    "for hit in response[\"hits\"][\"hits\"]:\n",
    "    print(\"ID:\", hit[\"_id\"])\n",
    "    print(\"Source:\", hit[\"_source\"])\n",
    "    print(\"-\" * 40)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d823650e",
   "metadata": {},
   "source": [
    "# Retrieve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5732a27d",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_retriever = db.as_retriever(\n",
    "    search_type=\"similarity\",\n",
    "    search_kwargs={\"k\": 5},\n",
    ")\n",
    "\n",
    "docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
    "docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8706506f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate name so the `embeddings` object used for the code index is not rebound.\n",
    "docs_embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
    "\n",
    "vector_store = ElasticsearchStore(\n",
    "    client=es,\n",
    "    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
    "    embedding=docs_embeddings,\n",
    "    query_field=\"text\",\n",
    "    vector_query_field=\"vector\",\n",
    ")\n",
    "\n",
    "results = vector_store.similarity_search_with_score(\n",
    "    query=\"What data types does AVAP have?\",\n",
    "    k=50\n",
    ")\n",
    "\n",
    "results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}