Add initial Jupyter notebook for document ingestion using Ollama embeddings

- Implemented code to utilize OllamaEmbeddings for embedding documents.
- Included example usage with sample text inputs.
- Demonstrated response handling from the Ollama LLM.
- Noted deprecation warning for the Ollama class in LangChain.
This commit is contained in:
pseco 2026-03-10 14:40:27 +01:00
parent a9bf84fa79
commit 4c56dc29c4
10 changed files with 363 additions and 27 deletions

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 1,
"id": "0a8abbfa", "id": "0a8abbfa",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -12,7 +12,7 @@
"True" "True"
] ]
}, },
"execution_count": 9, "execution_count": 1,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -24,7 +24,7 @@
"from dataclasses import dataclass\n", "from dataclasses import dataclass\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"from typing import Any, Dict, List, Optional, Tuple\n", "from typing import Any, Dict, List, Optional, Tuple\n",
"\n", "from bnf import grammar\n",
"import nltk\n", "import nltk\n",
"from elasticsearch import Elasticsearch\n", "from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n", "from langchain_core.documents import Document\n",
@ -44,7 +44,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 2,
"id": "5c9d292b", "id": "5c9d292b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -53,28 +53,6 @@
"embedding_dim = config.hidden_size" "embedding_dim = config.hidden_size"
] ]
}, },
{
"cell_type": "code",
"execution_count": 15,
"id": "0e1cd9b9",
"metadata": {},
"outputs": [],
"source": [
"grammar = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.lark\").read_text(\n",
" encoding=\"utf-8\"\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95e21900",
"metadata": {},
"outputs": [],
"source": [
"parser = Lark(grammar=grammar, parser=\"lalr\", propagate_positions=True)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "baa779f3", "id": "baa779f3",
@ -85,7 +63,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 3,
"id": "26927d0c",
"metadata": {},
"outputs": [],
"source": [
"def bnf_to_lark(bnf_text):\n",
" text = re.sub(r\"<([^>]+)>\", r\"\\1\", bnf_text) # remove <>\n",
" text = text.replace(\"::=\", \":\")\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89be8bf6", "id": "89be8bf6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -178,6 +169,48 @@
"# BNF " "# BNF "
] ]
}, },
{
"cell_type": "code",
"execution_count": 5,
"id": "26ad9c81",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"line 9 : syntax error at or before | = |\n",
"syntax error at end of file (missing ; ?)\n"
]
},
{
"data": {
"text/plain": [
"({}, None)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the raw BNF grammar text; the trailing underscore in `grammar_`\n",
"# avoids shadowing the `grammar` validator imported from bnf.\n",
"grammar_ = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n",
"    encoding=\"utf-8\"\n",
")\n",
"grammar(grammar_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cdf69c4",
"metadata": {},
"outputs": [],
"source": [
"# FIXME(review): `grammar` here is the *function* imported from bnf, not grammar\n",
"# text, so Lark(grammar=grammar, ...) would fail. Build the parser from the BNF\n",
"# text read above, converted to Lark syntax — confirm BNF_v1.txt converts cleanly.\n",
"parser = Lark(grammar=bnf_to_lark(grammar_), parser=\"lalr\", propagate_positions=True)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 18,

View File

@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 51,
"id": "0a8abbfa",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"from dataclasses import dataclass\n",
"\n",
"from typing import Any, Dict, List, Optional, Tuple\n",
"\n",
"from lark import Tree, Lark\n",
"\n",
"\n",
"from bnf import grammar as bnf_grammar, parse as bnf_parse\n",
"from ebnf import grammar as ebnf_grammar, parse as ebnf_parse\n",
"\n",
"from src.config import DATA_DIR"
]
},
{
"cell_type": "markdown",
"id": "baa779f3",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "26927d0c",
"metadata": {},
"outputs": [],
"source": [
"def bnf_to_lark(bnf_text):\n",
"    \"\"\"Convert BNF-style grammar text toward Lark rule syntax.\n",
"\n",
"    Two purely textual rewrites: strip the angle brackets around\n",
"    nonterminal names (<name> -> name) and replace the '::=' definition\n",
"    operator with Lark's ':'. Terminals and everything else are left\n",
"    untouched, so grammars using BNF-only constructs may still need\n",
"    manual adjustment.\n",
"    \"\"\"\n",
"    text = re.sub(r\"<([^>]+)>\", r\"\\1\", bnf_text)  # <name> -> name\n",
"    text = text.replace(\"::=\", \":\")\n",
"    return text"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89be8bf6",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class Chunk:\n",
"    \"\"\"A retrievable piece of source code plus its metadata.\"\"\"\n",
"    text: str\n",
"    kind: str\n",
"    metadata: Dict[str, Any]\n",
"\n",
"# Block-level node types produced by the grammar; consumed by chunk_blocks.\n",
"_BLOCK_KINDS = (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\")\n",
"\n",
"def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
"    \"\"\"Character span (start, end) of a node, or None when the parser\n",
"    did not record positions (requires propagate_positions=True).\"\"\"\n",
"    m = node.meta\n",
"    s = getattr(m, \"start_pos\", None)\n",
"    e = getattr(m, \"end_pos\", None)\n",
"    if s is None or e is None:\n",
"        return None\n",
"    return s, e\n",
"\n",
"def _iter_trees(t: Tree):\n",
"    \"\"\"Depth-first pre-order iteration over t and all its sub-Trees.\"\"\"\n",
"    yield t\n",
"    for c in t.children:\n",
"        if isinstance(c, Tree):\n",
"            yield from _iter_trees(c)\n",
"\n",
"def _cmd_name(line: str) -> Optional[str]:\n",
"    \"\"\"Leading identifier of a call-style line, e.g. 'foo' in 'foo(1)'.\"\"\"\n",
"    m = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
"    return m.group(1) if m else None\n",
"\n",
"def _node_text(code: str, node: Tree) -> Optional[str]:\n",
"    \"\"\"Stripped source slice covered by node; None when empty or unpositioned.\"\"\"\n",
"    sp = _span(node)\n",
"    if not sp:\n",
"        return None\n",
"    s, e = sp\n",
"    text = code[s:e].strip()\n",
"    return text or None\n",
"\n",
"def chunk_atomic_lines(code: str) -> List[Chunk]:\n",
"    \"\"\"One Chunk per statement line (grammar node 'stmt_line').\"\"\"\n",
"    tree = parser.parse(code)  # NOTE: relies on the module-level `parser`\n",
"    chunks: List[Chunk] = []\n",
"    for node in _iter_trees(tree):\n",
"        if node.data != \"stmt_line\":\n",
"            continue\n",
"        text = _node_text(code, node)\n",
"        if text is None:\n",
"            continue\n",
"        chunks.append(\n",
"            Chunk(\n",
"                text=text,\n",
"                kind=\"line\",\n",
"                metadata={\n",
"                    \"granularity\": \"atomic\",\n",
"                    \"command\": _cmd_name(text)\n",
"                }\n",
"            )\n",
"        )\n",
"    return chunks\n",
"\n",
"def chunk_blocks(code: str) -> List[Chunk]:\n",
"    \"\"\"One Chunk per structured block (if/loop/try/go-async/function).\"\"\"\n",
"    tree = parser.parse(code)\n",
"    chunks: List[Chunk] = []\n",
"    for node in _iter_trees(tree):\n",
"        if node.data not in _BLOCK_KINDS:\n",
"            continue\n",
"        text = _node_text(code, node)\n",
"        if text is None:\n",
"            continue\n",
"        chunks.append(\n",
"            Chunk(\n",
"                text=text,\n",
"                kind=node.data,\n",
"                metadata={\"granularity\": \"block\"}\n",
"            )\n",
"        )\n",
"    return chunks\n",
"\n",
"def chunk_avap_code(code: str) -> List[Chunk]:\n",
"    \"\"\"Chunk code at both block and line granularity, blocks first.\n",
"\n",
"    Keep original offsets: do NOT lstrip — the grammar already accepts\n",
"    leading _NL, and node spans index into the original string.\n",
"    \"\"\"\n",
"    blocks = chunk_blocks(code)\n",
"    lines = chunk_atomic_lines(code)\n",
"    return blocks + lines"
]
},
{
"cell_type": "markdown",
"id": "23a92e13",
"metadata": {},
"source": [
"# BNF check"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a10a1017",
"metadata": {},
"outputs": [],
"source": [
"bnf_text = r\"\"\"\n",
"<assign> ::= <name> <num>\n",
"<name> ::= a | b | c\n",
"<num> ::= [0-9]\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "4790023e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"line 1 : syntax error at or before EOL = \n",
"\n",
"line 2 : illegal character '='\n",
"BNF: False\n"
]
}
],
"source": [
"bnf_grammar(bnf_text)\n",
"print(\"BNF:\", bnf_parse(\"a=7\"))"
]
},
{
"cell_type": "markdown",
"id": "49953efd",
"metadata": {},
"source": [
"# BNF conversion to EBNF"
]
},
{
"cell_type": "markdown",
"id": "32dbc2c5",
"metadata": {},
"source": [
"# EBNF Check"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "37968906",
"metadata": {},
"outputs": [],
"source": [
"ebnf_text = r\"\"\"\n",
"assign ::= name '=' num ;\n",
"name ::= 'a' | 'b' | 'c' ;\n",
"num ::= [0-9] ;\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "b234f2c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BNF: True\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating LALR tables\n"
]
}
],
"source": [
"ebnf_grammar(ebnf_text)\n",
"print(\"BNF:\", ebnf_parse(\"a=7\"))"
]
},
{
"cell_type": "markdown",
"id": "66fb8fee",
"metadata": {},
"source": [
"# Lark check EBNF Style"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "08e53ccb",
"metadata": {},
"outputs": [],
"source": [
"ebnf_text = r\"\"\"\n",
"start: assign\n",
"\n",
"assign: name \"=\" num\n",
"name: \"a\" | \"b\" | \"c\"\n",
"num: DIGIT\n",
"\n",
"DIGIT: /[0-9]/\n",
"\n",
"%ignore \" \"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "52935608",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tree(Token('RULE', 'start'), [Tree(Token('RULE', 'assign'), [Tree(Token('RULE', 'name'), []), Tree(Token('RULE', 'num'), [Token('DIGIT', '7')])])])\n"
]
}
],
"source": [
"# Build a parser from the Lark-style grammar and show the parse tree for a sample.\n",
"parser = Lark(ebnf_text)\n",
"tree = parser.parse(\"a=7\")\n",
"print(tree)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}