assistance-engine/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb

490 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0a8abbfa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import re\n",
"import uuid\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Any, Dict, List, Optional, Tuple\n",
"import nltk\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_ollama import OllamaEmbeddings\n",
"from lark import Lark, Token, Transformer, Tree\n",
"from transformers import AutoConfig\n",
"\n",
"from src.config import settings\n",
"\n",
"nltk.download(\"punkt\", quiet=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5c9d292b",
"metadata": {},
"outputs": [],
"source": [
"# Only the model configuration is loaded; its hidden size is used here as the\n",
"# embedding dimension.\n",
"config = AutoConfig.from_pretrained(\n",
"    settings.hf_emb_model_name,\n",
")\n",
"embedding_dim = config.hidden_size"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d2009c2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"qwen3.5:2b\n"
]
}
],
"source": [
"# Show which Ollama model name the project settings resolve to.\n",
"print(settings.ollama_model_name)"
]
},
{
"cell_type": "markdown",
"id": "baa779f3",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "26927d0c",
"metadata": {},
"outputs": [],
"source": [
"def bnf_to_lark(bnf_text: str) -> str:\n",
"    \"\"\"Convert BNF-style grammar text into Lark-style rule syntax.\n",
"\n",
"    Angle brackets around names are dropped (``<expr>`` -> ``expr``) and\n",
"    every ``::=`` is replaced with ``:``. No other rewriting is performed.\n",
"    \"\"\"\n",
"    without_brackets = re.sub(r\"<([^>]+)>\", r\"\\1\", bnf_text)\n",
"    return without_brackets.replace(\"::=\", \":\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89be8bf6",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class Chunk:\n",
"    \"\"\"A piece of source text cut out of parsed code, plus descriptive fields.\"\"\"\n",
"\n",
"    # Stripped source text covered by the chunk.\n",
"    text: str\n",
"    # \"line\" for single statements, or the grammar rule name for blocks.\n",
"    kind: str\n",
"    # Extra fields set by the chunkers (e.g. \"granularity\", \"command\").\n",
"    metadata: Dict[str, Any]\n",
"\n",
"def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
"    \"\"\"Return the ``(start_pos, end_pos)`` offsets stored on ``node.meta``.\n",
"\n",
"    Returns None when either attribute is absent or set to None.\n",
"    \"\"\"\n",
"    meta = node.meta\n",
"    start = getattr(meta, \"start_pos\", None)\n",
"    end = getattr(meta, \"end_pos\", None)\n",
"    return None if start is None or end is None else (start, end)\n",
"\n",
"def _iter_trees(t: Tree):\n",
"    \"\"\"Yield ``t`` and then every nested Tree, parents before their children.\n",
"\n",
"    Children that are not Tree instances (e.g. tokens) are skipped.\n",
"    \"\"\"\n",
"    yield t\n",
"    subtrees = (child for child in t.children if isinstance(child, Tree))\n",
"    for subtree in subtrees:\n",
"        yield from _iter_trees(subtree)\n",
"\n",
"def _cmd_name(line: str) -> Optional[str]:\n",
"    \"\"\"Return the identifier that directly precedes ``(`` at the start of ``line``.\n",
"\n",
"    Leading whitespace, and whitespace between the name and ``(``, is allowed.\n",
"    Returns None when the line does not begin with such a ``name(`` pattern.\n",
"    \"\"\"\n",
"    call_head = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
"    if call_head is None:\n",
"        return None\n",
"    return call_head.group(1)\n",
"\n",
"# Grammar rules whose subtrees are emitted as block-level chunks.\n",
"_BLOCK_RULES = (\n",
"    \"if_block\",\n",
"    \"loop_block\",\n",
"    \"try_block\",\n",
"    \"go_async_block\",\n",
"    \"function_block\",\n",
")\n",
"\n",
"\n",
"def _node_text(code: str, node: Tree) -> Optional[str]:\n",
"    \"\"\"Return the stripped slice of ``code`` covered by ``node``.\n",
"\n",
"    Returns None when the node has no usable span or the slice is blank.\n",
"    \"\"\"\n",
"    span = _span(node)\n",
"    if span is None:\n",
"        return None\n",
"    start, end = span\n",
"    return code[start:end].strip() or None\n",
"\n",
"\n",
"def _atomic_chunks(code: str, tree: Tree) -> List[Chunk]:\n",
"    \"\"\"Build one \"line\" chunk per ``stmt_line`` node of an already-parsed tree.\"\"\"\n",
"    chunks: List[Chunk] = []\n",
"    for node in _iter_trees(tree):\n",
"        if node.data != \"stmt_line\":\n",
"            continue\n",
"        text = _node_text(code, node)\n",
"        if text is None:\n",
"            continue\n",
"        chunks.append(\n",
"            Chunk(\n",
"                text=text,\n",
"                kind=\"line\",\n",
"                metadata={\n",
"                    \"granularity\": \"atomic\",\n",
"                    \"command\": _cmd_name(text),\n",
"                },\n",
"            )\n",
"        )\n",
"    return chunks\n",
"\n",
"\n",
"def _block_chunks(code: str, tree: Tree) -> List[Chunk]:\n",
"    \"\"\"Build one chunk per node whose rule is in ``_BLOCK_RULES``.\"\"\"\n",
"    chunks: List[Chunk] = []\n",
"    for node in _iter_trees(tree):\n",
"        if node.data not in _BLOCK_RULES:\n",
"            continue\n",
"        text = _node_text(code, node)\n",
"        if text is None:\n",
"            continue\n",
"        chunks.append(\n",
"            Chunk(text=text, kind=node.data, metadata={\"granularity\": \"block\"})\n",
"        )\n",
"    return chunks\n",
"\n",
"\n",
"def chunk_atomic_lines(code: str) -> List[Chunk]:\n",
"    \"\"\"Parse ``code`` and return one chunk per ``stmt_line`` node.\n",
"\n",
"    Uses the notebook-level ``parser``, which must be created before this is\n",
"    called. Errors raised by ``parser.parse`` propagate to the caller.\n",
"    \"\"\"\n",
"    return _atomic_chunks(code, parser.parse(code))\n",
"\n",
"\n",
"def chunk_blocks(code: str) -> List[Chunk]:\n",
"    \"\"\"Parse ``code`` and return one chunk per block-level node.\n",
"\n",
"    Uses the notebook-level ``parser``, which must be created before this is\n",
"    called. Errors raised by ``parser.parse`` propagate to the caller.\n",
"    \"\"\"\n",
"    return _block_chunks(code, parser.parse(code))\n",
"\n",
"\n",
"def chunk_avap_code(code: str) -> List[Chunk]:\n",
"    \"\"\"Return block-level chunks followed by line-level chunks for ``code``.\n",
"\n",
"    The source is parsed once and the tree is shared by both passes.\n",
"    \"\"\"\n",
"    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
"    tree = parser.parse(code)\n",
"    return _block_chunks(code, tree) + _atomic_chunks(code, tree)"
]
},
{
"cell_type": "markdown",
"id": "23a92e13",
"metadata": {},
"source": [
"# BNF "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26ad9c81",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"line 9 : syntax error at or before | = |\n",
"syntax error at end of file (missing ; ?)\n"
]
},
{
"data": {
"text/plain": [
"({}, None)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grammar_ = (settings.data_path / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n",
"    encoding=\"utf-8\"\n",
")\n",
"# `grammar` must be grammar *text*: it is what the parser cell passes to\n",
"# Lark(grammar=...). Convert the BNF notation with bnf_to_lark() first.\n",
"grammar = bnf_to_lark(grammar_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cdf69c4",
"metadata": {},
"outputs": [],
"source": [
"# propagate_positions=True makes Lark record start_pos / end_pos on node.meta,\n",
"# which _span() reads to slice chunk text out of the source.\n",
"parser = Lark(\n",
"    grammar=grammar,\n",
"    parser=\"lalr\",\n",
"    propagate_positions=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "19253100",
"metadata": {},
"outputs": [
{
"ename": "UnexpectedToken",
"evalue": "Unexpected token Token('COMMAND', '(base, 1000)') at line 2, column 11.\nExpected one of: \n\t* EQUAL\nPrevious tokens: [Token('NAME', 'addVar')]\n",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mUnexpectedCharacters\u001b[39m Traceback (most recent call last)",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:689\u001b[39m, in \u001b[36mContextualLexer.lex\u001b[39m\u001b[34m(self, lexer_state, parser_state)\u001b[39m\n\u001b[32m 688\u001b[39m lexer = \u001b[38;5;28mself\u001b[39m.lexers[parser_state.position]\n\u001b[32m--> \u001b[39m\u001b[32m689\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mlexer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnext_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlexer_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparser_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 690\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mEOFError\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:622\u001b[39m, in \u001b[36mBasicLexer.next_token\u001b[39m\u001b[34m(self, lex_state, parser_state)\u001b[39m\n\u001b[32m 621\u001b[39m allowed = {\u001b[33m\"\u001b[39m\u001b[33m<END-OF-FILE>\u001b[39m\u001b[33m\"\u001b[39m}\n\u001b[32m--> \u001b[39m\u001b[32m622\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,\n\u001b[32m 623\u001b[39m allowed=allowed, token_history=lex_state.last_token \u001b[38;5;129;01mand\u001b[39;00m [lex_state.last_token],\n\u001b[32m 624\u001b[39m state=parser_state, terminals_by_name=\u001b[38;5;28mself\u001b[39m.terminals_by_name)\n\u001b[32m 626\u001b[39m value, type_ = res\n",
"\u001b[31mUnexpectedCharacters\u001b[39m: No terminal matches '(' in the current parser context, at line 2 col 11\n\n addVar(base, 1000)\n ^\nExpected one of: \n\t* RPAR\n\t* EQUAL\n\nPrevious tokens: Token('NAME', 'addVar')\n",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[31mUnexpectedToken\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 1\u001b[39m code = \u001b[33m\"\"\"\u001b[39m\n\u001b[32m 2\u001b[39m \u001b[33m addVar(base, 1000)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[33m addVar(copia, $base)\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[33m addResult(copia)\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[33m\"\"\"\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m tree = \u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lark.py:677\u001b[39m, in \u001b[36mLark.parse\u001b[39m\u001b[34m(self, text, start, on_error)\u001b[39m\n\u001b[32m 675\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.options.parser != \u001b[33m'\u001b[39m\u001b[33mlalr\u001b[39m\u001b[33m'\u001b[39m:\n\u001b[32m 676\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mThe on_error option is only implemented for the LALR(1) parser.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m677\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstart\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m=\u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parser_frontends.py:131\u001b[39m, in \u001b[36mParsingFrontend.parse\u001b[39m\u001b[34m(self, text, start, on_error)\u001b[39m\n\u001b[32m 129\u001b[39m kw = {} \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m {\u001b[33m'\u001b[39m\u001b[33mon_error\u001b[39m\u001b[33m'\u001b[39m: on_error}\n\u001b[32m 130\u001b[39m stream = \u001b[38;5;28mself\u001b[39m._make_lexer_thread(text)\n\u001b[32m--> \u001b[39m\u001b[32m131\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchosen_start\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:42\u001b[39m, in \u001b[36mLALR_Parser.parse\u001b[39m\u001b[34m(self, lexer, start, on_error)\u001b[39m\n\u001b[32m 40\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m, lexer, start, on_error=\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m 41\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m42\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m UnexpectedInput \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:88\u001b[39m, in \u001b[36m_Parser.parse\u001b[39m\u001b[34m(self, lexer, start, value_stack, state_stack, start_interactive)\u001b[39m\n\u001b[32m 86\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m start_interactive:\n\u001b[32m 87\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m InteractiveParser(\u001b[38;5;28mself\u001b[39m, parser_state, parser_state.lexer)\n\u001b[32m---> \u001b[39m\u001b[32m88\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparse_from_state\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparser_state\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:111\u001b[39m, in \u001b[36m_Parser.parse_from_state\u001b[39m\u001b[34m(self, state, last_token)\u001b[39m\n\u001b[32m 109\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mNameError\u001b[39;00m:\n\u001b[32m 110\u001b[39m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 112\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.debug:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:100\u001b[39m, in \u001b[36m_Parser.parse_from_state\u001b[39m\u001b[34m(self, state, last_token)\u001b[39m\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 99\u001b[39m token = last_token\n\u001b[32m--> \u001b[39m\u001b[32m100\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlexer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlex\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 101\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01massert\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\n\u001b[32m 102\u001b[39m \u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfeed_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:698\u001b[39m, in \u001b[36mContextualLexer.lex\u001b[39m\u001b[34m(self, lexer_state, parser_state)\u001b[39m\n\u001b[32m 696\u001b[39m last_token = lexer_state.last_token \u001b[38;5;66;03m# Save last_token. Calling root_lexer.next_token will change this to the wrong token\u001b[39;00m\n\u001b[32m 697\u001b[39m token = \u001b[38;5;28mself\u001b[39m.root_lexer.next_token(lexer_state, parser_state)\n\u001b[32m--> \u001b[39m\u001b[32m698\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=\u001b[38;5;28mself\u001b[39m.root_lexer.terminals_by_name)\n\u001b[32m 699\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m UnexpectedCharacters:\n\u001b[32m 700\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
"\u001b[31mUnexpectedToken\u001b[39m: Unexpected token Token('COMMAND', '(base, 1000)') at line 2, column 11.\nExpected one of: \n\t* EQUAL\nPrevious tokens: [Token('NAME', 'addVar')]\n"
]
}
],
"source": [
"# Small sample program used to smoke-test the parser.\n",
"# NOTE(review): the saved output of this cell is an UnexpectedToken error at\n",
"# `addVar(` -- the grammar in use on the last run did not accept this call\n",
"# syntax, so `tree` was never assigned for the cells that follow.\n",
"code = \"\"\"\n",
" addVar(base, 1000)\n",
" addVar(copia, $base)\n",
" addResult(copia)\n",
"\"\"\"\n",
"tree = parser.parse(code)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "04bf9223",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tree' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[19]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtree\u001b[49m.pretty()\n",
"\u001b[31mNameError\u001b[39m: name 'tree' is not defined"
]
}
],
"source": [
"# pretty() returns a multi-line string; print it so the tree renders with real\n",
"# line breaks instead of an escaped one-line repr.\n",
"print(tree.pretty())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b2999a98",
"metadata": {},
"outputs": [],
"source": [
"chunks = chunk_avap_code(code)\n",
"\n",
"# Dump every chunk so block-level and line-level results can be compared by eye.\n",
"for chunk in chunks:\n",
"    print(\"----\")\n",
"    print(\"TYPE:\", chunk.kind)\n",
"    print(\"TEXT:\\n\", chunk.text)\n",
"    print(\"META:\", chunk.metadata)"
]
},
{
"cell_type": "markdown",
"id": "77f6c552",
"metadata": {},
"source": [
"## Elasticsearch"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "09ce3e29",
"metadata": {},
"outputs": [],
"source": [
"# Elasticsearch client shared by the cells below; retries on timeout (up to 5)\n",
"# with a 120 s per-request timeout.\n",
"# NOTE(review): ELASTICSEARCH_URL is not defined in any visible cell of this\n",
"# notebook -- it must be set (or read from `settings`) before this cell runs.\n",
"es = Elasticsearch(\n",
" ELASTICSEARCH_URL,\n",
" request_timeout=120,\n",
" max_retries=5,\n",
" retry_on_timeout=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "d575c386",
"metadata": {},
"outputs": [],
"source": [
"# Start from a clean slate: drop the code index if an earlier run created it.\n",
"code_index_exists = es.indices.exists(index=ELASTICSEARCH_CODE_INDEX)\n",
"if code_index_exists:\n",
"    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "40ea0af8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"avap-code\n",
"avap-docs-test\n"
]
}
],
"source": [
"# List the name of every index currently present on the cluster.\n",
"for index_name in es.indices.get(index=\"*\"):\n",
"    print(index_name)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4e091b39",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Embedding client used when indexing the code chunks.\n",
"embeddings = OllamaEmbeddings(\n",
"    base_url=OLLAMA_LOCAL_URL,\n",
"    model=OLLAMA_EMB_MODEL_NAME,\n",
")\n",
"embeddings"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "5aff21c0",
"metadata": {},
"outputs": [],
"source": [
"# index into Elasticsearch\n",
"db = ElasticsearchStore.from_documents(\n",
" code_chunks,\n",
" embeddings,\n",
" client=es,\n",
" index_name=ELASTICSEARCH_CODE_INDEX,\n",
" distance_strategy=\"COSINE\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74c0a377",
"metadata": {},
"outputs": [],
"source": [
"# Peek at up to 10 stored documents to confirm that indexing worked.\n",
"match_all_request = {\"query\": {\"match_all\": {}}, \"size\": 10}\n",
"response = es.search(index=ELASTICSEARCH_CODE_INDEX, body=match_all_request)\n",
"\n",
"separator = \"-\" * 40\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
"    print(\"ID:\", hit[\"_id\"])\n",
"    print(\"Source:\", hit[\"_source\"])\n",
"    print(separator)"
]
},
{
"cell_type": "markdown",
"id": "d823650e",
"metadata": {},
"source": [
"# Retrieve"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5732a27d",
"metadata": {},
"outputs": [],
"source": [
"# Similarity retriever over the code index store; returns 5 hits per query.\n",
"base_retriever = db.as_retriever(\n",
"    search_type=\"similarity\",\n",
"    search_kwargs={\"k\": 5},\n",
")\n",
"\n",
"docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8706506f",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this cell uses OLLAMA_URL while the earlier embeddings cell uses\n",
"# OLLAMA_LOCAL_URL, and neither constant is defined in the visible cells --\n",
"# confirm which endpoint is intended.\n",
"embeddings = OllamaEmbeddings(\n",
"    base_url=OLLAMA_URL,\n",
"    model=OLLAMA_EMB_MODEL_NAME,\n",
")\n",
"\n",
"# Store bound to the docs index through the shared `es` client.\n",
"vector_store = ElasticsearchStore(\n",
"    client=es,\n",
"    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
"    embedding=embeddings,\n",
"    query_field=\"text\",\n",
"    vector_query_field=\"vector\",\n",
")\n",
"\n",
"results = vector_store.similarity_search_with_score(\n",
"    query=\"What data types does AVAP have?\",\n",
"    k=50,\n",
")\n",
"\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}