assistance-engine/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb

477 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0a8abbfa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import re\n",
"import uuid\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Any, Dict, List, Optional, Tuple\n",
"from bnf import grammar\n",
"import nltk\n",
"from elasticsearch import Elasticsearch\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_ollama import OllamaEmbeddings\n",
"from lark import Lark, Token, Transformer, Tree\n",
"from transformers import AutoConfig\n",
"\n",
"from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n",
" ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n",
" ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n",
" OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n",
" OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n",
"\n",
"nltk.download(\"punkt\", quiet=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5c9d292b",
"metadata": {},
"outputs": [],
"source": [
"# Load the Hugging Face config of the embedding model and read its hidden size.\n",
"config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n",
"embedding_dim = config.hidden_size"
]
},
{
"cell_type": "markdown",
"id": "baa779f3",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "26927d0c",
"metadata": {},
"outputs": [],
"source": [
"def bnf_to_lark(bnf_text):\n",
"    \"\"\"Rewrite BNF-style grammar text into Lark-style rule syntax.\n",
"\n",
"    Two textual rewrites are applied:\n",
"\n",
"    * ``<rule name>`` references lose their angle brackets, and runs of\n",
"      spaces or hyphens inside the name become ``_`` because Lark\n",
"      identifiers cannot contain them.\n",
"    * ``::=`` becomes ``:``.\n",
"\n",
"    Only angle brackets that wrap an identifier-like name are rewritten, so\n",
"    quoted literals that contain ``<`` or ``>`` are left untouched.\n",
"\n",
"    Args:\n",
"        bnf_text: Grammar text written as ``<name> ::= ...`` productions.\n",
"\n",
"    Returns:\n",
"        The rewritten grammar text.\n",
"    \"\"\"\n",
"    def _lark_name(match):\n",
"        # Collapse separators that are legal in BNF names but not in Lark names.\n",
"        return re.sub(r\"[ -]+\", \"_\", match.group(1).strip())\n",
"\n",
"    # Require a letter/underscore right after '<' so comparison operators\n",
"    # inside quoted terminals are never mistaken for rule references.\n",
"    text = re.sub(r\"<([A-Za-z_][A-Za-z0-9_ -]*)>\", _lark_name, bnf_text)\n",
"    return text.replace(\"::=\", \":\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "89be8bf6",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class Chunk:\n",
"    \"\"\"A piece of source text cut out of a parsed program, plus its labels.\"\"\"\n",
"\n",
"    # Source text of the chunk; the chunkers strip surrounding whitespace.\n",
"    text: str\n",
"    # \"line\" for statement chunks, otherwise the grammar rule name of the block.\n",
"    kind: str\n",
"    # Extra attributes set by the chunker (e.g. \"granularity\", \"command\").\n",
"    metadata: Dict[str, Any]\n",
"\n",
"def _span(node: Tree) -> Optional[Tuple[int, int]]:\n",
"    \"\"\"Return the ``(start_pos, end_pos)`` pair stored on ``node.meta``.\n",
"\n",
"    ``None`` is returned when either attribute is absent or set to ``None``.\n",
"    \"\"\"\n",
"    meta = node.meta\n",
"    start = getattr(meta, \"start_pos\", None)\n",
"    end = getattr(meta, \"end_pos\", None)\n",
"    return None if start is None or end is None else (start, end)\n",
"\n",
"def _iter_trees(t: Tree):\n",
"    \"\"\"Yield ``t`` and every nested ``Tree`` below it, parents before children.\n",
"\n",
"    Children that are not ``Tree`` instances are skipped.\n",
"    \"\"\"\n",
"    pending = [t]\n",
"    while pending:\n",
"        current = pending.pop()\n",
"        yield current\n",
"        # Push subtrees reversed so the left-most one is popped (visited) first.\n",
"        pending.extend(\n",
"            child for child in reversed(current.children) if isinstance(child, Tree)\n",
"        )\n",
"\n",
"def _cmd_name(line: str) -> Optional[str]:\n",
"    \"\"\"Return the identifier that opens ``line`` when it is followed by ``(``.\n",
"\n",
"    Leading whitespace, and whitespace between the identifier and the\n",
"    parenthesis, is ignored. Returns ``None`` when ``line`` does not start\n",
"    that way.\n",
"    \"\"\"\n",
"    match = re.match(r\"^\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\(\", line)\n",
"    if match is None:\n",
"        return None\n",
"    return match.group(1)\n",
"\n",
"# Grammar rule names that are emitted as block-level chunks.\n",
"_BLOCK_RULES = (\"if_block\", \"loop_block\", \"try_block\", \"go_async_block\", \"function_block\")\n",
"\n",
"\n",
"def _rule_texts(tree: Tree, code: str, rules: Tuple[str, ...]):\n",
"    \"\"\"Yield ``(rule_name, text)`` for each subtree of ``tree`` whose rule is in ``rules``.\n",
"\n",
"    ``text`` is the slice of ``code`` covered by the subtree with surrounding\n",
"    whitespace stripped. Subtrees without position info, or whose text is\n",
"    empty after stripping, are skipped.\n",
"    \"\"\"\n",
"    for node in _iter_trees(tree):\n",
"        if node.data not in rules:\n",
"            continue\n",
"        sp = _span(node)\n",
"        if sp is None:\n",
"            continue\n",
"        start, end = sp\n",
"        text = code[start:end].strip()\n",
"        if text:\n",
"            # str() hands callers a plain string in case the parser stores\n",
"            # rule names as a str subclass.\n",
"            yield str(node.data), text\n",
"\n",
"\n",
"def chunk_atomic_lines(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
"    \"\"\"Return one ``\"line\"`` chunk per ``stmt_line`` node found in ``code``.\n",
"\n",
"    Args:\n",
"        code: Source text to chunk.\n",
"        tree: Parse tree of ``code``. When omitted, ``code`` is parsed with the\n",
"            notebook-level ``parser``.\n",
"\n",
"    Returns:\n",
"        Chunks in traversal order, each tagged ``granularity=\"atomic\"`` and\n",
"        carrying the command name found by ``_cmd_name`` (``None`` if absent).\n",
"    \"\"\"\n",
"    if tree is None:\n",
"        tree = parser.parse(code)\n",
"    return [\n",
"        Chunk(\n",
"            text=text,\n",
"            kind=\"line\",\n",
"            metadata={\"granularity\": \"atomic\", \"command\": _cmd_name(text)},\n",
"        )\n",
"        for _, text in _rule_texts(tree, code, (\"stmt_line\",))\n",
"    ]\n",
"\n",
"\n",
"def chunk_blocks(code: str, tree: Optional[Tree] = None) -> List[Chunk]:\n",
"    \"\"\"Return one chunk per block-level node (see ``_BLOCK_RULES``) in ``code``.\n",
"\n",
"    Args:\n",
"        code: Source text to chunk.\n",
"        tree: Parse tree of ``code``. When omitted, ``code`` is parsed with the\n",
"            notebook-level ``parser``.\n",
"\n",
"    Returns:\n",
"        Chunks in traversal order; ``kind`` is the grammar rule name and the\n",
"        metadata is tagged ``granularity=\"block\"``.\n",
"    \"\"\"\n",
"    if tree is None:\n",
"        tree = parser.parse(code)\n",
"    return [\n",
"        Chunk(text=text, kind=rule, metadata={\"granularity\": \"block\"})\n",
"        for rule, text in _rule_texts(tree, code, _BLOCK_RULES)\n",
"    ]\n",
"\n",
"\n",
"def chunk_avap_code(code: str) -> List[Chunk]:\n",
"    \"\"\"Chunk ``code`` at both granularities: block chunks first, then line chunks.\n",
"\n",
"    ``code`` is parsed once and the resulting tree is shared by both passes.\n",
"    \"\"\"\n",
"    # Keep original offsets: do NOT lstrip. Grammar already accepts leading _NL.\n",
"    tree = parser.parse(code)\n",
"    return chunk_blocks(code, tree) + chunk_atomic_lines(code, tree)"
]
},
{
"cell_type": "markdown",
"id": "23a92e13",
"metadata": {},
"source": [
"# BNF "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "26ad9c81",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"line 9 : syntax error at or before | = |\n",
"syntax error at end of file (missing ; ?)\n"
]
},
{
"data": {
"text/plain": [
"({}, None)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the raw BNF grammar text from DATA_DIR/raw/code/BNF_v1.txt.\n",
"grammar_ = (DATA_DIR / \"raw\" / \"code\" / \"BNF_v1.txt\").read_text(\n",
"    encoding=\"utf-8\"\n",
")\n",
"# `grammar` is the callable imported from the `bnf` package; its return value\n",
"# is shown as the cell output.\n",
"# NOTE(review): presumably it parses/validates the BNF text -- confirm against\n",
"# the bnf package; the recorded run reports syntax errors for this file.\n",
"grammar(grammar_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cdf69c4",
"metadata": {},
"outputs": [],
"source": [
"# Lark needs grammar *text*; the name `grammar` is the function imported from\n",
"# `bnf`, so build the parser from the BNF file converted to Lark syntax.\n",
"# NOTE(review): assumes the converted text is a complete Lark grammar\n",
"# (terminals, `start` rule) -- confirm against BNF_v1.txt.\n",
"# propagate_positions=True records the start/end offsets that _span() reads.\n",
"parser = Lark(grammar=bnf_to_lark(grammar_), parser=\"lalr\", propagate_positions=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "19253100",
"metadata": {},
"outputs": [
{
"ename": "UnexpectedToken",
"evalue": "Unexpected token Token('COMMAND', '(base, 1000)') at line 2, column 11.\nExpected one of: \n\t* EQUAL\nPrevious tokens: [Token('NAME', 'addVar')]\n",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mUnexpectedCharacters\u001b[39m Traceback (most recent call last)",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:689\u001b[39m, in \u001b[36mContextualLexer.lex\u001b[39m\u001b[34m(self, lexer_state, parser_state)\u001b[39m\n\u001b[32m 688\u001b[39m lexer = \u001b[38;5;28mself\u001b[39m.lexers[parser_state.position]\n\u001b[32m--> \u001b[39m\u001b[32m689\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mlexer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnext_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlexer_state\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparser_state\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 690\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mEOFError\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:622\u001b[39m, in \u001b[36mBasicLexer.next_token\u001b[39m\u001b[34m(self, lex_state, parser_state)\u001b[39m\n\u001b[32m 621\u001b[39m allowed = {\u001b[33m\"\u001b[39m\u001b[33m<END-OF-FILE>\u001b[39m\u001b[33m\"\u001b[39m}\n\u001b[32m--> \u001b[39m\u001b[32m622\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,\n\u001b[32m 623\u001b[39m allowed=allowed, token_history=lex_state.last_token \u001b[38;5;129;01mand\u001b[39;00m [lex_state.last_token],\n\u001b[32m 624\u001b[39m state=parser_state, terminals_by_name=\u001b[38;5;28mself\u001b[39m.terminals_by_name)\n\u001b[32m 626\u001b[39m value, type_ = res\n",
"\u001b[31mUnexpectedCharacters\u001b[39m: No terminal matches '(' in the current parser context, at line 2 col 11\n\n addVar(base, 1000)\n ^\nExpected one of: \n\t* RPAR\n\t* EQUAL\n\nPrevious tokens: Token('NAME', 'addVar')\n",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[31mUnexpectedToken\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 1\u001b[39m code = \u001b[33m\"\"\"\u001b[39m\n\u001b[32m 2\u001b[39m \u001b[33m addVar(base, 1000)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[33m addVar(copia, $base)\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[33m addResult(copia)\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[33m\"\"\"\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m tree = \u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lark.py:677\u001b[39m, in \u001b[36mLark.parse\u001b[39m\u001b[34m(self, text, start, on_error)\u001b[39m\n\u001b[32m 675\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.options.parser != \u001b[33m'\u001b[39m\u001b[33mlalr\u001b[39m\u001b[33m'\u001b[39m:\n\u001b[32m 676\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mThe on_error option is only implemented for the LALR(1) parser.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m677\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstart\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m=\u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parser_frontends.py:131\u001b[39m, in \u001b[36mParsingFrontend.parse\u001b[39m\u001b[34m(self, text, start, on_error)\u001b[39m\n\u001b[32m 129\u001b[39m kw = {} \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m {\u001b[33m'\u001b[39m\u001b[33mon_error\u001b[39m\u001b[33m'\u001b[39m: on_error}\n\u001b[32m 130\u001b[39m stream = \u001b[38;5;28mself\u001b[39m._make_lexer_thread(text)\n\u001b[32m--> \u001b[39m\u001b[32m131\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchosen_start\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:42\u001b[39m, in \u001b[36mLALR_Parser.parse\u001b[39m\u001b[34m(self, lexer, start, on_error)\u001b[39m\n\u001b[32m 40\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m, lexer, start, on_error=\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m 41\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m42\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparser\u001b[49m\u001b[43m.\u001b[49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m UnexpectedInput \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m on_error \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:88\u001b[39m, in \u001b[36m_Parser.parse\u001b[39m\u001b[34m(self, lexer, start, value_stack, state_stack, start_interactive)\u001b[39m\n\u001b[32m 86\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m start_interactive:\n\u001b[32m 87\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m InteractiveParser(\u001b[38;5;28mself\u001b[39m, parser_state, parser_state.lexer)\n\u001b[32m---> \u001b[39m\u001b[32m88\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mparse_from_state\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparser_state\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:111\u001b[39m, in \u001b[36m_Parser.parse_from_state\u001b[39m\u001b[34m(self, state, last_token)\u001b[39m\n\u001b[32m 109\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mNameError\u001b[39;00m:\n\u001b[32m 110\u001b[39m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 112\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.debug:\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/parsers/lalr_parser.py:100\u001b[39m, in \u001b[36m_Parser.parse_from_state\u001b[39m\u001b[34m(self, state, last_token)\u001b[39m\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 99\u001b[39m token = last_token\n\u001b[32m--> \u001b[39m\u001b[32m100\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlexer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlex\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 101\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01massert\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\n\u001b[32m 102\u001b[39m \u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfeed_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/VsCodeProjects/assistance-engine/.venv/lib/python3.12/site-packages/lark/lexer.py:698\u001b[39m, in \u001b[36mContextualLexer.lex\u001b[39m\u001b[34m(self, lexer_state, parser_state)\u001b[39m\n\u001b[32m 696\u001b[39m last_token = lexer_state.last_token \u001b[38;5;66;03m# Save last_token. Calling root_lexer.next_token will change this to the wrong token\u001b[39;00m\n\u001b[32m 697\u001b[39m token = \u001b[38;5;28mself\u001b[39m.root_lexer.next_token(lexer_state, parser_state)\n\u001b[32m--> \u001b[39m\u001b[32m698\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=\u001b[38;5;28mself\u001b[39m.root_lexer.terminals_by_name)\n\u001b[32m 699\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m UnexpectedCharacters:\n\u001b[32m 700\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
"\u001b[31mUnexpectedToken\u001b[39m: Unexpected token Token('COMMAND', '(base, 1000)') at line 2, column 11.\nExpected one of: \n\t* EQUAL\nPrevious tokens: [Token('NAME', 'addVar')]\n"
]
}
],
"source": [
"# Small sample program used to exercise the parser.\n",
"code = \"\"\"\n",
"    addVar(base, 1000)\n",
"    addVar(copia, $base)\n",
"    addResult(copia)\n",
"\"\"\"\n",
"tree = parser.parse(code)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "04bf9223",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tree' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[19]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtree\u001b[49m.pretty()\n",
"\u001b[31mNameError\u001b[39m: name 'tree' is not defined"
]
}
],
"source": [
"# pretty() returns a string; print it so the tree renders with real line\n",
"# breaks instead of an escaped repr.\n",
"print(tree.pretty())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b2999a98",
"metadata": {},
"outputs": [],
"source": [
"chunks = chunk_avap_code(code)\n",
"\n",
"# Dump every chunk: its kind, its text and the metadata attached to it.\n",
"for chunk in chunks:\n",
"    print(\"----\")\n",
"    print(\"TYPE:\", chunk.kind)\n",
"    print(\"TEXT:\\n\", chunk.text)\n",
"    print(\"META:\", chunk.metadata)"
]
},
{
"cell_type": "markdown",
"id": "77f6c552",
"metadata": {},
"source": [
"## Elasticsearch"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "09ce3e29",
"metadata": {},
"outputs": [],
"source": [
"# Elasticsearch client shared by the cells below: 120 s request timeout and\n",
"# up to 5 retries, including retries on timeout.\n",
"es = Elasticsearch(\n",
"    ELASTICSEARCH_URL,\n",
"    request_timeout=120,\n",
"    max_retries=5,\n",
"    retry_on_timeout=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "d575c386",
"metadata": {},
"outputs": [],
"source": [
"# Destructive: drop the code index if it already exists.\n",
"if es.indices.exists(index=ELASTICSEARCH_CODE_INDEX):\n",
"    es.indices.delete(index=ELASTICSEARCH_CODE_INDEX)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "40ea0af8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"avap-code\n",
"avap-docs-test\n"
]
}
],
"source": [
"# Print the name of every index returned for the \"*\" pattern.\n",
"for index in es.indices.get(index=\"*\"):\n",
"    print(index)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4e091b39",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OllamaEmbeddings(model='qwen3-0.6B-emb:latest', validate_model_on_init=False, base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Embedding client pointed at OLLAMA_LOCAL_URL; the bare name below displays\n",
"# its configuration as the cell output.\n",
"embeddings = OllamaEmbeddings(base_url=OLLAMA_LOCAL_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"embeddings"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "5aff21c0",
"metadata": {},
"outputs": [],
"source": [
"# index into Elasticsearch\n",
"db = ElasticsearchStore.from_documents(\n",
" code_chunks,\n",
" embeddings,\n",
" client=es,\n",
" index_name=ELASTICSEARCH_CODE_INDEX,\n",
" distance_strategy=\"COSINE\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74c0a377",
"metadata": {},
"outputs": [],
"source": [
"# Fetch up to 10 documents from the code index and print their id and source.\n",
"match_all_body = {\"query\": {\"match_all\": {}}, \"size\": 10}\n",
"response = es.search(index=ELASTICSEARCH_CODE_INDEX, body=match_all_body)\n",
"\n",
"separator = \"-\" * 40\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
"    print(\"ID:\", hit[\"_id\"])\n",
"    print(\"Source:\", hit[\"_source\"])\n",
"    print(separator)"
]
},
{
"cell_type": "markdown",
"id": "d823650e",
"metadata": {},
"source": [
"# Retrieve"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5732a27d",
"metadata": {},
"outputs": [],
"source": [
"# Similarity retriever over `db` (the store built on the code index), asking\n",
"# for 5 results per query.\n",
"base_retriever = db.as_retriever(\n",
"    search_type=\"similarity\",\n",
"    search_kwargs={\"k\": 5}\n",
"    ) \n",
"\n",
"# The bare name below displays the retrieved documents as the cell output.\n",
"docs = base_retriever.invoke(\"What reserved words does AVAP have?\")\n",
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8706506f",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): this rebinds `embeddings` using OLLAMA_URL, whereas the earlier\n",
"# embeddings cell uses OLLAMA_LOCAL_URL -- confirm the switch is intended.\n",
"embeddings = OllamaEmbeddings(base_url=OLLAMA_URL, model=OLLAMA_EMB_MODEL_NAME)\n",
"\n",
"# Store bound to the docs index (ELASTICSEARCH_DOCS_INDEX), not the code index.\n",
"vector_store = ElasticsearchStore(\n",
"    client=es,\n",
"    index_name=ELASTICSEARCH_DOCS_INDEX,\n",
"    embedding=embeddings,\n",
"    query_field=\"text\",\n",
"    vector_query_field=\"vector\",\n",
")\n",
"\n",
"# Ask for the 50 best matches together with their scores.\n",
"results = vector_store.similarity_search_with_score(\n",
"    query=\"What data types does AVAP have?\",\n",
"    k=50\n",
")\n",
"\n",
"results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "assistance-engine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}