created code_indexing_improvements research
This commit is contained in:
parent
fe90548b8b
commit
da483c51bb
|
|
@ -0,0 +1,228 @@
|
||||||
|
// Top-level structure: a program is one or more statements/comments separated
// by line breaks. At least one statement or comment is required by `program`.
start: program

program: separator* line_or_comment (separator+ line_or_comment)* separator*

// One logical line: a simple statement with an optional trailing comment,
// a compound statement, a stand-alone comment, or a block comment.
?line_or_comment: simple_stmt comment?
                | compound_stmt
                | comment
                | BLOCK_COMMENT

// One or more consecutive line breaks act as a single separator.
?separator: EOL+

comment: DOC_COMMENT | LINE_COMMENT

EOL: /\r?\n/

// DOC_COMMENT (three slashes) is given a higher terminal priority (.2) than
// LINE_COMMENT (two slashes, .1).
DOC_COMMENT.2: /\/\/\/[^\r\n]*/
LINE_COMMENT.1: /\/\/[^\r\n]*/
// Non-greedy match, so a block comment ends at the first closing delimiter.
BLOCK_COMMENT: /\/\*[\s\S]*?\*\//
|
||||||
|
|
||||||
|
// Statements that fit on a single line. The leading "?" inlines this rule in
// the parse tree when it has a single child.
?simple_stmt: assignment
            | return_stmt
            | system_command
            | io_command
            | async_command
            | connector_cmd
            | db_command
            | http_command
            | util_command
            | modularity_cmd
            | call_stmt

// Statements that contain a nested block of further statements.
?compound_stmt: function_decl
              | if_stmt
              | loop_stmt
              | try_stmt
|
||||||
|
|
||||||
|
// Assignment of any expression to a variable. This also covers assigning the
// result of a (method) call, e.g. `x = obj.method(a, b)`, because `expression`
// accepts "." / "(" postfixes on an identifier.
assignment: identifier "=" expression

// Stand-alone calls whose result is discarded: `f(args)` or `obj.method(args)`.
//
// The former alternative
//     identifier "=" identifier "." identifier "(" argument_list? ")"
// was removed: it shares the prefix `identifier "=" identifier` with
// `assignment`, which creates a shift/reduce conflict on "." in the LALR
// table. Lark resolves such conflicts by shifting, so inputs like `x = a.b`,
// `x = a.b + 1` or `x = a.b(1)[0]` were rejected. Such statements are now
// parsed as `assignment` (the call shows up as `postfix` nodes of the RHS).
call_stmt: identifier "(" argument_list? ")"
         | identifier "." identifier "(" argument_list? ")"
|
||||||
|
|
||||||
|
// ---- System commands -------------------------------------------------------
system_command: register_cmd
              | addvar_cmd

// registerEndpoint takes exactly six arguments: two strings, a list display,
// a string and two identifiers.
register_cmd: "registerEndpoint" "(" stringliteral "," stringliteral "," list_display "," stringliteral "," identifier "," identifier ")"

addvar_cmd: "addVar" "(" addvar_arg "," addvar_arg ")"

// An addVar argument is a plain identifier, a literal, or a "$"-prefixed identifier.
addvar_arg: identifier
          | literal
          | "$" identifier

identifier: IDENTIFIER

// NOTE(review): `system_variable` is not referenced by any other rule in this
// grammar, so it is currently unreachable from `program` - confirm whether it
// should be wired into `atom` or removed.
system_variable: "_status"

// ---- I/O commands ----------------------------------------------------------
io_command: addparam_cmd
          | getlistlen_cmd
          | addresult_cmd
          | getparamlist_cmd

addparam_cmd: "addParam" "(" stringliteral "," identifier ")"
getlistlen_cmd: "getListLen" "(" identifier "," identifier ")"
getparamlist_cmd: "getQueryParamList" "(" stringliteral "," identifier ")"
addresult_cmd: "addResult" "(" identifier ")"
|
||||||
|
|
||||||
|
// ---- Control flow ----------------------------------------------------------
// if(...) <block> [else() <block>] end()
if_stmt: "if" "(" if_condition ")" separator block ("else" "(" ")" separator block)? "end" "(" ")"

// Either two operands followed by a string literal, or None, None followed by
// a string literal.
// NOTE(review): the string presumably carries the comparator / a full
// condition expression - confirm against the AVAP language reference.
if_condition: if_atom "," if_atom "," stringliteral
            | "None" "," "None" "," stringliteral

if_atom: identifier
       | literal

// startLoop(var, expr, expr) <block> endLoop()
loop_stmt: "startLoop" "(" identifier "," expression "," expression ")" separator block "endLoop" "(" ")"

// try() <block> exception(var) <block> end()
try_stmt: "try" "(" ")" separator block "exception" "(" identifier ")" separator block "end" "(" ")"

// Same shape as `program`: at least one statement or comment is required.
block: separator* line_or_comment (separator+ line_or_comment)* separator*
|
||||||
|
|
||||||
|
// ---- Async commands --------------------------------------------------------
async_command: go_stmt
             | gather_stmt

// handle = go fn(args)
go_stmt: identifier "=" "go" identifier "(" argument_list? ")"
// result = gather(handle[, expr])
gather_stmt: identifier "=" "gather" "(" identifier ("," expression)? ")"

// ---- Connectors ------------------------------------------------------------
connector_cmd: connector_instantiation

connector_instantiation: identifier "=" "avapConnector" "(" stringliteral ")"

// ---- HTTP commands ---------------------------------------------------------
http_command: req_post_cmd
            | req_get_cmd

// RequestPost takes six arguments, RequestGet five; in both, the
// second-to-last argument must be an identifier.
req_post_cmd: "RequestPost" "(" expression "," expression "," expression "," expression "," identifier "," expression ")"
req_get_cmd: "RequestGet" "(" expression "," expression "," expression "," identifier "," expression ")"
|
||||||
|
|
||||||
|
// ---- Database (ORM) commands -----------------------------------------------
// Every ORM command ends with an identifier argument.
db_command: orm_direct
          | orm_check
          | orm_create
          | orm_select
          | orm_insert
          | orm_update

orm_direct: "ormDirect" "(" expression "," identifier ")"
orm_check: "ormCheckTable" "(" expression "," identifier ")"
orm_create: "ormCreateTable" "(" expression "," expression "," expression "," identifier ")"

// The third positional expression is optional.
orm_select: "ormAccessSelect" "(" orm_fields "," expression ("," expression)? "," identifier ")"

// Field selection: a bare "*" or any expression.
orm_fields: "*"
          | expression

orm_insert: "ormAccessInsert" "(" expression "," expression "," identifier ")"
orm_update: "ormAccessUpdate" "(" expression "," expression "," expression "," expression "," identifier ")"
|
||||||
|
|
||||||
|
// ---- Utility commands ------------------------------------------------------
// Every utility command ends with an identifier argument.
util_command: json_list_cmd
            | crypto_cmd
            | regex_cmd
            | datetime_cmd
            | stamp_cmd
            | string_cmd
            | replace_cmd

// Several JSON/list helpers share this one rule, so the parse tree label does
// not tell them apart.
json_list_cmd: "variableToList" "(" expression "," identifier ")"
             | "itemFromList" "(" identifier "," expression "," identifier ")"
             | "variableFromJSON" "(" identifier "," expression "," identifier ")"
             | "AddVariableToJSON" "(" expression "," expression "," identifier ")"

crypto_cmd: "encodeSHA256" "(" identifier_or_string "," identifier ")"
          | "encodeMD5" "(" identifier_or_string "," identifier ")"

regex_cmd: "getRegex" "(" identifier "," stringliteral "," identifier ")"

datetime_cmd: "getDateTime" "(" stringliteral "," expression "," stringliteral "," identifier ")"

stamp_cmd: "stampToDatetime" "(" expression "," stringliteral "," expression "," identifier ")"
         | "getTimeStamp" "(" stringliteral "," stringliteral "," expression "," identifier ")"

string_cmd: "randomString" "(" expression "," expression "," identifier ")"

replace_cmd: "replace" "(" identifier_or_string "," stringliteral "," stringliteral "," identifier ")"
|
||||||
|
|
||||||
|
// ---- Functions and modularity ----------------------------------------------
// function name(params) { <line break> <block> }
// A line break is required right after the opening brace.
function_decl: "function" identifier "(" param_list? ")" "{" separator block "}"

param_list: identifier ("," identifier)*

// return() with an optional expression inside the parentheses.
return_stmt: "return" "(" expression? ")"

modularity_cmd: include_stmt
              | import_stmt

include_stmt: "include" stringliteral
// import <name>  or  import "path"
import_stmt: "import" ("<" identifier ">" | stringliteral)
|
||||||
|
|
||||||
|
// ---- Expressions -----------------------------------------------------------
// Precedence is encoded by rule nesting, from loosest to tightest binding:
// or, and, not, comparison, + -, * / %, unary + -, **, postfix, atom.
?expression: logical_or

?logical_or: logical_and ("or" logical_and)*
?logical_and: logical_not ("and" logical_not)*

?logical_not: "not" logical_not
            | comparison

// Comparisons may be chained: a < b < c.
?comparison: arithmetic (comp_op arithmetic)*

comp_op: "==" | "!=" | "<" | ">" | "<=" | ">=" | "in" | "is"

?arithmetic: term (("+" | "-") term)*
?term: factor (("*" | "/" | "%") factor)*

?factor: ("+" | "-") factor
       | power

// The exponent is a `factor`, so it may itself carry a unary sign or another "**".
?power: primary ("**" factor)?

?primary: atom postfix*

// Attribute access, indexing, slicing (up to three parts) and call.
postfix: "." identifier
       | "[" expression "]"
       | "[" expression? ":" expression? (":" expression?)? "]"
       | "(" argument_list? ")"

?atom: identifier
     | "$" identifier
     | literal
     | "(" expression ")"
     | list_display
     | dict_display

// A list literal or a single-"for" list comprehension with an optional filter.
list_display: "[" argument_list? "]"
            | "[" expression "for" identifier "in" expression if_clause? "]"

if_clause: "if" expression

dict_display: "{" key_datum_list? "}"

key_datum_list: key_datum ("," key_datum)*
key_datum: expression ":" expression

// No trailing comma is allowed.
argument_list: expression ("," expression)*
|
||||||
|
|
||||||
|
// ---- Literals and terminals ------------------------------------------------
number: FLOATNUMBER
      | INTEGER

literal: stringliteral
       | number
       | boolean
       | "None"

boolean: "True" | "False"

INTEGER: /[0-9]+/
// Accepts "1.5", "1." and ".5"; no exponent form.
FLOATNUMBER: /(?:[0-9]+\.[0-9]*|\.[0-9]+)/

stringliteral: STRING_DOUBLE
             | STRING_SINGLE

// Earlier, stricter patterns that only allowed a fixed set of escapes:
# STRING_DOUBLE: /"([^"\\]|\\["'\\ntr0])*"/
# STRING_SINGLE: /'([^'\\]|\\["'\\ntr0])*'/
// Current patterns: a backslash may be followed by any character except a newline.
STRING_DOUBLE: /"([^"\\]|\\.)*"/
STRING_SINGLE: /'([^'\\]|\\.)*'/

identifier_or_string: identifier
                    | stringliteral

IDENTIFIER: /[A-Za-z_][A-Za-z0-9_]*/

// Spaces and tabs are insignificant; line breaks are not (see EOL).
%ignore /[ \t]+/
|
||||||
|
|
@ -0,0 +1,371 @@
|
||||||
|
import json
|
||||||
|
from copy import deepcopy
|
||||||
|
from dataclasses import replace
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
from lark import Lark, Tree
|
||||||
|
from chonkie import (
|
||||||
|
Chunk,
|
||||||
|
ElasticHandshake,
|
||||||
|
FileFetcher,
|
||||||
|
MarkdownChef,
|
||||||
|
TextChef,
|
||||||
|
TokenChunker,
|
||||||
|
MarkdownDocument
|
||||||
|
)
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from loguru import logger
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
# Maps a grammar rule name (the `data` of a parse-tree node) to the metadata
# key recorded for a source file that uses that construct. Rules that cover
# several commands (json_list_cmd, crypto_cmd, stamp_cmd) map to one
# collective "*_ops" key.
COMMAND_METADATA_NAMES = {
    # system
    "register_cmd": "registerEndpoint",
    "addvar_cmd": "addVar",
    # io
    "addparam_cmd": "addParam",
    "getlistlen_cmd": "getListLen",
    "getparamlist_cmd": "getQueryParamList",
    "addresult_cmd": "addResult",

    # async
    "go_stmt": "go",
    "gather_stmt": "gather",

    # connector
    "connector_instantiation": "avapConnector",

    # http
    "req_post_cmd": "RequestPost",
    "req_get_cmd": "RequestGet",

    # db
    "orm_direct": "ormDirect",
    "orm_check": "ormCheckTable",
    "orm_create": "ormCreateTable",
    "orm_select": "ormAccessSelect",
    "orm_insert": "ormAccessInsert",
    "orm_update": "ormAccessUpdate",

    # util
    "json_list_cmd": "json_list_ops",
    "crypto_cmd": "crypto_ops",
    "regex_cmd": "getRegex",
    "datetime_cmd": "getDateTime",
    "stamp_cmd": "timestamp_ops",
    "string_cmd": "randomString",
    "replace_cmd": "replace",

    # modularity
    "include_stmt": "include",
    "import_stmt": "import",

    # generic statements
    "assignment": "assignment",
    "call_stmt": "call",
    "return_stmt": "return",
    "if_stmt": "if",
    "loop_stmt": "startLoop",
    "try_stmt": "try",
    "function_decl": "function",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
    """
    Collect the metadata flags for every known construct found in a parse tree.

    Args:
        ast: Parse tree to inspect, or None when parsing failed.

    Returns:
        Dict mapping each metadata name from COMMAND_METADATA_NAMES whose rule
        occurs in the tree to True, with keys in sorted order. Empty when
        ``ast`` is None.
    """
    if ast is None:
        return {}

    # A set removes duplicates when a rule occurs more than once, or when
    # several rules share one metadata name.
    found_names = {
        COMMAND_METADATA_NAMES[node.data]
        for node in ast.iter_subtrees()
        if node.data in COMMAND_METADATA_NAMES
    }

    return dict.fromkeys(sorted(found_names), True)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_text(element) -> str:
|
||||||
|
for attr in ("text", "content", "markdown"):
|
||||||
|
value = getattr(element, attr, None)
|
||||||
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
|
raise AttributeError(
|
||||||
|
f"Could not extract text from element of type {type(element).__name__}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
    """
    Fold code blocks and tables of a Markdown document into its text chunks.

    All chunks, code blocks and tables are ordered by position. Every code
    block or table is appended to the text chunk that precedes it. Elements
    that appear before the first text chunk are prepended to that first chunk
    (previously they were silently dropped).

    Args:
        processed_doc: Document exposing ``chunks``, ``code`` and ``tables``,
            whose items carry ``start_index`` / ``end_index``.

    Returns:
        A deep copy of ``processed_doc`` whose ``chunks`` are the merged
        chunks; ``code`` and ``tables`` reference the original lists.
    """
    elements = [
        (kind, item.start_index, item.end_index, item)
        for kind, items in (
            ("chunk", processed_doc.chunks),
            ("code", processed_doc.code),
            ("table", processed_doc.tables),
        )
        for item in items
    ]
    # Stable sort: on identical positions a chunk still precedes code/tables.
    elements.sort(key=lambda item: (item[1], item[2]))

    def add_tokens(total, element):
        # Elements may lack token_count, or carry None; both count as zero and
        # leave the running total untouched.
        extra = getattr(element, "token_count", 0) or 0
        return total if not extra else (total or 0) + extra

    merged_chunks = []
    leading = []  # code/tables seen before the first text chunk
    base_chunk = None
    parts = []
    start_index = None
    end_index = None
    token_count = None

    def flush():
        nonlocal base_chunk, parts, start_index, end_index, token_count

        if base_chunk is None:
            return

        merged_chunks.append(
            replace(
                base_chunk,
                text="\n\n".join(part for part in parts if part),
                start_index=start_index,
                end_index=end_index,
                token_count=token_count,
            )
        )
        base_chunk = None
        parts = []
        start_index = None
        end_index = None
        token_count = None

    for kind, _, _, element in elements:
        if kind == "chunk":
            flush()
            base_chunk = element
            parts = [_get_text(item) for item in leading] + [_get_text(element)]
            start_index = min([element.start_index] + [item.start_index for item in leading])
            end_index = max([element.end_index] + [item.end_index for item in leading])
            token_count = element.token_count
            for item in leading:
                token_count = add_tokens(token_count, item)
            leading = []
            continue

        if base_chunk is None:
            # No text chunk yet: keep the element for the first chunk to come.
            leading.append(element)
            continue

        parts.append(_get_text(element))
        end_index = max(end_index, element.end_index)
        token_count = add_tokens(token_count, element)

    flush()

    if leading:
        # Only reachable when the document has no text chunk at all.
        logger.warning(
            f"{len(leading)} code/table element(s) could not be attached to any text chunk."
        )

    fused_processed_doc = deepcopy(processed_doc)
    fused_processed_doc.chunks = merged_chunks
    fused_processed_doc.code = processed_doc.code
    fused_processed_doc.tables = processed_doc.tables

    return fused_processed_doc
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticHandshakeWithMetadata(ElasticHandshake):
    """Extended ElasticHandshake that preserves chunk metadata in Elasticsearch.

    Records are dicts of the form ``{"chunk": Chunk, "extra_metadata": dict}``.
    Bare ``Chunk`` objects are accepted too and are treated as records without
    extra metadata.
    """

    # Document fields written by this class; extra metadata must not replace them.
    _CORE_FIELDS = ("text", "embedding", "start_index", "end_index", "token_count")

    @staticmethod
    def _as_records(chunks: Any) -> list[dict[str, Any]]:
        """Normalize a single record/Chunk or a list of them to a list of record dicts."""
        if isinstance(chunks, (Chunk, dict)):
            chunks = [chunks]
        return [item if isinstance(item, dict) else {"chunk": item} for item in chunks]

    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
        """Generate bulk actions including metadata.

        Args:
            chunks: Records (or bare chunks) to index.

        Returns:
            One bulk action per record with ``_index``, ``_id`` and ``_source``.
            Empty when there is nothing to index.
        """
        records = self._as_records(chunks)
        if not records:
            # Do not call the embedding model with an empty batch.
            return []

        embeddings = self.embedding_model.embed_batch(
            [record["chunk"].text for record in records]
        )

        actions = []
        for i, record in enumerate(records):
            chunk = record["chunk"]
            source = {
                "text": chunk.text,
                "embedding": embeddings[i],
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
                "token_count": chunk.token_count,
            }

            # Include metadata if it exists, without clobbering the core fields.
            for key, value in (record.get("extra_metadata") or {}).items():
                if key in self._CORE_FIELDS:
                    logger.warning(
                        f"Ignoring extra metadata key '{key}': it would overwrite a core field."
                    )
                    continue
                source[key] = value

            actions.append({
                "_index": self.index_name,
                "_id": self._generate_id(i, chunk),
                "_source": source,
            })

        return actions

    def write(
        self,
        chunks: Union[Chunk, dict[str, Any], list[Union[Chunk, dict[str, Any]]]],
    ) -> list[dict[str, Any]]:
        """Write the chunks to the Elasticsearch index using the bulk API.

        Args:
            chunks: A single record/chunk or a list of them.

        Returns:
            The bulk actions that were submitted (not the Elasticsearch responses).
        """
        actions = self._create_bulk_actions(chunks)
        if not actions:
            logger.info(f"No chunks to write to Elasticsearch index: {self.index_name}")
            return actions

        # Use the bulk helper to efficiently write the documents
        from elasticsearch.helpers import bulk

        # With raise_on_error=False, per-document failures are returned instead of raised.
        success, errors = bulk(self.client, actions, raise_on_error=False)

        if errors:
            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
            # Log the first few errors for debugging
            for i, error in enumerate(errors[:5]):  # type: ignore
                logger.error(f"Error {i + 1}: {error}")

        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")

        return actions
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
    """
    Collect the files under a project folder that carry one of the given extensions.

    Args:
        docs_folder_path (str): Folder to scan, relative to ``settings.proj_root``
        docs_extension (list[str]): Extensions to keep (e.g., [".md", ".avap"])

    Returns:
        List of Paths to the fetched documents
    """
    search_dir = f"{settings.proj_root}/{docs_folder_path}"
    return FileFetcher().fetch(dir=search_dir, ext=docs_extension)
|
||||||
|
|
||||||
|
|
||||||
|
def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
    """
    Turn documents into chunk records, choosing chef and chunking strategy by file type.

    ``.md`` files go through the Markdown chef and have their code/tables merged
    into the text chunks. ``.avap`` files are token-chunked and additionally
    parsed with the AVAP grammar to derive per-file command flags (a parse
    failure is logged and yields no flags). Other extensions are skipped.

    Args:
        docs_path: List of Paths to the documents to be processed.

    Returns:
        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
    """
    processed_docs = []
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)

    chef_md = MarkdownChef(tokenizer=custom_tokenizer)
    chef_txt = TextChef()
    chunker = TokenChunker(tokenizer=custom_tokenizer)

    grammar_file = settings.proj_root / "research/code_indexing/BNF/avap.lark"
    with open(grammar_file, encoding="utf-8") as grammar:
        grammar_text = grammar.read()
    lark_parser = Lark(
        grammar_text,
        parser="lalr",
        propagate_positions=True,
        start="program",
    )

    for doc_path in docs_path:
        suffix = doc_path.suffix.lower()
        if suffix not in (".md", ".avap"):
            continue

        if suffix == ".md":
            fused_doc = _merge_markdown_document(chef_md.process(doc_path))
            doc_chunks = fused_doc.chunks
            file_metadata = {
                "file_type": "avap_docs",
                "filename": doc_path.name,
            }
        else:
            code_doc = chef_txt.process(doc_path)

            try:
                ast = lark_parser.parse(code_doc.content)
            except Exception as e:
                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
                ast = None

            doc_chunks = chunker.chunk(code_doc.content)
            # The command flags describe the whole file and are attached to each of its chunks.
            file_metadata = {
                "file_type": "avap_code",
                "filename": doc_path.name,
                **_extract_command_metadata(ast),
            }

        # Each record gets its own copy of the metadata dict.
        processed_docs.extend(
            {"chunk": piece, "extra_metadata": {**file_metadata}} for piece in doc_chunks
        )

    return processed_docs
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_documents(
    chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> list[dict[str, Any]]:
    """
    Ingest processed documents into an Elasticsearch index.

    Args:
        chunked_docs (list[dict[str, Any]]): List of dicts with "chunk" and "extra_metadata" keys
        es_index (str): Name of the Elasticsearch index to ingest into
        es_request_timeout (int): Timeout for Elasticsearch requests in seconds
        es_max_retries (int): Maximum number of retries for Elasticsearch requests
        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion

    Returns:
        List of the bulk actions submitted to Elasticsearch, one per chunk
    """
    es_url = settings.elasticsearch_local_url
    logger.info(
        f"Instantiating Elasticsearch client with URL: {es_url}..."
    )
    client_options = {
        "request_timeout": es_request_timeout,
        "max_retries": es_max_retries,
        "retry_on_timeout": es_retry_on_timeout,
    }
    es = Elasticsearch(hosts=settings.elasticsearch_local_url, **client_options)

    if delete_es_index:
        # Only query the cluster for the index when a reset was requested.
        if es.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            es.indices.delete(index=es_index)

    embedder = OllamaEmbeddings(model=settings.ollama_emb_model_name)
    handshake = ElasticHandshakeWithMetadata(
        client=es,
        index_name=es_index,
        embedding_model=embedder,
    )

    chunk_total = len(chunked_docs)
    logger.info(
        f"Ingesting {chunk_total} chunks into Elasticsearch index: {es_index}..."
    )
    return handshake.write(chunked_docs)
|
||||||
|
|
||||||
|
|
||||||
|
def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
    """
    Export the indexed chunk actions to a JSON file.

    The input actions are left untouched: embeddings are converted to plain
    lists on a copy, so the function can be called repeatedly on the same data
    and the caller keeps its original embedding objects.

    Args:
        elasticsearch_chunks (list[dict[str, Any]]): Bulk actions, each with a "_source" dict
        output_path (str): Path, relative to ``settings.proj_root``, of the JSON file to write

    Returns:
        None
    """
    destination = settings.proj_root / output_path

    serializable = []
    for action in elasticsearch_chunks:
        source = dict(action["_source"])
        embedding = source.get("embedding")
        if hasattr(embedding, "tolist"):
            # Array-like embeddings (e.g. numpy) are not JSON serializable as-is.
            source["embedding"] = embedding.tolist()
        elif isinstance(embedding, (list, tuple)):
            # Already a sequence; coerce items to built-in floats for json.
            source["embedding"] = [float(value) for value in embedding]
        serializable.append({**action, "_source": source})

    # Make sure the target folder exists before opening the file for writing.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=4)

    logger.info(f"Exported processed documents to {destination}")
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,89 @@
|
||||||
|
{"chunk_id": "5208d7435c0286ab", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "encodeSHA256", "section": "", "start_line": 1, "end_line": 1, "content": "encodeSHA256(\"payload_data\", checksum)", "metadata": {"uses_crypto": true, "uses_string_ops": true, "complexity": 2}, "token_estimate": 9}
|
||||||
|
{"chunk_id": "e5e9b70428937778", "source_file": "docs/samples/hash_SHA256_para_integridad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "encodeSHA256(\"payload_data\", checksum)\naddResult(checksum)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
|
||||||
|
{"chunk_id": "49d6b31967a1db93", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "registerEndpoint", "section": "", "start_line": 1, "end_line": 1, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)", "metadata": {"registers_endpoint": true, "complexity": 1}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "e7ececd11823d42a", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "registerEndpoint(\"/hello_world\",\"GET\",[],\"HELLO_WORLD\",main,result)\naddVar(name,\"Alberto\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
|
||||||
|
{"chunk_id": "f103d7719754088f", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(name,\"Alberto\")\nresult = \"Hello,\" + name", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 14}
|
||||||
|
{"chunk_id": "4b1ab59c1acb224c", "source_file": "docs/samples/hello_world.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "result = \"Hello,\" + name\naddResult(result)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
|
||||||
|
{"chunk_id": "682adaeeb528f778", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 1, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")", "metadata": {"complexity": 0}, "token_estimate": 12}
|
||||||
|
{"chunk_id": "9bb665ca8d7590f7", "source_file": "docs/samples/hola_mundo.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addVar(mensaje, \"Hola mundo desde AVAP\")\naddResult(mensaje)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "ed0136ad03a51e7e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"emails\", emails)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
|
||||||
|
{"chunk_id": "899291ac8959ae3e", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "getQueryParamList", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"emails\", emails)\ngetQueryParamList(\"lista_correos\", lista_correos)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
|
||||||
|
{"chunk_id": "0eeff974dcd74729", "source_file": "docs/samples/captura_de_listas_multiples.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getQueryParamList(\"lista_correos\", lista_correos)\naddResult(lista_correos)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
|
||||||
|
{"chunk_id": "b2e95857d059d99d", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"lang\", l)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
|
||||||
|
{"chunk_id": "db2fab8dfbe7d460", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "addParam(\"lang\", l)\nif(l, \"es\", \"=\")\n addVar(msg, \"Hola\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
|
||||||
|
{"chunk_id": "2628fa886650658a", "source_file": "docs/samples/comparacion_simple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "if(l, \"es\", \"=\")\n addVar(msg, \"Hola\")\nend()\naddResult(msg)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
|
||||||
|
{"chunk_id": "89bddd6830b6a8af", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre", "metadata": {"complexity": 0}, "token_estimate": 18}
|
||||||
|
{"chunk_id": "6797d36c2eb0e38a", "source_file": "docs/samples/concatenacion_dinamica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nombre = \"Sistema\"\nlog = \"Evento registrado por: %s\" % nombre\naddResult(log)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
|
||||||
|
{"chunk_id": "93008a3bed0ea808", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"password\",pass_nueva)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
|
||||||
|
{"chunk_id": "142b2aef2f05fae7", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"password\",pass_nueva)\npass_antigua = \"password\"", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "b03b67f3aab35d7a", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "pass_antigua = \"password\"\nif(pass_nueva, pass_antigua, \"!=\")\n addVar(cambio, \"Contraseña actualizada\")\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
|
||||||
|
{"chunk_id": "99549cab6c8617d8", "source_file": "docs/samples/if_desigualdad.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(pass_nueva, pass_antigua, \"!=\")\n addVar(cambio, \"Contraseña actualizada\")\nend()\naddResult(cambio)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 31}
|
||||||
|
{"chunk_id": "123dfdacd4160b0d", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "replace", "section": "", "start_line": 1, "end_line": 1, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "c65655393175720a", "source_file": "docs/samples/limpieza_de_strings.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "replace(\"REF_1234_OLD\",\"OLD\", \"NEW\", ref_actualizada)\naddResult(ref_actualizada)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 23}
|
||||||
|
{"chunk_id": "3edbf12e560e22b1", "source_file": "docs/samples/manejo_error_sql_critico.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 7, "content": "try()\n ormDirect(\"UPDATE table_inexistente SET a=1\", res)\nexception(e)\n addVar(_status, 500)\n addVar(error_msg, \"Error de base de datos\")\n addResult(error_msg)\nend()", "metadata": {"uses_orm": true, "uses_auth": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 5}, "token_estimate": 51}
|
||||||
|
{"chunk_id": "75bcc1f794c8527f", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"sal_par\",saldo)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
|
||||||
|
{"chunk_id": "99462a4539651e84", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"sal_par\",saldo)\nif(saldo, 0, \">\")\n permitir = True\nelse()\n permitir = False\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 33}
|
||||||
|
{"chunk_id": "c9134748119a6401", "source_file": "docs/samples/else_estandar.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "else()\n permitir = False\nend()\naddResult(permitir)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "da88ce6ec35e309a", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 19}
|
||||||
|
{"chunk_id": "ef826cb80ab05a8c", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 3, "end_line": 5, "content": "addParam(\"userrype\", user_type)\naddParam(\"sells\", compras)\nif(None, None, \" user_type == 'VIP' or compras > 100\")\n addVar(descuento, 0.20)\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 51}
|
||||||
|
{"chunk_id": "117c5396b3e2f3bd", "source_file": "docs/samples/expresion_compleja.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "if(None, None, \" user_type == 'VIP' or compras > 100\")\n addVar(descuento, 0.20)\nend()\naddResult(descuento)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 37}
|
||||||
|
{"chunk_id": "559f8f61eda7ff75", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 27}
|
||||||
|
{"chunk_id": "b40f10f126c22c01", "source_file": "docs/samples/fecha_para_base_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"%Y-%m-%d %H:%M:%S\", 0, \"Europe/Madrid\", sql_date)\naddResult(sql_date)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
|
||||||
|
{"chunk_id": "717f75fe4eb08ecf", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "registros = ['1','2','3']", "metadata": {"complexity": 0}, "token_estimate": 10}
|
||||||
|
{"chunk_id": "8a695ac320884362", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "registros = ['1','2','3']\ngetListLen(registros, total)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "9530c2cad477b991", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(registros, total)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
|
||||||
|
{"chunk_id": "c4acc74c9b001703", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 4, "end_line": 6, "content": "contador = 0\nstartLoop(idx, 0, 2)\n actual = registros[int(idx)]\nendLoop()", "metadata": {"uses_loop": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 25}
|
||||||
|
{"chunk_id": "80e935fcd6c7a232", "source_file": "docs/samples/bucle_longitud_de_datos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 7, "content": "startLoop(idx, 0, 2)\n actual = registros[int(idx)]\nendLoop()\naddResult(actual)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
|
||||||
|
{"chunk_id": "576b1bc85805eef0", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 86400, \"UTC\", expira)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 13}
|
||||||
|
{"chunk_id": "686f254e071d6280", "source_file": "docs/samples/calculo_de_expiracion.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 86400, \"UTC\", expira)\naddResult(expira)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
|
||||||
|
{"chunk_id": "79fd8fee120921e7", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"client_id\", id_interno)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 10}
|
||||||
|
{"chunk_id": "03697091447c57d4", "source_file": "docs/samples/captura_de_id.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"client_id\", id_interno)\naddResult(id_interno)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "2c64510b9ac6042b", "source_file": "docs/samples/try_catch_request.avap", "doc_type": "code", "block_type": "try", "section": "", "start_line": 1, "end_line": 6, "content": "try()\n RequestGet(\"https://api.test.com/data\", 0, 0, respuesta, None)\nexception(e)\n addVar(error_trace, e)\n addResult(error_trace)\nend()", "metadata": {"uses_http": true, "uses_error_handling": true, "uses_exception": true, "returns_result": true, "complexity": 4}, "token_estimate": 42}
|
||||||
|
{"chunk_id": "4d9f72fb03ba6d2b", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"api_key\", key)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 8}
|
||||||
|
{"chunk_id": "19fa0a3950612c1e", "source_file": "docs/samples/validacion_de_nulo.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 6, "content": "addParam(\"api_key\", key)\nif(key, None, \"==\")\n addVar(_status, 403)\n addVar(error, \"Acceso denegado: falta API KEY\")\n addResult(error)\nend()", "metadata": {"uses_auth": true, "uses_conditional": true, "returns_result": true, "complexity": 3, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 47}
|
||||||
|
{"chunk_id": "e06fe329097212dd", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"rol\", r)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 7}
|
||||||
|
{"chunk_id": "285aeb7e911a5075", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"rol\", r)\nacceso = False", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 11}
|
||||||
|
{"chunk_id": "f8ed75075b7b1b13", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 4, "end_line": 6, "content": "acceso = False\nif(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n acceso = True\nend()", "metadata": {"uses_conditional": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
|
||||||
|
{"chunk_id": "b323dedebcbd9036", "source_file": "docs/samples/validacion_in_pertenece_a_lista.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "if(None, None, \"r == 'admin' or r == 'editor' or r == 'root'\")\n acceso = True\nend()\naddResult(acceso)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 35}
|
||||||
|
{"chunk_id": "d02cc7019c314251", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "datos_cliente = \"datos\"", "metadata": {"complexity": 0}, "token_estimate": 6}
|
||||||
|
{"chunk_id": "c1528242fcd85a68", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 2, "end_line": 2, "content": "datos_cliente = \"datos\"\naddVar(clave, \"cliente_vip\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "d335da8caf95ac8d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addVariableToJSON", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(clave, \"cliente_vip\")\nAddvariableToJSON(clave, datos_cliente, mi_json_final)", "metadata": {"uses_json": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 24}
|
||||||
|
{"chunk_id": "27067ebe43e3b05d", "source_file": "docs/samples/construccion_dinamica_de_objeto.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "AddvariableToJSON(clave, datos_cliente, mi_json_final)\naddResult(mi_json_final)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 20}
|
||||||
|
{"chunk_id": "a25dfc3b319135d3", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 1, "content": "addParam(\"data_list\", mi_lista)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 9}
|
||||||
|
{"chunk_id": "d96fd663666733fe", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "getListLen", "section": "", "start_line": 2, "end_line": 2, "content": "addParam(\"data_list\", mi_lista)\ngetListLen(mi_lista, cantidad)", "metadata": {"uses_list": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "9905db6de1ea3067", "source_file": "docs/samples/contador_de_parametros.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "getListLen(mi_lista, cantidad)\naddResult(cantidad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 12}
|
||||||
|
{"chunk_id": "7c239ad53392d63d", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "stampToDatetime", "section": "", "start_line": 1, "end_line": 1, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 22}
|
||||||
|
{"chunk_id": "c4dc5d3c081101a5", "source_file": "docs/samples/conversion_timestamp_legible.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "stampToDatetime(1708726162, \"%d/%m/%Y\", 0, fecha_human)\naddResult(fecha_human)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 28}
|
||||||
|
{"chunk_id": "2905488dffcbd7ba", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(base, 1000)\naddVar(copia, $base)", "metadata": {"complexity": 0}, "token_estimate": 16}
|
||||||
|
{"chunk_id": "82e05ef62a72de87", "source_file": "docs/samples/referencia_por_valor.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "addVar(base, 1000)\naddVar(copia, $base)\naddResult(copia)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
|
||||||
|
{"chunk_id": "a6727546f328e768", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addVar", "section": "", "start_line": 1, "end_line": 2, "content": "addVar(code, 200)\naddVar(status, \"Success\")", "metadata": {"complexity": 0}, "token_estimate": 14}
|
||||||
|
{"chunk_id": "ce12abd61c278bec", "source_file": "docs/samples/respuesta_multiple.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 4, "content": "addVar(code, 200)\naddVar(status, \"Success\")\naddResult(code)\naddResult(status)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 22}
|
||||||
|
{"chunk_id": "45b0086b13784a7d", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 1, "content": "encontrado = False", "metadata": {"complexity": 0}, "token_estimate": 5}
|
||||||
|
{"chunk_id": "c6df33b0e7eac0ff", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 2, "end_line": 7, "content": "encontrado = False\nstartLoop(i, 1, 10)\n if(i, 5, \"==\")\n encontrado = True\n i = 11 \n end()\nendLoop()", "metadata": {"uses_loop": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 42}
|
||||||
|
{"chunk_id": "02edc488f13b7367", "source_file": "docs/samples/salida_bucle_correcta.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 8, "end_line": 8, "content": "i = 11 \n end()\nendLoop()\naddResult(encontrado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "c8dbbbf6cb64c10d", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 4, "content": "function suma(a, b){\n total = a + b\n return(total)\n }", "metadata": {"complexity": 0}, "token_estimate": 19}
|
||||||
|
{"chunk_id": "1065800a57207e04", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function suma(a, b)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 4}, "token_estimate": 6}
|
||||||
|
{"chunk_id": "1ef5fa8a4a980012", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 5, "end_line": 5, "content": "// contexto: function suma(a, b)\nresultado = suma(10, 20)", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
|
||||||
|
{"chunk_id": "ff7df988add5bbef", "source_file": "docs/samples/funcion_de_suma.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 6, "end_line": 6, "content": "// contexto: function suma(a, b)\naddResult(resultado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 13}
|
||||||
|
{"chunk_id": "b8682e4f71d9d7c3", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "function", "section": "", "start_line": 1, "end_line": 7, "content": "function es_valido(token){\n response = False\n if(token, \"SECRET\", \"=\")\n response = True\n end()\n return(response)\n }", "metadata": {"uses_conditional": true, "complexity": 1}, "token_estimate": 34}
|
||||||
|
{"chunk_id": "a1cfc36abdf661a0", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "function_signature", "block_type": "function_signature", "section": "", "start_line": 1, "end_line": 1, "content": "function es_valido(token)", "metadata": {"complexity": 0, "full_block_start": 1, "full_block_end": 7}, "token_estimate": 6}
|
||||||
|
{"chunk_id": "66706bf4b7d3aede", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 8, "end_line": 8, "content": "// contexto: function es_valido(token)\nautorizado = es_valido(\"SECRET\")", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 18}
|
||||||
|
{"chunk_id": "5932e6b75c40b7db", "source_file": "docs/samples/funcion_validacion_acceso.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 9, "end_line": 9, "content": "// contexto: function es_valido(token)\naddResult(autorizado)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "function_sig"}, "token_estimate": 15}
|
||||||
|
{"chunk_id": "4be60a16d7cc7c4d", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "randomString", "section": "", "start_line": 1, "end_line": 1, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)", "metadata": {"uses_string_ops": true, "complexity": 1}, "token_estimate": 15}
|
||||||
|
{"chunk_id": "1810ca839b071a65", "source_file": "docs/samples/generador_de_tokens_aleatorios.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "randomString(\"[A-Z]\\d\", 32, token_seguridad)\naddResult(token_seguridad)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 21}
|
||||||
|
{"chunk_id": "ed8b4a4e75a71762", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "getDateTime", "section": "", "start_line": 1, "end_line": 1, "content": "getDateTime(\"\", 0, \"UTC\", ahora)", "metadata": {"uses_datetime": true, "complexity": 1}, "token_estimate": 11}
|
||||||
|
{"chunk_id": "05d2d0c8e6266861", "source_file": "docs/samples/obtencion_timestamp.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 2, "end_line": 2, "content": "getDateTime(\"\", 0, \"UTC\", ahora)\naddResult(ahora)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 17}
|
||||||
|
{"chunk_id": "02d7b0e4a1e1f09c", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "orm_command", "section": "", "start_line": 1, "end_line": 1, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)", "metadata": {"uses_orm": true, "complexity": 1}, "token_estimate": 13}
|
||||||
|
{"chunk_id": "6daea421c5a1d565", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "if", "section": "", "start_line": 2, "end_line": 4, "content": "ormCheckTable(tabla_pruebas,resultado_comprobacion)\nif(resultado_comprobacion,False,'==')\n ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()", "metadata": {"uses_orm": true, "uses_conditional": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
|
||||||
|
{"chunk_id": "47d660e6c1f124d1", "source_file": "docs/samples/ormAccessCreate.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 6, "content": "if(resultado_comprobacion,False,'==')\n ormCreateTable(\"username,age\",'VARCHAR,INTEGER',tabla_pruebas,resultado_creacion)\nend()\naddResult(resultado_comprobacion)\naddResult(resultado_creacion)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 45}
|
||||||
|
{"chunk_id": "b15daff2028a2136", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addParam", "section": "", "start_line": 1, "end_line": 2, "content": "addParam(\"page\", p)\naddParam(\"size\", s)", "metadata": {"uses_auth": true, "complexity": 1}, "token_estimate": 14}
|
||||||
|
{"chunk_id": "8f1fa0e84c981765", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 3, "end_line": 6, "content": "addParam(\"page\", p)\naddParam(\"size\", s)\nregistros = [\"u1\", \"u2\", \"u3\", \"u4\", \"u5\", \"u6\"]\noffset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0", "metadata": {"complexity": 0, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 62}
|
||||||
|
{"chunk_id": "e27ce4178666239b", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 7, "end_line": 8, "content": "offset = int(p) * int(s)\nlimite = offset + int(s)\ncontador = 0\naddResult(offset)\naddResult(limite)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
|
||||||
|
{"chunk_id": "9a66c0e4c49bbbcb", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 9, "end_line": 13, "content": "addResult(offset)\naddResult(limite)\nstartLoop(i, 2, limite)\n actual = registros[int(i)]\n titulo = \"reg_%s\" % i\n AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 53}
|
||||||
|
{"chunk_id": "77c985068f6f9269", "source_file": "docs/samples/paginacion_dinamica_recursos.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 14, "end_line": 14, "content": "titulo = \"reg_%s\" % i\n AddvariableToJSON(titulo, actual, pagina_json)\nendLoop()\naddResult(pagina_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
|
||||||
|
{"chunk_id": "aeb4f87681bdc8b4", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 2, "content": "nivel = 5\nes_admin = nivel >= 10", "metadata": {"complexity": 0}, "token_estimate": 12}
|
||||||
|
{"chunk_id": "5f0f938196d5e573", "source_file": "docs/samples/asignacion_booleana.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 3, "end_line": 3, "content": "nivel = 5\nes_admin = nivel >= 10\naddResult(es_admin)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 18}
|
||||||
|
{"chunk_id": "42fb50109876864c", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "assignment", "section": "", "start_line": 1, "end_line": 3, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva", "metadata": {"complexity": 0}, "token_estimate": 22}
|
||||||
|
{"chunk_id": "6019c2adc7750c04", "source_file": "docs/samples/asignacion_matematica.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 4, "end_line": 4, "content": "subtotal = 150.50\niva = subtotal * 0.21\ntotal = subtotal + iva\naddResult(total)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 27}
|
||||||
|
{"chunk_id": "e2f6a0de7e7f9dc1", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "startLoop", "section": "", "start_line": 1, "end_line": 4, "content": "startLoop(i,1,10)\n item = \"item_%s\" % i\n AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()", "metadata": {"uses_loop": true, "uses_json": true, "complexity": 2}, "token_estimate": 36}
|
||||||
|
{"chunk_id": "ce1f2fab7c807537", "source_file": "docs/samples/bucle_1_10.avap", "doc_type": "code", "block_type": "addResult", "section": "", "start_line": 5, "end_line": 5, "content": "item = \"item_%s\" % i\n AddvariableToJSON(item,'valor_generado',mi_json)\nendLoop()\naddResult(mi_json)", "metadata": {"returns_result": true, "complexity": 1, "has_overlap": true, "overlap_type": "line_tail"}, "token_estimate": 32}
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from scripts.pipelines.tasks.chunk import (
|
||||||
|
fetch_documents,
|
||||||
|
process_documents,
|
||||||
|
export_documents,
|
||||||
|
ingest_documents
|
||||||
|
)
|
||||||
|
|
||||||
|
app = typer.Typer()
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/samples",
    output_path: str = "research/code_indexing/chunks/chunks_EBNF_metadata.json",
    docs_extension: list[str] = [".avap"],
    es_index: str = "avap-code-indexing-ebnf-metadata",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True
) -> None:
    """
    Pipeline to ingest documents into an Elasticsearch index.
    The pipeline fetches documents from a specified folder, processes them into chunks, ingests those chunks into the specified Elasticsearch index, and finally exports the ingested documents to a file.

    Args:
        docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
        output_path (str): Path of the file the ingested documents are exported to. Default is "research/code_indexing/chunks/chunks_EBNF_metadata.json".
        docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".avap"].
        es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-code-indexing-ebnf-metadata".
        es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
        es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
        es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
        delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.

    Returns:
        None
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")
    logger.info(f"Fetching files from {docs_folder_path}...")
    docs_path = fetch_documents(docs_folder_path, docs_extension)

    logger.info("Processing docs...")
    chunked_docs = process_documents(docs_path)

    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    elasticsearch_docs = ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
                                          es_retry_on_timeout, delete_es_index)

    # The export uses the value returned by ingest_documents, so the file on
    # disk reflects what was sent to Elasticsearch rather than the raw chunks.
    logger.info(f"Exporting processed documents to {output_path}...")
    export_documents(elasticsearch_docs, output_path)

    logger.info(f"Finished ingesting in {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the Typer CLI; any failure is logged with its traceback and then
    # re-raised so the process still terminates with an error.
    try:
        app()
    except Exception as error:
        logger.exception(error)
        raise
|
||||||
|
|
@ -0,0 +1,198 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "d520f6c3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"from datasets import load_dataset\n",
|
||||||
|
"\n",
|
||||||
|
"import boto3\n",
|
||||||
|
"from botocore.config import Config\n",
|
||||||
|
"from langchain_core.messages import SystemMessage, HumanMessage\n",
|
||||||
|
"\n",
|
||||||
|
"from src.utils.llm_factory import create_chat_model\n",
|
||||||
|
"from src.config import settings"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e08b9060",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Create LLM instance"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "81111a86",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"config = Config(\n",
|
||||||
|
" region_name=\"us-east-1\",\n",
|
||||||
|
" connect_timeout=10, \n",
|
||||||
|
" read_timeout=600, \n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"client = boto3.client(\"bedrock-runtime\", config=config)\n",
|
||||||
|
"\n",
|
||||||
|
"llm = create_chat_model(\n",
|
||||||
|
" provider=\"bedrock\",\n",
|
||||||
|
" client=client,\n",
|
||||||
|
" model=\"global.anthropic.claude-sonnet-4-6\",\n",
|
||||||
|
" temperature=0,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "045f8e81",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Load AVAP data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "07dea3fe",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(settings.proj_root / \"docs/LRM/avap.md\", \"r\") as f:\n",
|
||||||
|
" avap_docs = f.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "adbbe8b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Loaded 33 AVAP samples\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"samples_dir = settings.proj_root / \"docs/samples\"\n",
|
||||||
|
"avap_samples = []\n",
|
||||||
|
"\n",
|
||||||
|
"for avap_file in sorted(samples_dir.glob(\"*.avap\")):\n",
|
||||||
|
" with open(avap_file, \"r\") as f:\n",
|
||||||
|
" code = f.read()\n",
|
||||||
|
" \n",
|
||||||
|
" avap_samples.append({\n",
|
||||||
|
" \"file\": avap_file.name,\n",
|
||||||
|
" \"code\": code\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
"# Display as JSON\n",
|
||||||
|
"avap_samples_json = json.dumps(avap_samples, indent=2, ensure_ascii=False)\n",
|
||||||
|
"print(f\"Loaded {len(avap_samples)} AVAP samples\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7a15e09a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Prompt"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "895a170f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"GOLDEN_DATASET_PROMPT = SystemMessage(\n",
|
||||||
|
" content=f\"\"\"\n",
|
||||||
|
" You are an AI agent responsible for generating a golden dataset of queries for AVAP code retrieval and understanding.\n",
|
||||||
|
"\n",
|
||||||
|
" You will receive a JSON array of AVAP code samples, each with a 'file' name and 'code' content.\n",
|
||||||
|
"\n",
|
||||||
|
" Your task is to:\n",
|
||||||
|
" 1. Analyze each AVAP code sample.\n",
|
||||||
|
" 2. Generate 2-3 natural language queries that can be answered by examining that specific code.\n",
|
||||||
|
" 3. Output a JSON array where each element has:\n",
|
||||||
|
" - \"query\": A natural language question about AVAP code implementation, best practices, or specific constructs.\n",
|
||||||
|
" - \"context\": The filename of the code sample that provides the context/answer for this query.\n",
|
||||||
|
"\n",
|
||||||
|
" Requirements:\n",
|
||||||
|
" - Queries should be diverse: ask about functions, control flow, API operations, error handling, etc.\n",
|
||||||
|
" - Queries must be answerable using ONLY the provided code samples.\n",
|
||||||
|
" - Queries should be framed as natural developer questions (e.g., \"How do you handle errors in AVAP?\" or \"Show me an example of looping over a list\").\n",
|
||||||
|
" - Use natural English (or Spanish if context is Spanish-language code).\n",
|
||||||
|
" - Do not reference exact variable names unless necessary; focus on the patterns and constructs used.\n",
|
||||||
|
" - Output MUST be valid JSON array format.\n",
|
||||||
|
"\n",
|
||||||
|
" AVAP Code Samples:\n",
|
||||||
|
" {avap_samples_json}\n",
|
||||||
|
"\n",
|
||||||
|
" Output format (JSON array):\n",
|
||||||
|
" [\n",
|
||||||
|
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
|
||||||
|
" {{\"query\": \"...\", \"context\": \"filename.avap\"}},\n",
|
||||||
|
" ...\n",
|
||||||
|
" ]\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a3123199",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "98c4f93c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "723352ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "assistance-engine",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue