added ast tree metadata

acano 2026-03-25 10:36:18 +01:00
parent dc8230c872
commit fe90548b8b
1 changed file with 90 additions and 16 deletions


@@ -4,7 +4,7 @@ from dataclasses import replace
 from pathlib import Path
 from typing import Any, Union
-from lark import Lark
+from lark import Lark, Tree
 from chonkie import (
     Chunk,
     ElasticHandshake,
@@ -21,6 +21,70 @@ from transformers import AutoTokenizer
 from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
 from src.config import settings
+
+COMMAND_METADATA_NAMES = {
+    # system
+    "register_cmd": "registerEndpoint",
+    "addvar_cmd": "addVar",
+    "addparam_cmd": "addParam",
+    "getlistlen_cmd": "getListLen",
+    "getparamlist_cmd": "getQueryParamList",
+    "addresult_cmd": "addResult",
+    # async
+    "go_stmt": "go",
+    "gather_stmt": "gather",
+    # connector
+    "connector_instantiation": "avapConnector",
+    # http
+    "req_post_cmd": "RequestPost",
+    "req_get_cmd": "RequestGet",
+    # db
+    "orm_direct": "ormDirect",
+    "orm_check": "ormCheckTable",
+    "orm_create": "ormCreateTable",
+    "orm_select": "ormAccessSelect",
+    "orm_insert": "ormAccessInsert",
+    "orm_update": "ormAccessUpdate",
+    # util
+    "json_list_cmd": "json_list_ops",
+    "crypto_cmd": "crypto_ops",
+    "regex_cmd": "getRegex",
+    "datetime_cmd": "getDateTime",
+    "stamp_cmd": "timestamp_ops",
+    "string_cmd": "randomString",
+    "replace_cmd": "replace",
+    # modularity
+    "include_stmt": "include",
+    "import_stmt": "import",
+    # generic statements
+    "assignment": "assignment",
+    "call_stmt": "call",
+    "return_stmt": "return",
+    "if_stmt": "if",
+    "loop_stmt": "startLoop",
+    "try_stmt": "try",
+    "function_decl": "function",
+}
+
+
+def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
+    if ast is None:
+        return {}
+    used_commands: set[str] = set()
+    for subtree in ast.iter_subtrees():
+        if subtree.data in COMMAND_METADATA_NAMES:
+            used_commands.add(COMMAND_METADATA_NAMES[subtree.data])
+    return {command_name: True for command_name in sorted(used_commands)}
+
+
 def _get_text(element) -> str:
     for attr in ("text", "content", "markdown"):
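As a quick sanity check of the new helper, here is a minimal sketch of what _extract_command_metadata returns for a hand-built lark.Tree. The rule names come from the COMMAND_METADATA_NAMES map above; the toy tree itself is invented for illustration and is not part of this commit:

    from lark import Tree

    # Toy AST: one addVar command, one assignment, and a node whose rule
    # name is not in COMMAND_METADATA_NAMES (it is simply skipped).
    toy_ast = Tree("program", [
        Tree("addvar_cmd", []),
        Tree("assignment", []),
        Tree("some_other_rule", []),
    ])

    print(_extract_command_metadata(toy_ast))  # {'addVar': True, 'assignment': True}
    print(_extract_command_metadata(None))     # {}

Tree.iter_subtrees() visits every Tree node, including the root, so commands are detected at any nesting depth, and the sorted comprehension keeps the emitted metadata keys deterministic.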
@@ -168,60 +232,70 @@ def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
     return docs_path
 
 
-def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
+def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
     """
     Process documents by applying appropriate chefs and chunking strategies based on file type.
 
     Args:
-        docs_path (list[Path]): List of Paths to the documents to be processed
+        docs_path: List of Paths to the documents to be processed.
 
     Returns:
-        List of dicts with "chunk" (Chunk object) and "metadata" (dict with file info)
+        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
     """
     processed_docs = []
     specific_metadata = {}
     custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
     chef_md = MarkdownChef(tokenizer=custom_tokenizer)
     chef_txt = TextChef()
     chunker = TokenChunker(tokenizer=custom_tokenizer)
-    with open(settings.proj_root / "docs/BNF/avap.lark") as grammar:
-        lark_parser = Lark(grammar=grammar, parser="lalr", propagate_positions=True, start="program")
+    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
+        lark_parser = Lark(
+            grammar.read(),
+            parser="lalr",
+            propagate_positions=True,
+            start="program",
+        )
     for doc_path in docs_path:
         doc_extension = doc_path.suffix.lower()
         if doc_extension == ".md":
             processed_doc = chef_md.process(doc_path)
             fused_doc = _merge_markdown_document(processed_doc)
             chunked_doc = fused_doc.chunks
             specific_metadata = {
                 "file_type": "avap_docs",
-                "filename": doc_path.name
+                "filename": doc_path.name,
             }
         elif doc_extension == ".avap":
             processed_doc = chef_txt.process(doc_path)
-            ast = lark_parser.parse(processed_doc.content)
-            chunked_doc = chunker.chunk(processed_doc.content)
+            try:
+                ast = lark_parser.parse(processed_doc.content)
+            except Exception as e:
+                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
+                ast = None
+            chunked_doc = chunker.chunk(processed_doc.content)
             specific_metadata = {
                 "file_type": "avap_code",
                 "filename": doc_path.name,
-                "AST": str(ast)
+                **_extract_command_metadata(ast),
             }
         else:
             continue
-        for chunk in chunked_doc:
-            processed_docs.append({
-                "chunk": chunk,
-                "extra_metadata": {**specific_metadata}
-            })
+        for chunk in chunked_doc:
+            processed_docs.append(
+                {
+                    "chunk": chunk,
+                    "extra_metadata": {**specific_metadata},
+                }
+            )
     return processed_docs
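Net effect: instead of storing the whole parse tree stringified under a single "AST" key, each chunk's extra_metadata now carries one boolean flag per AVAP command actually used in the source file. A minimal sketch of how those flags could be consumed downstream (fetch_documents and process_documents are the functions from this file, while the corpus path and the HTTP filter are hypothetical):

    processed = process_documents(fetch_documents("path/to/corpus", [".md", ".avap"]))

    # e.g. keep only chunks that came from AVAP files using HTTP commands
    http_chunks = [
        doc for doc in processed
        if doc["extra_metadata"].get("RequestPost") or doc["extra_metadata"].get("RequestGet")
    ]

Flat boolean fields like these can be filtered on directly by most vector stores, which is not practical with the stringified tree the previous version stored.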