added AST tree metadata

acano 2026-03-25 10:36:18 +01:00
parent dc8230c872
commit fe90548b8b
1 changed file with 90 additions and 16 deletions

@@ -4,7 +4,7 @@ from dataclasses import replace
 from pathlib import Path
 from typing import Any, Union
-from lark import Lark
+from lark import Lark, Tree
 from chonkie import (
     Chunk,
     ElasticHandshake,
@@ -21,6 +21,70 @@ from transformers import AutoTokenizer
 from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
 from src.config import settings
 
+COMMAND_METADATA_NAMES = {
+    # system
+    "register_cmd": "registerEndpoint",
+    "addvar_cmd": "addVar",
+    "addparam_cmd": "addParam",
+    "getlistlen_cmd": "getListLen",
+    "getparamlist_cmd": "getQueryParamList",
+    "addresult_cmd": "addResult",
+    # async
+    "go_stmt": "go",
+    "gather_stmt": "gather",
+    # connector
+    "connector_instantiation": "avapConnector",
+    # http
+    "req_post_cmd": "RequestPost",
+    "req_get_cmd": "RequestGet",
+    # db
+    "orm_direct": "ormDirect",
+    "orm_check": "ormCheckTable",
+    "orm_create": "ormCreateTable",
+    "orm_select": "ormAccessSelect",
+    "orm_insert": "ormAccessInsert",
+    "orm_update": "ormAccessUpdate",
+    # util
+    "json_list_cmd": "json_list_ops",
+    "crypto_cmd": "crypto_ops",
+    "regex_cmd": "getRegex",
+    "datetime_cmd": "getDateTime",
+    "stamp_cmd": "timestamp_ops",
+    "string_cmd": "randomString",
+    "replace_cmd": "replace",
+    # modularity
+    "include_stmt": "include",
+    "import_stmt": "import",
+    # generic statements
+    "assignment": "assignment",
+    "call_stmt": "call",
+    "return_stmt": "return",
+    "if_stmt": "if",
+    "loop_stmt": "startLoop",
+    "try_stmt": "try",
+    "function_decl": "function",
+}
+
+
+def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
+    if ast is None:
+        return {}
+    used_commands: set[str] = set()
+    for subtree in ast.iter_subtrees():
+        if subtree.data in COMMAND_METADATA_NAMES:
+            used_commands.add(COMMAND_METADATA_NAMES[subtree.data])
+    return {command_name: True for command_name in sorted(used_commands)}
+
+
 def _get_text(element) -> str:
     for attr in ("text", "content", "markdown"):
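A quick illustration of the new helper (not part of the commit): the hand-built Tree below is a hypothetical stand-in for what lark_parser.parse() would yield under the real avap.lark grammar, and it assumes _extract_command_metadata and COMMAND_METADATA_NAMES from the hunk above are in scope.

    from lark import Tree

    # Hypothetical parse tree: two assignments and one HTTP GET command.
    ast = Tree("program", [
        Tree("assignment", []),
        Tree("req_get_cmd", []),
        Tree("assignment", []),
    ])

    # iter_subtrees() visits every rule node, so repeated commands
    # collapse into one boolean flag per mapped name.
    print(_extract_command_metadata(ast))
    # {'RequestGet': True, 'assignment': True}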
@@ -168,60 +232,70 @@ def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Pa
     return docs_path
 
 
-def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
+def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
     """
     Process documents by applying appropriate chefs and chunking strategies based on file type.
 
     Args:
-        docs_path (list[Path]): List of Paths to the documents to be processed
+        docs_path: List of Paths to the documents to be processed.
 
     Returns:
-        List of dicts with "chunk" (Chunk object) and "metadata" (dict with file info)
+        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
     """
     processed_docs = []
-    specific_metadata = {}
     custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
     chef_md = MarkdownChef(tokenizer=custom_tokenizer)
     chef_txt = TextChef()
     chunker = TokenChunker(tokenizer=custom_tokenizer)
-    with open(settings.proj_root / "docs/BNF/avap.lark") as grammar:
-        lark_parser = Lark(grammar=grammar, parser="lalr", propagate_positions=True, start="program")
+    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
+        lark_parser = Lark(
+            grammar.read(),
+            parser="lalr",
+            propagate_positions=True,
+            start="program",
+        )
 
     for doc_path in docs_path:
         doc_extension = doc_path.suffix.lower()
         if doc_extension == ".md":
             processed_doc = chef_md.process(doc_path)
             fused_doc = _merge_markdown_document(processed_doc)
             chunked_doc = fused_doc.chunks
             specific_metadata = {
                 "file_type": "avap_docs",
-                "filename": doc_path.name
+                "filename": doc_path.name,
             }
         elif doc_extension == ".avap":
             processed_doc = chef_txt.process(doc_path)
-            chunked_doc = chunker.chunk(processed_doc.content)
             try:
                 ast = lark_parser.parse(processed_doc.content)
             except Exception as e:
                 logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
                 ast = None
+            chunked_doc = chunker.chunk(processed_doc.content)
             specific_metadata = {
                 "file_type": "avap_code",
                 "filename": doc_path.name,
-                "AST": str(ast)
+                **_extract_command_metadata(ast),
             }
         else:
             continue
         for chunk in chunked_doc:
-            processed_docs.append({
-                "chunk": chunk,
-                "extra_metadata": {**specific_metadata}
-            })
+            processed_docs.append(
+                {
+                    "chunk": chunk,
+                    "extra_metadata": {**specific_metadata},
+                }
+            )
 
     return processed_docs
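
For reference, the per-chunk extra_metadata attached to an .avap document now looks like the sketch below (file name and flags hypothetical): every command the parser recognized becomes one boolean flag, replacing the old stringified "AST" field.

    extra_metadata = {
        "file_type": "avap_code",
        "filename": "orders.avap",   # hypothetical file name
        "RequestGet": True,          # a req_get_cmd subtree was found
        "registerEndpoint": True,    # a register_cmd subtree was found
        "assignment": True,          # at least one assignment statement
    }

Flat boolean keys make the used commands filterable as structured metadata, which the previous raw AST string was not.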