Add AST tree metadata
This commit is contained in:
parent
dc8230c872
commit
fe90548b8b
|
|
@ -4,7 +4,7 @@ from dataclasses import replace
|
|||
from pathlib import Path
|
||||
from typing import Any, Union
|
||||
|
||||
from lark import Lark
|
||||
from lark import Lark, Tree
|
||||
from chonkie import (
|
||||
Chunk,
|
||||
ElasticHandshake,
|
||||
|
|
@ -21,6 +21,70 @@ from transformers import AutoTokenizer
|
|||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from src.config import settings
|
||||
|
||||
# Maps lark grammar rule names (Tree.data) to the human-facing AVAP command
# names stored as boolean metadata flags on each indexed chunk.
COMMAND_METADATA_NAMES = {
    # --- system commands ---
    "register_cmd": "registerEndpoint",
    "addvar_cmd": "addVar",
    "addparam_cmd": "addParam",
    "getlistlen_cmd": "getListLen",
    "getparamlist_cmd": "getQueryParamList",
    "addresult_cmd": "addResult",

    # --- async primitives ---
    "go_stmt": "go",
    "gather_stmt": "gather",

    # --- connector ---
    "connector_instantiation": "avapConnector",

    # --- http ---
    "req_post_cmd": "RequestPost",
    "req_get_cmd": "RequestGet",

    # --- db / ORM ---
    "orm_direct": "ormDirect",
    "orm_check": "ormCheckTable",
    "orm_create": "ormCreateTable",
    "orm_select": "ormAccessSelect",
    "orm_insert": "ormAccessInsert",
    "orm_update": "ormAccessUpdate",

    # --- utilities ---
    "json_list_cmd": "json_list_ops",
    "crypto_cmd": "crypto_ops",
    "regex_cmd": "getRegex",
    "datetime_cmd": "getDateTime",
    "stamp_cmd": "timestamp_ops",
    "string_cmd": "randomString",
    "replace_cmd": "replace",

    # --- modularity ---
    "include_stmt": "include",
    "import_stmt": "import",

    # --- generic statements ---
    "assignment": "assignment",
    "call_stmt": "call",
    "return_stmt": "return",
    "if_stmt": "if",
    "loop_stmt": "startLoop",
    "try_stmt": "try",
    "function_decl": "function",
}
|
||||
|
||||
|
||||
def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
    """Return boolean flags for each known AVAP command present in *ast*.

    Args:
        ast: Parsed lark tree for an AVAP source file, or None when parsing
            failed upstream.

    Returns:
        Dict mapping each used command's display name (sorted) to True.
        Empty dict when ast is None.
    """
    if ast is None:
        return {}

    # Collect the display name of every grammar rule we track; rules not in
    # COMMAND_METADATA_NAMES are deliberately ignored.
    found = {
        COMMAND_METADATA_NAMES[node.data]
        for node in ast.iter_subtrees()
        if node.data in COMMAND_METADATA_NAMES
    }
    return dict.fromkeys(sorted(found), True)
|
||||
|
||||
|
||||
def _get_text(element) -> str:
|
||||
for attr in ("text", "content", "markdown"):
|
||||
|
|
@ -168,60 +232,70 @@ def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Pa
|
|||
return docs_path
|
||||
|
||||
|
||||
def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
    """
    Process documents by applying appropriate chefs and chunking strategies based on file type.

    Args:
        docs_path: List of Paths to the documents to be processed.

    Returns:
        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
    """
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
    md_chef = MarkdownChef(tokenizer=custom_tokenizer)
    txt_chef = TextChef()
    token_chunker = TokenChunker(tokenizer=custom_tokenizer)

    # Grammar is read once up front and reused for every .avap document.
    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
        avap_parser = Lark(grammar.read(), parser="lalr", propagate_positions=True, start="program")

    processed_docs: list[dict[str, Any]] = []
    for doc_path in docs_path:
        suffix = doc_path.suffix.lower()

        if suffix == ".md":
            cooked = md_chef.process(doc_path)
            chunked_doc = _merge_markdown_document(cooked).chunks
            extra = {
                "file_type": "avap_docs",
                "filename": doc_path.name,
            }
        elif suffix == ".avap":
            cooked = txt_chef.process(doc_path)

            # Best-effort parse: a syntactically invalid file is still chunked
            # and indexed, it just carries no command-usage metadata.
            try:
                ast = avap_parser.parse(cooked.content)
            except Exception as e:
                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
                ast = None

            chunked_doc = token_chunker.chunk(cooked.content)
            extra = {
                "file_type": "avap_code",
                "filename": doc_path.name,
                **_extract_command_metadata(ast),
            }
        else:
            # Unsupported extension — skip the file entirely.
            continue

        for chunk in chunked_doc:
            processed_docs.append(
                {
                    "chunk": chunk,
                    # Copy so each chunk gets an independent metadata dict.
                    "extra_metadata": {**extra},
                }
            )

    return processed_docs
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue