added ast tree metadata

acano 2026-03-25 10:36:18 +01:00
parent dc8230c872
commit fe90548b8b
1 changed file with 90 additions and 16 deletions


@@ -4,7 +4,7 @@ from dataclasses import replace
 from pathlib import Path
 from typing import Any, Union
-from lark import Lark
+from lark import Lark, Tree
 from chonkie import (
     Chunk,
     ElasticHandshake,
@@ -21,6 +21,70 @@ from transformers import AutoTokenizer
 from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
 from src.config import settings
+
+COMMAND_METADATA_NAMES = {
+    # system
+    "register_cmd": "registerEndpoint",
+    "addvar_cmd": "addVar",
+    "addparam_cmd": "addParam",
+    "getlistlen_cmd": "getListLen",
+    "getparamlist_cmd": "getQueryParamList",
+    "addresult_cmd": "addResult",
+    # async
+    "go_stmt": "go",
+    "gather_stmt": "gather",
+    # connector
+    "connector_instantiation": "avapConnector",
+    # http
+    "req_post_cmd": "RequestPost",
+    "req_get_cmd": "RequestGet",
+    # db
+    "orm_direct": "ormDirect",
+    "orm_check": "ormCheckTable",
+    "orm_create": "ormCreateTable",
+    "orm_select": "ormAccessSelect",
+    "orm_insert": "ormAccessInsert",
+    "orm_update": "ormAccessUpdate",
+    # util
+    "json_list_cmd": "json_list_ops",
+    "crypto_cmd": "crypto_ops",
+    "regex_cmd": "getRegex",
+    "datetime_cmd": "getDateTime",
+    "stamp_cmd": "timestamp_ops",
+    "string_cmd": "randomString",
+    "replace_cmd": "replace",
+    # modularity
+    "include_stmt": "include",
+    "import_stmt": "import",
+    # generic statements
+    "assignment": "assignment",
+    "call_stmt": "call",
+    "return_stmt": "return",
+    "if_stmt": "if",
+    "loop_stmt": "startLoop",
+    "try_stmt": "try",
+    "function_decl": "function",
+}
+
+
+def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
+    if ast is None:
+        return {}
+    used_commands: set[str] = set()
+    for subtree in ast.iter_subtrees():
+        if subtree.data in COMMAND_METADATA_NAMES:
+            used_commands.add(COMMAND_METADATA_NAMES[subtree.data])
+    return {command_name: True for command_name in sorted(used_commands)}
+
+
 def _get_text(element) -> str:
     for attr in ("text", "content", "markdown"):
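As a quick sanity check of the new helper, here is a minimal sketch of what _extract_command_metadata returns for a hand-built lark.Tree. The rule names come from the COMMAND_METADATA_NAMES map above; the toy tree itself is invented for illustration and is not part of this commit:

    from lark import Tree

    # Toy AST: one addVar command, one assignment, and a node whose rule
    # name is not in COMMAND_METADATA_NAMES (it is simply skipped).
    toy_ast = Tree("program", [
        Tree("addvar_cmd", []),
        Tree("assignment", []),
        Tree("some_other_rule", []),
    ])

    print(_extract_command_metadata(toy_ast))  # {'addVar': True, 'assignment': True}
    print(_extract_command_metadata(None))     # {}

Tree.iter_subtrees() visits every Tree node, including the root, so commands are detected at any nesting depth, and the sorted comprehension keeps the emitted metadata keys deterministic.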
@@ -168,60 +232,70 @@ def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
     return docs_path
 
 
-def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
+def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
     """
     Process documents by applying appropriate chefs and chunking strategies based on file type.
 
     Args:
-        docs_path (list[Path]): List of Paths to the documents to be processed
+        docs_path: List of Paths to the documents to be processed.
 
     Returns:
-        List of dicts with "chunk" (Chunk object) and "metadata" (dict with file info)
+        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
     """
     processed_docs = []
     specific_metadata = {}
     custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
     chef_md = MarkdownChef(tokenizer=custom_tokenizer)
     chef_txt = TextChef()
     chunker = TokenChunker(tokenizer=custom_tokenizer)
-    with open(settings.proj_root / "docs/BNF/avap.lark") as grammar:
-        lark_parser = Lark(grammar=grammar, parser="lalr", propagate_positions=True, start="program")
+    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
+        lark_parser = Lark(
+            grammar.read(),
+            parser="lalr",
+            propagate_positions=True,
+            start="program",
+        )
     for doc_path in docs_path:
         doc_extension = doc_path.suffix.lower()
         if doc_extension == ".md":
             processed_doc = chef_md.process(doc_path)
             fused_doc = _merge_markdown_document(processed_doc)
             chunked_doc = fused_doc.chunks
             specific_metadata = {
                 "file_type": "avap_docs",
-                "filename": doc_path.name
+                "filename": doc_path.name,
             }
         elif doc_extension == ".avap":
             processed_doc = chef_txt.process(doc_path)
-            ast = lark_parser.parse(processed_doc.content)
-            chunked_doc = chunker.chunk(processed_doc.content)
+            try:
+                ast = lark_parser.parse(processed_doc.content)
+            except Exception as e:
+                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
+                ast = None
+            chunked_doc = chunker.chunk(processed_doc.content)
             specific_metadata = {
                 "file_type": "avap_code",
                 "filename": doc_path.name,
-                "AST": str(ast)
+                **_extract_command_metadata(ast),
             }
         else:
             continue
-        for chunk in chunked_doc:
-            processed_docs.append({
-                "chunk": chunk,
-                "extra_metadata": {**specific_metadata}
-            })
+        for chunk in chunked_doc:
+            processed_docs.append(
+                {
+                    "chunk": chunk,
+                    "extra_metadata": {**specific_metadata},
+                }
+            )
     return processed_docs
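Net effect: instead of storing the whole parse tree stringified under a single "AST" key, each chunk's extra_metadata now carries one boolean flag per AVAP command actually used in the source file. A minimal sketch of how those flags could be consumed downstream (fetch_documents and process_documents are the functions from this file, while the corpus path and the HTTP filter are hypothetical):

    processed = process_documents(fetch_documents("path/to/corpus", [".md", ".avap"]))

    # e.g. keep only chunks that came from AVAP files using HTTP commands
    http_chunks = [
        doc for doc in processed
        if doc["extra_metadata"].get("RequestPost") or doc["extra_metadata"].get("RequestGet")
    ]

Flat boolean fields like these can be filtered on directly by most vector stores, which is not practical with the stringified tree the previous version stored.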