diff --git a/scripts/pipelines/tasks/chunk.py b/scripts/pipelines/tasks/chunk.py
index 06af90a..eaef395 100644
--- a/scripts/pipelines/tasks/chunk.py
+++ b/scripts/pipelines/tasks/chunk.py
@@ -4,7 +4,7 @@
 from dataclasses import replace
 from pathlib import Path
 from typing import Any, Union
-from lark import Lark
+from lark import Lark, Tree
 from chonkie import (
     Chunk,
     ElasticHandshake,
@@ -21,6 +21,70 @@
 from transformers import AutoTokenizer
 from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
 from src.config import settings
+COMMAND_METADATA_NAMES = {
+    # system
+    "register_cmd": "registerEndpoint",
+    "addvar_cmd": "addVar",
+    "addparam_cmd": "addParam",
+    "getlistlen_cmd": "getListLen",
+    "getparamlist_cmd": "getQueryParamList",
+    "addresult_cmd": "addResult",
+
+    # async
+    "go_stmt": "go",
+    "gather_stmt": "gather",
+
+    # connector
+    "connector_instantiation": "avapConnector",
+
+    # http
+    "req_post_cmd": "RequestPost",
+    "req_get_cmd": "RequestGet",
+
+    # db
+    "orm_direct": "ormDirect",
+    "orm_check": "ormCheckTable",
+    "orm_create": "ormCreateTable",
+    "orm_select": "ormAccessSelect",
+    "orm_insert": "ormAccessInsert",
+    "orm_update": "ormAccessUpdate",
+
+    # util
+    "json_list_cmd": "json_list_ops",
+    "crypto_cmd": "crypto_ops",
+    "regex_cmd": "getRegex",
+    "datetime_cmd": "getDateTime",
+    "stamp_cmd": "timestamp_ops",
+    "string_cmd": "randomString",
+    "replace_cmd": "replace",
+
+    # modularity
+    "include_stmt": "include",
+    "import_stmt": "import",
+
+    # generic statements
+    "assignment": "assignment",
+    "call_stmt": "call",
+    "return_stmt": "return",
+    "if_stmt": "if",
+    "loop_stmt": "startLoop",
+    "try_stmt": "try",
+    "function_decl": "function",
+}
+
+
+def _extract_command_metadata(ast: Tree | None) -> dict[str, bool]:
+    if ast is None:
+        return {}
+
+    used_commands: set[str] = set()
+
+    for subtree in ast.iter_subtrees():
+        if subtree.data in COMMAND_METADATA_NAMES:
+            used_commands.add(COMMAND_METADATA_NAMES[subtree.data])
+
+    return {command_name: True for command_name in sorted(used_commands)}
+
 
 def _get_text(element) -> str:
     for attr in ("text", "content", "markdown"):
@@ -168,52 +232,66 @@ def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Pa
     return docs_path
 
 
-def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
+def process_documents(docs_path: list[Path]) -> list[dict[str, Any]]:
     """
     Process documents by applying appropriate chefs and chunking strategies based on file type.
 
     Args:
-        docs_path (list[Path]): List of Paths to the documents to be processed
+        docs_path: List of Paths to the documents to be processed.
 
     Returns:
-        List of dicts with "chunk" (Chunk object) and "metadata" (dict with file info)
+        List of dicts with "chunk" (Chunk object) and "extra_metadata" (dict with file info).
     """
     processed_docs = []
-    specific_metadata = {}
     custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
 
     chef_md = MarkdownChef(tokenizer=custom_tokenizer)
     chef_txt = TextChef()
     chunker = TokenChunker(tokenizer=custom_tokenizer)
-    with open(settings.proj_root / "docs/BNF/avap.lark") as grammar:
-        lark_parser = Lark(grammar=grammar, parser="lalr", propagate_positions=True, start="program")
+
+    with open(settings.proj_root / "research/code_indexing/BNF/avap.lark", encoding="utf-8") as grammar:
+        lark_parser = Lark(
+            grammar.read(),
+            parser="lalr",
+            propagate_positions=True,
+            start="program",
+        )
 
     for doc_path in docs_path:
         doc_extension = doc_path.suffix.lower()
 
         if doc_extension == ".md":
             processed_doc = chef_md.process(doc_path)
             fused_doc = _merge_markdown_document(processed_doc)
             chunked_doc = fused_doc.chunks
             specific_metadata = {
                 "file_type": "avap_docs",
-                "filename": doc_path.name
+                "filename": doc_path.name,
             }
         elif doc_extension == ".avap":
             processed_doc = chef_txt.process(doc_path)
-            chunked_doc = chunker.chunk(processed_doc.content)
+            try:
+                ast = lark_parser.parse(processed_doc.content)
+            except Exception as e:
+                logger.error(f"Error parsing AVAP code in {doc_path.name}: {e}")
+                ast = None
+
+            chunked_doc = chunker.chunk(processed_doc.content)
+
             specific_metadata = {
                 "file_type": "avap_code",
                 "filename": doc_path.name,
-                "AST": str(ast)
+                **_extract_command_metadata(ast),
            }
         else:
            continue
 
-        for chunk in chunked_doc:
-            processed_docs.append({
-                "chunk": chunk,
-                "extra_metadata": {**specific_metadata}
-            })
+        for chunk in chunked_doc:
+            processed_docs.append(
+                {
+                    "chunk": chunk,
+                    "extra_metadata": {**specific_metadata},
+                }
+            )
 
     return processed_docs