Add chunk.py for processing and replacing JavaScript references with Avap

- Implemented `replace_javascript_with_avap` function to handle text replacement.
- Created `read_concat_files` function to read and concatenate files with a specified prefix, replacing JavaScript markers.
- Added functionality to read files from a specified directory and process their contents.
2026-03-09 13:21:18 +01:00 · 2026-03-09 13:21:18 +01:00 · 6d856ba691
parent c8da317dd8
commit 6d856ba691
3 changed files with 166 additions and 190 deletions
--- a/scratches/acano/generate_synthethic_data.ipynb
+++ b/scratches/acano/generate_synthethic_data.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "id": "b657efd2",
   "metadata": {},
   "outputs": [],
@ -15,7 +15,7 @@
    "from langchain_core.messages import SystemMessage, HumanMessage\n",
    "\n",
    "from src.utils.llm_factory import create_chat_model\n",
-    "from src.config import RAW_DIR, INTERIM_DIR"
+    "from src.config import RAW_DIR, INTERIM_DIR, EXTERNAL_DIR"
   ]
  },
  {
@ -111,7 +111,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "with open(RAW_DIR / \"avap.txt\", \"r\") as f:\n",
+    "with open(RAW_DIR / \"combined.txt\", \"r\") as f:\n",
    "    avap_docs = f.read()"
   ]
  },
@ -242,7 +242,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "with open(INTERIM_DIR /'synthetic_datasets/synthetic_data.json', 'w') as f:\n",
+    "with open(INTERIM_DIR /'synthethic_datasets/synthethic_data.json', 'w') as f:\n",
    "    json.dump(synthetic_data, f)"
   ]
  },
@ -283,9 +283,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "with open(INTERIM_DIR /'synthetic_datasets/synthetic_data_no_mbpp.json', 'w') as f:\n",
+    "with open(INTERIM_DIR /'synthethic_datasets/synthethic_data_no_mbpp.json', 'w') as f:\n",
    "    json.dump(synthetic_data, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3deb316",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
--- a/scratches/acano/langgraph_agent_simple.ipynb
+++ b/scratches/acano/langgraph_agent_simple.ipynb
--- a/scripts/pipelines/tasks/chunk.py
+++ b/scripts/pipelines/tasks/chunk.py
@ -0,0 +1,61 @@
 import os
 import re
 def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Rewrites fenced code-block language tags (```javascript and ```js) and
    prose references to javascript. The word "javascript" is left untouched
    when it is directly followed by whitespace and the word "file".

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Fence tag: ```javascript -> ```avap
    text = text.replace("```javascript", "```avap")
    # Fence tag: ```js -> ```avap. The trailing word boundary keeps longer
    # tags that merely start with "js" (```json, ```jsx, ```jsonc) intact;
    # a plain prefix replacement would turn ```json into ```avapon.
    text = re.sub(r"```js\b", "```avap", text)
    # Common phrases (case-insensitive); these also collapse any run of
    # whitespace between the two words into a single space.
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )
    # Every remaining standalone "javascript", except in "javascript file"
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
    return text
 def read_concat_files(folder_path: str, file_prefix: str, concatenate: bool = True) -> str | list[str]:
    """
    Read and concatenate all files in a folder whose names start with a given prefix.
    Replaces javascript language markers with avap.
    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
        concatenate: Whether to concatenate the contents of the files.
    Returns:
        A single string with the concatenated contents of all matching files,
        with javascript markers replaced with avap, or a list of strings if concatenate is False.
    """
    contents = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.startswith(file_prefix):
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                    if content.strip():
                        print(f"Reading file: {filename}")  # Skip empty files
                        contents.append(content)
    if concatenate:
        concatenated = "\n".join(contents)
        return replace_javascript_with_avap(concatenated)
    else:
        return [replace_javascript_with_avap(content) for content in contents]