Add chunk.py for processing and replacing JavaScript references with Avap
- Implemented `replace_javascript_with_avap` function to handle text replacement.
- Created `read_concat_files` function to read and concatenate files with a specified prefix, replacing JavaScript markers.
- Added functionality to read files from a specified directory and process their contents.
This commit is contained in:
parent
c8da317dd8
commit
6d856ba691
|
|
@ -2,7 +2,7 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"id": "b657efd2",
|
"id": "b657efd2",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
|
@ -15,7 +15,7 @@
|
||||||
"from langchain_core.messages import SystemMessage, HumanMessage\n",
|
"from langchain_core.messages import SystemMessage, HumanMessage\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from src.utils.llm_factory import create_chat_model\n",
|
"from src.utils.llm_factory import create_chat_model\n",
|
||||||
"from src.config import RAW_DIR, INTERIM_DIR"
|
"from src.config import RAW_DIR, INTERIM_DIR, EXTERNAL_DIR"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -111,7 +111,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"with open(RAW_DIR / \"avap.txt\", \"r\") as f:\n",
|
"with open(RAW_DIR / \"combined.txt\", \"r\") as f:\n",
|
||||||
" avap_docs = f.read()"
|
" avap_docs = f.read()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -242,7 +242,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"with open(INTERIM_DIR /'synthetic_datasets/synthetic_data.json', 'w') as f:\n",
|
"with open(INTERIM_DIR /'synthethic_datasets/synthethic_data.json', 'w') as f:\n",
|
||||||
" json.dump(synthetic_data, f)"
|
" json.dump(synthetic_data, f)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -283,9 +283,17 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"with open(INTERIM_DIR /'synthetic_datasets/synthetic_data_no_mbpp.json', 'w') as f:\n",
|
"with open(INTERIM_DIR /'synthethic_datasets/synthethic_data_no_mbpp.json', 'w') as f:\n",
|
||||||
" json.dump(synthetic_data, f)"
|
" json.dump(synthetic_data, f)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c3deb316",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,61 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def replace_javascript_with_avap(text: str) -> str:
    """
    Replace mentions of the javascript language with avap in the text.

    Handles fenced code-block language identifiers and prose references.

    Args:
        text: The text to process.

    Returns:
        The text with javascript references replaced with avap.
    """
    # Retag ```javascript fences as ```avap.
    text = text.replace("```javascript", "```avap")

    # Retag ```js fences as ```avap. The word boundary keeps other fence tags
    # that merely start with "js" (```json, ```jsx, ```jsonc) intact; a plain
    # substring replace would turn ```json into ```avapon.
    text = re.sub(r"```js\b", "```avap", text)

    # Replace common phrases (case-insensitive). These two substitutions also
    # collapse whatever whitespace separates the words into a single space.
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )

    # Replace every remaining standalone "javascript", except where it is
    # followed by whitespace and "file" (the negative lookahead leaves
    # phrases such as "javascript file" untouched).
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)

    return text
|
||||||
|
|
||||||
|
|
||||||
|
def read_concat_files(folder_path: str, file_prefix: str, concatenate: bool = True) -> str | list[str]:
    """
    Collect the text of every file in a folder whose name starts with a prefix.

    Directory entries are visited in sorted name order. Entries that are not
    regular files, and files whose content is empty or whitespace-only, are
    left out. Each file that is kept is announced on stdout. Javascript
    language markers in the collected text are replaced with avap.

    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
        concatenate: Whether to join the file contents into one string.

    Returns:
        When concatenate is True, a single string holding the contents of all
        matching files joined with newlines, with javascript markers replaced
        with avap. Otherwise a list with one processed string per file.
    """
    texts = []
    for name in sorted(os.listdir(folder_path)):
        if not name.startswith(file_prefix):
            continue

        full_path = os.path.join(folder_path, name)
        if not os.path.isfile(full_path):
            continue

        with open(full_path, "r", encoding="utf-8") as handle:
            raw = handle.read()

        # Skip empty (or whitespace-only) files.
        if not raw.strip():
            continue

        print(f"Reading file: {name}")
        texts.append(raw)

    if not concatenate:
        return [replace_javascript_with_avap(raw) for raw in texts]

    return replace_javascript_with_avap("\n".join(texts))
|
||||||
Loading…
Reference in New Issue