Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev

This commit is contained in:
pseco 2026-03-09 15:04:23 +01:00
commit f6bfba5561
3 changed files with 166 additions and 190 deletions

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"id": "b657efd2", "id": "b657efd2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -15,7 +15,7 @@
"from langchain_core.messages import SystemMessage, HumanMessage\n", "from langchain_core.messages import SystemMessage, HumanMessage\n",
"\n", "\n",
"from src.utils.llm_factory import create_chat_model\n", "from src.utils.llm_factory import create_chat_model\n",
"from src.config import RAW_DIR, INTERIM_DIR" "from src.config import RAW_DIR, INTERIM_DIR, EXTERNAL_DIR"
] ]
}, },
{ {
@ -111,7 +111,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"with open(RAW_DIR / \"avap.txt\", \"r\") as f:\n", "with open(RAW_DIR / \"combined.txt\", \"r\") as f:\n",
" avap_docs = f.read()" " avap_docs = f.read()"
] ]
}, },
@ -242,7 +242,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"with open(INTERIM_DIR /'synthetic_datasets/synthetic_data.json', 'w') as f:\n", "with open(INTERIM_DIR /'synthethic_datasets/synthethic_data.json', 'w') as f:\n",
" json.dump(synthetic_data, f)" " json.dump(synthetic_data, f)"
] ]
}, },
@ -283,9 +283,17 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"with open(INTERIM_DIR /'synthetic_datasets/synthetic_data_no_mbpp.json', 'w') as f:\n", "with open(INTERIM_DIR /'synthethic_datasets/synthethic_data_no_mbpp.json', 'w') as f:\n",
" json.dump(synthetic_data, f)" " json.dump(synthetic_data, f)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3deb316",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,61 @@
import os
import re
def replace_javascript_with_avap(text: str) -> str:
    """Replace mentions of the JavaScript language with avap in the text.

    Handles Markdown code-fence language tags and prose references.

    Args:
        text: The text to process.

    Returns:
        The text with JavaScript references replaced with avap. Mentions
        immediately followed by the word "file" (e.g. "javascript file") are
        left untouched, as are fence tags that merely start with "js"
        (such as "json" or "jsx").
    """
    # Fence tag spelled out in full.
    text = text.replace("```javascript", "```avap")
    # Short fence tag. The negative lookahead requires the tag to end right
    # after "js"; a plain prefix replace would turn ```json into ```avapon.
    text = re.sub(r"```js(?!\w)", "```avap", text)
    # Common phrases (case-insensitive); these also collapse the whitespace
    # between the two words to a single space.
    text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
    text = re.sub(
        r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
    )
    # Any remaining standalone mention, except "javascript file".
    text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
    return text
def read_concat_files(folder_path: str, file_prefix: str, concatenate: bool = True) -> str | list[str]:
    """
    Collect the contents of every regular file in a folder whose name begins
    with a given prefix, rewriting javascript language markers to avap.

    Files are visited in sorted name order. Files that are empty or contain
    only whitespace are ignored.

    Args:
        folder_path: Path to the folder to search in.
        file_prefix: The prefix that file names must start with.
        concatenate: When True, join all contents with newlines into one
            string; when False, keep one string per file.

    Returns:
        A single newline-joined string of all matching files with javascript
        markers replaced with avap, or a list of per-file strings (each with
        the same replacement applied) if concatenate is False.
    """
    collected = []
    for name in sorted(os.listdir(folder_path)):
        if not name.startswith(file_prefix):
            continue
        candidate = os.path.join(folder_path, name)
        if not os.path.isfile(candidate):
            # Directories (and anything else that is not a file) are ignored.
            continue
        with open(candidate, "r", encoding="utf-8") as handle:
            body = handle.read()
        if not body.strip():
            # Empty or whitespace-only files contribute nothing.
            continue
        print(f"Reading file: {name}")
        collected.append(body)

    if not concatenate:
        return [replace_javascript_with_avap(body) for body in collected]
    return replace_javascript_with_avap("\n".join(collected))