# Listing metadata: 245 lines, 7.8 KiB, Python
"""
Generator for BNF specification files from AVAP documentation.

This script extracts BNF specifications from the AVAP Language Reference Manual (LRM)
and generates individual text files for each BNF section.

Output format: n0X_BNF.txt (where X is the section number)
Default output directory: ingestion/code/BNF/
Default markdown source: docs/LRM/avap.md

USAGE EXAMPLES:

    Use default configuration:
        python scripts/pipelines/flows/bnf_files_generator.py

    Customize input and output paths:
        python scripts/pipelines/flows/bnf_files_generator.py --markdown docs/LRM/avap.md --output ingestion/code
        python scripts/pipelines/flows/bnf_files_generator.py -m docs/LRM/avap.md -o ingestion/code

OPTIONS:
    --markdown, -m: Path to the AVAP markdown file (relative to project root)
    --output, -o: Output directory for BNF files (relative to project root)
"""
|
|
|
|
import re
from pathlib import Path
from typing import List, Tuple, Optional

import typer
|
|
|
|
# Typer application object; CLI commands are registered on it via @app.command().
app = typer.Typer()
|
|
|
|
|
|
class BNFExtractor:
    """Extract BNF specifications from AVAP markdown documentation.

    The extractor looks for headers of the form
    ``### Especificación BNF (Sección <ROMAN>)`` and, for each one, the
    first fenced ``bnf`` code block that follows it *before the next such
    header*. Each grammar is written to ``n<NN>_BNF.txt`` in the output
    directory, where ``NN`` is the zero-padded section number.
    """

    # Header that introduces a BNF section; group 1 is the Roman numeral.
    _HEADER_RE = re.compile(r"### Especificación BNF \(Sección ([IVXLCDM]+)\)")

    # Fenced ``bnf`` code block; group 1 is the raw grammar text.
    # DOTALL lets ``.`` span the newlines inside the block.
    _CODE_BLOCK_RE = re.compile(r"```bnf\n(.*?)```", re.DOTALL)

    # Value of each Roman numeral symbol.
    _ROMAN_VALUES = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50,
        'C': 100, 'D': 500, 'M': 1000,
    }

    def __init__(self, markdown_file: Path, output_dir: Path):
        """
        Initialize BNF extractor.

        Args:
            markdown_file: Path to the AVAP markdown file
            output_dir: Directory where BNF files will be saved
        """
        self.markdown_file = markdown_file
        self.output_dir = output_dir
        # Filled by extract_bnf_sections(): (section_number, title, bnf_content).
        self.bnf_sections: List[Tuple[int, str, str]] = []

    @staticmethod
    def _roman_to_int(roman: str) -> int:
        """
        Convert Roman numerals to integers.

        The numeral is scanned right to left; a symbol smaller than the
        one to its right is subtracted (e.g. the ``I`` in ``IV``),
        otherwise it is added. Characters that are not Roman symbols
        count as 0.

        Args:
            roman: Roman numeral string (e.g., 'I', 'IV', 'IX', 'XII')

        Returns:
            Integer value of the Roman numeral
        """
        total = 0
        prev_value = 0

        for char in reversed(roman):
            value = BNFExtractor._ROMAN_VALUES.get(char, 0)
            if value < prev_value:
                total -= value
            else:
                total += value
            prev_value = value

        return total

    @staticmethod
    def _section_filename(section_number: int) -> str:
        """Return the output file name for a section (e.g. ``n01_BNF.txt``)."""
        return f"n{section_number:02d}_BNF.txt"

    def read_markdown_file(self) -> str:
        """Read the markdown file content (decoded as UTF-8)."""
        with open(self.markdown_file, "r", encoding="utf-8") as f:
            return f.read()

    def extract_bnf_sections(self, content: str) -> List[Tuple[int, str, str]]:
        """
        Extract all BNF specifications from markdown content.

        Pattern: ### Especificación BNF (Sección I)
                 ```bnf
                 ... BNF content ...
                 ```

        The code block is searched only between a header and the next BNF
        header (or the end of the content). A header without its own
        ``bnf`` block is skipped instead of picking up the grammar that
        belongs to a later section.

        The result is also stored in ``self.bnf_sections``.

        Args:
            content: Markdown file content

        Returns:
            List of tuples: (section_number, section_title, bnf_content)
        """
        bnf_sections = []
        headers = list(self._HEADER_RE.finditer(content))

        for index, header in enumerate(headers):
            roman_numeral = header.group(1)

            # Stop at the next BNF header so a section never claims the
            # code block of the section that follows it.
            if index + 1 < len(headers):
                search_end = headers[index + 1].start()
            else:
                search_end = len(content)

            code_match = self._CODE_BLOCK_RE.search(
                content, header.end(), search_end
            )
            if code_match is None:
                continue

            bnf_sections.append((
                self._roman_to_int(roman_numeral),
                f"Especificación BNF - Sección {roman_numeral}",
                code_match.group(1).strip(),
            ))

        self.bnf_sections = bnf_sections
        return bnf_sections

    def format_bnf_file_content(self, section_number: int, title: str, bnf_content: str) -> str:
        """
        Format BNF content for file output.

        Currently a pass-through: ``section_number`` and ``title`` are
        accepted but not used.

        Args:
            section_number: Section number (1-9, etc.)
            title: Section title
            bnf_content: Raw BNF grammar content

        Returns:
            BNF content without additional formatting
        """
        return bnf_content

    def save_bnf_files(self) -> int:
        """
        Save extracted BNF sections to individual files.

        File naming convention: n0X_BNF.txt (e.g., n01_BNF.txt, n02_BNF.txt, etc.)

        The output directory is created if missing. Sections are taken
        from ``self.bnf_sections``; two sections with the same number
        write to the same file, the later one overwriting the earlier.

        Returns:
            Number of files created
        """
        # Ensure output directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)

        files_created = 0

        for section_number, title, bnf_content in self.bnf_sections:
            filepath = self.output_dir / self._section_filename(section_number)

            formatted_content = self.format_bnf_file_content(
                section_number, title, bnf_content
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(formatted_content)

            print(f"Created: {filepath}")
            files_created += 1

        return files_created

    def generate(self) -> Tuple[int, List[str]]:
        """
        Execute the complete BNF extraction and file generation process.

        Reads the markdown file, extracts the BNF sections, writes one
        file per section and prints progress to stdout.

        Returns:
            Tuple of (number_of_files_created, list_of_file_paths)
        """
        print(f"Reading markdown file: {self.markdown_file}")
        content = self.read_markdown_file()

        print("Extracting BNF specifications...")
        bnf_sections = self.extract_bnf_sections(content)

        print(f"Found {len(bnf_sections)} BNF sections:")
        for _, title, _ in bnf_sections:
            print(f" - {title}")

        print(f"\nSaving BNF files to: {self.output_dir}")
        files_created = self.save_bnf_files()

        # Paths are rebuilt with the same naming helper used when saving.
        file_paths = [
            str(self.output_dir / self._section_filename(section_number))
            for section_number, _, _ in bnf_sections
        ]

        return files_created, file_paths
|
|
|
|
|
|
@app.command()
def main(
    markdown_file: str = typer.Option(
        "docs/LRM/avap.md",
        "--markdown",
        "-m",
        help="Path to AVAP markdown file (relative to project root)"
    ),
    output_dir: str = typer.Option(
        "ingestion/code/BNF/",
        "--output",
        "-o",
        help="Output directory for BNF files (relative to project root)"
    )
):
    """Extract BNF specifications from AVAP documentation.

    Default behavior:
    - Reads from: docs/LRM/avap.md
    - Writes to: ingestion/code/BNF/
    """
    # NOTE: the docstring above is kept verbatim because typer shows it as
    # the command's --help text.

    # Get project root directory (scripts/pipelines/flows -> project root).
    # Resolve first: a relative __file__ would otherwise make the root
    # depend on the current working directory.
    script_dir = Path(__file__).resolve().parent
    project_root = script_dir.parent.parent.parent

    # Convert relative paths to absolute
    markdown_path = project_root / markdown_file
    output_path = project_root / output_dir

    # Verify the markdown source is an existing regular file; a directory
    # would pass a plain exists() check and then fail when opened.
    if not markdown_path.is_file():
        typer.echo(f"Error: Markdown file not found: {markdown_path}", err=True)
        raise typer.Exit(code=1)

    # Create extractor and generate files
    extractor = BNFExtractor(markdown_path, output_path)
    files_created, file_paths = extractor.generate()

    print(f"\n{'='*80}")
    print("BNF extraction complete!")
    print(f"Total files created: {files_created}")
    print(f"Output directory: {output_path}")
    print(f"{'='*80}")
|
|
|
|
|
|
# Run the typer CLI only when the file is executed as a script.
if __name__ == "__main__":
    app()
|