""" Generator for BNF specification files from AVAP documentation. This script extracts BNF specifications from the AVAP Language Reference Manual (LRM) and generates individual text files for each BNF section. Output format: n0X_BNF.txt (where X is the section number) Default output directory: ingestion/code/BNF/ Default markdown source: docs/LRM/avap.md USAGE EXAMPLES: Use default configuration: python scripts/pipelines/flows/bnf_files_generator.py Customize input and output paths: python scripts/pipelines/flows/bnf_files_generator.py --markdown docs/LRM/avap.md --output ingestion/code python scripts/pipelines/flows/bnf_files_generator.py -m docs/LRM/avap.md -o ingestion/code OPTIONS: --markdown, -m: Path to the AVAP markdown file (relative to project root) --output, -o: Output directory for BNF files (relative to project root) """ import re import typer from pathlib import Path from typing import List, Tuple, Optional app = typer.Typer() class BNFExtractor: """Extract BNF specifications from AVAP markdown documentation.""" def __init__(self, markdown_file: Path, output_dir: Path): """ Initialize BNF extractor. Args: markdown_file: Path to the AVAP markdown file output_dir: Directory where BNF files will be saved """ self.markdown_file = markdown_file self.output_dir = output_dir self.bnf_sections: List[Tuple[int, str, str]] = [] @staticmethod def _roman_to_int(roman: str) -> int: """ Convert Roman numerals to integers. Args: roman: Roman numeral string (e.g., 'I', 'IV', 'IX', 'XII') Returns: Integer value of the Roman numeral """ roman_values = { 'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000 } total = 0 prev_value = 0 for char in reversed(roman): value = roman_values.get(char, 0) if value < prev_value: total -= value else: total += value prev_value = value return total def read_markdown_file(self) -> str: """Read the markdown file content.""" with open(self.markdown_file, "r", encoding="utf-8") as f: return f.read() def extract_bnf_sections(self, content: str) -> List[Tuple[int, str, str]]: """ Extract all BNF specifications from markdown content. Pattern: ### Especificación BNF (Sección I) ```bnf ... BNF content ... ``` Args: content: Markdown file content Returns: List of tuples: (section_number, section_title, bnf_content) """ bnf_sections = [] # Pattern to find BNF specification headers and extract Roman numerals # Matches: ### Especificación BNF (Sección I), (Sección II), etc. header_pattern = r"### Especificación BNF \(Sección ([IVXLCDM]+)\)" # Find all BNF headers with their positions for match in re.finditer(header_pattern, content): roman_numeral = match.group(1) section_number = self._roman_to_int(roman_numeral) header_start = match.start() header_end = match.end() # Find the code block after this header code_block_pattern = r"```bnf\n(.*?)```" search_start = header_end code_match = re.search(code_block_pattern, content[search_start:], re.DOTALL) if code_match: bnf_content = code_match.group(1).strip() section_title = f"Especificación BNF - Sección {roman_numeral}" bnf_sections.append((section_number, section_title, bnf_content)) self.bnf_sections = bnf_sections return bnf_sections def format_bnf_file_content(self, section_number: int, title: str, bnf_content: str) -> str: """ Format BNF content for file output. Args: section_number: Section number (1-9, etc.) title: Section title bnf_content: Raw BNF grammar content Returns: BNF content without additional formatting """ return bnf_content def save_bnf_files(self) -> int: """ Save extracted BNF sections to individual files. File naming convention: n0X_BNF.txt (e.g., n01_BNF.txt, n02_BNF.txt, etc.) Returns: Number of files created """ # Ensure output directory exists self.output_dir.mkdir(parents=True, exist_ok=True) files_created = 0 for section_number, title, bnf_content in self.bnf_sections: # Format filename with zero-padded section number filename = f"n{section_number:02d}_BNF.txt" filepath = self.output_dir / filename # Format and write file content formatted_content = self.format_bnf_file_content( section_number, title, bnf_content ) with open(filepath, "w", encoding="utf-8") as f: f.write(formatted_content) print(f"Created: {filepath}") files_created += 1 return files_created def generate(self) -> Tuple[int, List[str]]: """ Execute the complete BNF extraction and file generation process. Returns: Tuple of (number_of_files_created, list_of_file_paths) """ print(f"Reading markdown file: {self.markdown_file}") content = self.read_markdown_file() print(f"Extracting BNF specifications...") bnf_sections = self.extract_bnf_sections(content) print(f"Found {len(bnf_sections)} BNF sections:") for section_number, title, _ in bnf_sections: print(f" - {title}") print(f"\nSaving BNF files to: {self.output_dir}") files_created = self.save_bnf_files() # Generate list of created file paths file_paths = [ str(self.output_dir / f"n{i:02d}_BNF.txt") for i, _, _ in bnf_sections ] return files_created, file_paths @app.command() def main( markdown_file: str = typer.Option( "docs/LRM/avap.md", "--markdown", "-m", help="Path to AVAP markdown file (relative to project root)" ), output_dir: str = typer.Option( "ingestion/code/BNF/", "--output", "-o", help="Output directory for BNF files (relative to project root)" ) ): """Extract BNF specifications from AVAP documentation. Default behavior: - Reads from: docs/LRM/avap.md - Writes to: ingestion/code/BNF/ """ # Get project root directory (scripts/pipelines/flows -> project root) script_dir = Path(__file__).parent project_root = script_dir.parent.parent.parent # Convert relative paths to absolute markdown_path = project_root / markdown_file output_path = project_root / output_dir # Verify markdown file exists if not markdown_path.exists(): typer.echo(f"Error: Markdown file not found: {markdown_path}", err=True) raise typer.Exit(code=1) # Create extractor and generate files extractor = BNFExtractor(markdown_path, output_path) files_created, file_paths = extractor.generate() print(f"\n{'='*80}") print(f"BNF extraction complete!") print(f"Total files created: {files_created}") print(f"Output directory: {output_path}") print(f"{'='*80}") if __name__ == "__main__": app()