# assistance-engine/scripts/pipelines/flows/bnf_files_generator.py

"""
Generator for BNF specification files from AVAP documentation.

This script extracts BNF specifications from the AVAP Language Reference Manual (LRM)
and generates individual text files for each BNF section.

Output format: nXX_BNF.txt (where XX is the zero-padded, two-digit section number, e.g. n01_BNF.txt)
Default output directory: ingestion/code/BNF/
Default markdown source: docs/LRM/avap.md

USAGE EXAMPLES:

Use default configuration:
    python scripts/pipelines/flows/bnf_files_generator.py

Customize input and output paths:
    python scripts/pipelines/flows/bnf_files_generator.py --markdown docs/LRM/avap.md --output ingestion/code
    python scripts/pipelines/flows/bnf_files_generator.py -m docs/LRM/avap.md -o ingestion/code

OPTIONS:
    --markdown, -m: Path to the AVAP markdown file (relative to project root)
    --output, -o:   Output directory for BNF files (relative to project root)
"""

import re
import typer
from pathlib import Path
from typing import List, Tuple, Optional

# Typer application object; CLI commands are registered on it with @app.command().
app = typer.Typer()


class BNFExtractor:
    """Extract BNF specifications from AVAP markdown documentation.

    The extractor looks for headers of the form
    ``### Especificación BNF (Sección <ROMAN>)`` and, for each one, the first
    fenced ```` ```bnf ```` code block that appears before the next such
    header. Each grammar found is written to its own ``nXX_BNF.txt`` file.
    """

    # Header announcing a BNF section; group 1 is the Roman numeral.
    _HEADER_RE = re.compile(r"### Especificación BNF \(Sección ([IVXLCDM]+)\)")

    # Fenced ```bnf block; group 1 is the grammar text (may span lines).
    _CODE_BLOCK_RE = re.compile(r"```bnf\n(.*?)```", re.DOTALL)

    # Value of each Roman numeral symbol.
    _ROMAN_VALUES = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50,
        'C': 100, 'D': 500, 'M': 1000,
    }

    def __init__(self, markdown_file: Path, output_dir: Path):
        """
        Initialize BNF extractor.

        Args:
            markdown_file: Path to the AVAP markdown file
            output_dir: Directory where BNF files will be saved
        """
        self.markdown_file = markdown_file
        self.output_dir = output_dir
        # Filled by extract_bnf_sections(): (section_number, title, bnf_content)
        self.bnf_sections: List[Tuple[int, str, str]] = []

    @staticmethod
    def _roman_to_int(roman: str) -> int:
        """
        Convert Roman numerals to integers.

        The string is scanned right to left; a symbol smaller than the one to
        its right is subtracted (e.g. the ``I`` in ``IV``), otherwise added.
        Characters that are not Roman numeral symbols count as 0.

        Args:
            roman: Roman numeral string (e.g., 'I', 'IV', 'IX', 'XII')

        Returns:
            Integer value of the Roman numeral
        """
        total = 0
        prev_value = 0

        for char in reversed(roman):
            value = BNFExtractor._ROMAN_VALUES.get(char, 0)
            if value < prev_value:
                total -= value
            else:
                total += value
            prev_value = value

        return total

    @staticmethod
    def _section_filename(section_number: int) -> str:
        """Return the output file name for a section (e.g. ``n01_BNF.txt``)."""
        return f"n{section_number:02d}_BNF.txt"

    def read_markdown_file(self) -> str:
        """Read the markdown file content (decoded as UTF-8)."""
        with open(self.markdown_file, "r", encoding="utf-8") as f:
            return f.read()

    def extract_bnf_sections(self, content: str) -> List[Tuple[int, str, str]]:
        """
        Extract all BNF specifications from markdown content.

        Pattern: ### Especificación BNF (Sección I)
                 ```bnf
                 ... BNF content ...
                 ```

        The code block for a header is only looked for between that header and
        the next BNF header (or the end of the document). A header without its
        own ```bnf block is skipped instead of borrowing the grammar of a
        later section.

        Args:
            content: Markdown file content

        Returns:
            List of tuples: (section_number, section_title, bnf_content).
            The same list is also stored in ``self.bnf_sections``.
        """
        bnf_sections: List[Tuple[int, str, str]] = []
        headers = list(self._HEADER_RE.finditer(content))

        for index, header in enumerate(headers):
            # The section ends where the next BNF header begins.
            if index + 1 < len(headers):
                region_end = headers[index + 1].start()
            else:
                region_end = len(content)

            # pos/endpos restrict the search without copying the document.
            code_match = self._CODE_BLOCK_RE.search(
                content, header.end(), region_end
            )
            if code_match is None:
                continue

            roman_numeral = header.group(1)
            section_number = self._roman_to_int(roman_numeral)
            section_title = f"Especificación BNF - Sección {roman_numeral}"
            bnf_content = code_match.group(1).strip()
            bnf_sections.append((section_number, section_title, bnf_content))

        self.bnf_sections = bnf_sections
        return bnf_sections

    def format_bnf_file_content(self, section_number: int, title: str, bnf_content: str) -> str:
        """
        Format BNF content for file output.

        Currently a pass-through: ``section_number`` and ``title`` are accepted
        but not used, and the grammar text is returned unchanged.

        Args:
            section_number: Section number (1-9, etc.)
            title: Section title
            bnf_content: Raw BNF grammar content

        Returns:
            BNF content without additional formatting
        """
        return bnf_content

    def save_bnf_files(self) -> int:
        """
        Save extracted BNF sections to individual files.

        Writes one UTF-8 file per entry in ``self.bnf_sections`` into
        ``self.output_dir``, creating the directory if needed.

        File naming convention: nXX_BNF.txt (e.g., n01_BNF.txt, n02_BNF.txt, etc.)

        Returns:
            Number of files created
        """
        # Ensure output directory exists
        self.output_dir.mkdir(parents=True, exist_ok=True)

        files_created = 0

        for section_number, title, bnf_content in self.bnf_sections:
            filepath = self.output_dir / self._section_filename(section_number)

            formatted_content = self.format_bnf_file_content(
                section_number, title, bnf_content
            )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(formatted_content)

            print(f"Created: {filepath}")
            files_created += 1

        return files_created

    def generate(self) -> Tuple[int, List[str]]:
        """
        Execute the complete BNF extraction and file generation process.

        Reads the markdown file, extracts the BNF sections, writes them to the
        output directory and prints progress messages along the way.

        Returns:
            Tuple of (number_of_files_created, list_of_file_paths)
        """
        print(f"Reading markdown file: {self.markdown_file}")
        content = self.read_markdown_file()

        print("Extracting BNF specifications...")
        bnf_sections = self.extract_bnf_sections(content)

        print(f"Found {len(bnf_sections)} BNF sections:")
        for _, title, _ in bnf_sections:
            print(f"  - {title}")

        print(f"\nSaving BNF files to: {self.output_dir}")
        files_created = self.save_bnf_files()

        # Same naming helper as save_bnf_files(), so the paths match the files.
        file_paths = [
            str(self.output_dir / self._section_filename(section_number))
            for section_number, _, _ in bnf_sections
        ]

        return files_created, file_paths


@app.command()
def main(
    markdown_file: str = typer.Option(
        "docs/LRM/avap.md",
        "--markdown",
        "-m",
        help="Path to AVAP markdown file (relative to project root)"
    ),
    output_dir: str = typer.Option(
        "ingestion/code/BNF/",
        "--output",
        "-o",
        help="Output directory for BNF files (relative to project root)"
    )
):
    """Extract BNF specifications from AVAP documentation.

    Default behavior:
    - Reads from: docs/LRM/avap.md
    - Writes to: ingestion/code/BNF/

    Exits with status 1 if the markdown path does not point to a file.
    """
    # Project root is three levels above this script's directory
    # (scripts/pipelines/flows -> project root).
    project_root = Path(__file__).parent.parent.parent.parent

    # Resolve both options against the project root.
    markdown_path = project_root / markdown_file
    output_path = project_root / output_dir

    # is_file() (rather than exists()) also rejects a directory, which would
    # otherwise fail later when the extractor tries to open it.
    if not markdown_path.is_file():
        typer.echo(f"Error: Markdown file not found: {markdown_path}", err=True)
        raise typer.Exit(code=1)

    # Create extractor and generate files; the list of paths is not needed
    # here because the extractor already prints each created file.
    extractor = BNFExtractor(markdown_path, output_path)
    files_created, _ = extractor.generate()

    separator = '=' * 80
    print(f"\n{separator}")
    print("BNF extraction complete!")
    print(f"Total files created: {files_created}")
    print(f"Output directory: {output_path}")
    print(separator)


# Invoke the Typer CLI only when this file is executed as a script.
if __name__ == "__main__":
    app()