# assistance-engine/scripts/pipelines/ingestion/avap_chunker.py
# (795 lines, 27 KiB, Python)
"""
chunker.py v1.0
Uso:
python chunker.py --lang-config avap_config.json --docs-path ./docs/samples
python chunker.py --lang-config avap_config.json --docs-path ./docs/samples --workers 8
python chunker.py --lang-config avap_config.json --docs-path ./docs/samples --redis-url redis://localhost:6379
python chunker.py --lang-config avap_config.json --docs-path ./docs/samples --no-dedup
"""
import re
import os
import json
import hashlib
import argparse
import tempfile
import warnings as py_warnings
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import Optional, Generator, IO
from concurrent.futures import ProcessPoolExecutor, as_completed
# Token-counting backend: exact counts via tiktoken when installed,
# whitespace word count as a rough fallback otherwise.
try:
    import tiktoken
    _ENC = tiktoken.get_encoding("cl100k_base")
    def count_tokens(text: str) -> int:
        # Exact token count under the cl100k_base encoding.
        return len(_ENC.encode(text))
    TOKEN_BACKEND = "tiktoken/cl100k_base"
except ImportError:
    py_warnings.warn("tiktoken no instalado — usando word-count. pip install tiktoken",
                     stacklevel=2)
    def count_tokens(text: str) -> int:  # type: ignore[misc]
        # Fallback estimate: whitespace-separated word count.
        return len(text.split())
    TOKEN_BACKEND = "word-count (estimación)"
# Optional near-duplicate detection backend (MinHash LSH from datasketch).
# When missing, deduplication becomes a no-op (see StreamingDeduplicator).
try:
    from datasketch import MinHash, MinHashLSH
    MINHASH_AVAILABLE = True
except ImportError:
    MINHASH_AVAILABLE = False
    py_warnings.warn("datasketch no instalado — dedup desactivada. pip install datasketch",
                     stacklevel=2)
# Optional progress bars; fall back to a pass-through iterator.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kwargs): return x  # type: ignore[misc]
# Tuning defaults (most are overridable via CLI flags in main()).
MAX_NARRATIVE_TOKENS = 400   # max tokens per narrative (prose) chunk
OVERLAP_LINES = 3            # trailing lines carried into the next chunk as context
DEDUP_THRESHOLD = 0.85       # LSH similarity threshold for near-duplicate removal
MINHASH_NUM_PERM = 128       # MinHash permutations (accuracy vs. speed trade-off)
MINHASH_SHINGLE_SIZE = 3     # words per shingle when hashing chunk content
DEFAULT_WORKERS = max(1, (os.cpu_count() or 4) - 1)  # leave one core free
@dataclass
class BlockDef:
    """Definition of a multi-line block construct (function, loop, ...)."""
    name: str
    doc_type: str
    opener_re: re.Pattern
    closer_re: re.Pattern
    extract_signature: bool = False
    signature_template: str = ""

    def extract_sig(self, clean_line):
        """Render the signature template for *clean_line*.

        Returns the template with each ``{groupN}`` placeholder replaced by
        the N-th (1-based) capture group of the opener regex, or ``None``
        when signature extraction is disabled or the line does not match.
        """
        if not self.extract_signature:
            return None
        match = self.opener_re.match(clean_line)
        if match is None:
            return None
        rendered = self.signature_template
        for idx, group in enumerate(match.groups(), start=1):
            rendered = rendered.replace(f"{{group{idx}}}", (group or "").strip())
        return rendered
@dataclass
class StatementDef:
    """A single-line statement category, classified by a regex match."""
    name: str  # label used as the chunk's block_type
    re: re.Pattern  # compiled pattern; NOTE: field name shadows the `re` module here
@dataclass
class SemanticTag:
    """A boolean metadata flag, set when its regex matches chunk content."""
    tag: str  # metadata key written into chunk.metadata
    re: re.Pattern  # compiled pattern searched against the chunk text
class LanguageConfig:
    """Language description loaded from a JSON config file.

    Holds lexer settings (string delimiters, comment syntax), block
    definitions (opener/closer regexes), statement classifiers and
    semantic-tag patterns used for metadata enrichment.
    """

    def __init__(self, config_path: str):
        raw = json.loads(Path(config_path).read_text(encoding="utf-8"))
        self.language = raw.get("language", "unknown")
        self.version = raw.get("version", "1.0")
        self.extensions = set(raw.get("file_extensions", []))
        lex = raw.get("lexer", {})
        self.string_delimiters = lex.get("string_delimiters", ['"', "'"])
        self.escape_char = lex.get("escape_char", "\\")
        # Longest prefixes first so e.g. "//" is tried before "/" when stripping.
        self.comment_line = sorted(lex.get("comment_line", ["#"]), key=len, reverse=True)
        cb = lex.get("comment_block", {})
        self.comment_block_open = cb.get("open", "")
        self.comment_block_close = cb.get("close", "")
        self.line_oriented = lex.get("line_oriented", True)
        # Multi-line block constructs (functions, loops, ...).
        self.blocks: list[BlockDef] = []
        for b in raw.get("blocks", []):
            self.blocks.append(BlockDef(
                name = b["name"],
                doc_type = b.get("doc_type", "code"),
                opener_re = re.compile(b["opener_pattern"]),
                closer_re = re.compile(b["closer_pattern"]),
                extract_signature = b.get("extract_signature", False),
                signature_template = b.get("signature_template", ""),
            ))
        # Single-line statement classifiers, in config order.
        self.statements: list[StatementDef] = [
            StatementDef(name=s["name"], re=re.compile(s["pattern"]))
            for s in raw.get("statements", [])
        ]
        # Patterns that set boolean metadata flags on chunks.
        self.semantic_tags: list[SemanticTag] = [
            SemanticTag(tag=t["tag"], re=re.compile(t["pattern"]))
            for t in raw.get("semantic_tags", [])
        ]

    def match_opener(self, clean_line):
        """Return the first BlockDef whose opener matches *clean_line*, else None."""
        for block in self.blocks:
            if block.opener_re.match(clean_line):
                return block
        return None

    def match_closer(self, clean_line):
        """True when *clean_line* closes any known block type."""
        for block in self.blocks:
            if block.closer_re.match(clean_line):
                return True
        return False

    def classify_statement(self, clean_line):
        """Name of the first matching statement pattern, or generic "statement"."""
        for stmt in self.statements:
            if stmt.re.match(clean_line):
                return stmt.name
        return "statement"

    def enrich_metadata(self, content):
        """Boolean flag per semantic tag found in *content*, plus a naive
        "complexity" score equal to the number of flags set."""
        meta: dict = {}
        for tag in self.semantic_tags:
            if tag.re.search(content):
                meta[tag.tag] = True
        meta["complexity"] = sum(1 for v in meta.values() if v is True)
        return meta
@dataclass
class Chunk:
    """One unit of ingested content plus its provenance and metadata."""
    chunk_id: str
    source_file: str
    doc_type: str
    block_type: str
    section: str
    start_line: int
    end_line: int
    content: str
    metadata: dict = field(default_factory=dict)

    def token_count(self):
        """Token count of the chunk content (backend-dependent)."""
        return count_tokens(self.content)

    def to_dict(self):
        """Serializable form of the chunk with the token estimate attached."""
        return {**asdict(self), "token_estimate": self.token_count()}
def make_chunk_id(filepath, start, end, content):
    """Stable 16-hex-char id from file name, line span and a content prefix.

    Only the first 60 characters of *content* participate, so large chunks
    hash cheaply while ids still change when the chunk header changes.
    """
    key = f"{filepath.name}:{start}:{end}:{content[:60]}"
    return hashlib.sha1(key.encode()).hexdigest()[:16]
def make_chunk(filepath: Path, doc_type, block_type, section, start, end, content, cfg, extra_meta = None):
    """Build a Chunk from stripped *content*, enriching its metadata with the
    language's semantic tags and any caller-supplied *extra_meta*."""
    body = content.strip()
    metadata = cfg.enrich_metadata(body)
    if extra_meta:
        metadata.update(extra_meta)
    return Chunk(
        chunk_id=make_chunk_id(filepath, start, end, body),
        source_file=str(filepath),
        doc_type=doc_type,
        block_type=block_type,
        section=section,
        start_line=start,
        end_line=end,
        content=body,
        metadata=metadata,
    )
class GenericLexer:
    """Stateful line lexer: strips block comments and line comments while
    respecting string literals, as described by a LanguageConfig.

    NOTE(review): block-comment delimiters are found with a plain substring
    search, so an opener inside a string literal (e.g. s = "/*") is still
    treated as a comment — pre-existing limitation, confirm acceptable.
    """

    def __init__(self, cfg: "LanguageConfig"):
        self.cfg = cfg
        # True while inside a block comment opened on a previous line.
        self.in_block_comment = False

    def process_line(self, raw):
        """Return ``(has_code, cleaned_line)`` for one raw source line."""
        cb_open = self.cfg.comment_block_open
        cb_close = self.cfg.comment_block_close
        if self.in_block_comment:
            if cb_close and cb_close in raw:
                self.in_block_comment = False
                # BUGFIX: code after the closing delimiter used to be
                # discarded; re-process the remainder of the line instead.
                remainder = raw[raw.index(cb_close) + len(cb_close):]
                return self.process_line(remainder)
            return False, ""
        if cb_open and cb_open in raw:
            idx_open = raw.index(cb_open)
            rest = raw[idx_open + len(cb_open):]
            if cb_close and cb_close in rest:
                # BUGFIX: search for the closer strictly *after* the opener;
                # the old raw.index(cb_close, idx_open) could match inside
                # the opener itself (e.g. "/*/" with open="/*", close="*/").
                idx_close = raw.index(cb_close, idx_open + len(cb_open))
                code_part = raw[:idx_open] + raw[idx_close + len(cb_close):]
                # Recurse: the remainder may contain further block comments.
                return self.process_line(code_part)
            # Unterminated on this line: comment continues on the next one.
            self.in_block_comment = True
            return self._strip_line_comments(raw[:idx_open])
        return self._strip_line_comments(raw)

    def _strip_line_comments(self, raw):
        """Drop a trailing line comment, ignoring markers inside strings.

        Returns ``(has_code, stripped_code)``.
        """
        in_str: Optional[str] = None  # current string delimiter, if inside one
        result = []
        i = 0
        while i < len(raw):
            ch = raw[i]
            if in_str and ch == self.cfg.escape_char:
                # Keep the escape character and the escaped character verbatim.
                result.append(ch)
                if i + 1 < len(raw):
                    result.append(raw[i + 1])
                    i += 2
                else:
                    i += 1
                continue
            if in_str and ch == in_str:
                in_str = None
                result.append(ch)
                i += 1
                continue
            if not in_str and ch in self.cfg.string_delimiters:
                in_str = ch
                result.append(ch)
                i += 1
                continue
            if not in_str and any(raw.startswith(p, i) for p in self.cfg.comment_line):
                break  # rest of the line is a comment
            result.append(ch)
            i += 1
        code = "".join(result).strip()
        return bool(code), code
class SemanticOverlapBuffer:
    """Adds cross-chunk context: either the enclosing function signature or
    the last few lines of the previous chunk from the same file.

    Stateful across a stream of chunks; call notify_file_change() when the
    source file changes so that context never leaks between files.
    """

    def __init__(self, overlap_lines = OVERLAP_LINES):
        self.overlap_lines = overlap_lines
        self._prev = None              # last chunk seen (before augmentation)
        self._current_fn_sig = None    # signature of the enclosing function, if any
        self._current_fn_file = None   # file that signature belongs to

    def notify_function(self, sig, source_file):
        # Record the most recent function signature seen in *source_file*.
        self._current_fn_sig = sig
        self._current_fn_file = source_file

    def notify_file_change(self, source_file):
        # Reset per-file state when a new file starts.
        if self._current_fn_file != source_file:
            self._current_fn_sig = None
            self._current_fn_file = source_file
            self._prev = None

    def apply(self, chunk):
        """Return *chunk*, possibly with a context header prepended.

        NOTE(review): the augmented Chunk keeps the original chunk_id even
        though its content changed — ids stay stable across overlap settings.
        """
        if self.overlap_lines <= 0:
            self._prev = chunk
            return chunk
        if self._prev and self._prev.source_file != chunk.source_file:
            self.notify_file_change(chunk.source_file)
        context_header = None
        if (self._current_fn_sig
                and self._current_fn_file == chunk.source_file
                and chunk.block_type not in ("function", "function_signature")):
            # Inside a known function: prepend its signature as context.
            context_header = f"// contexto: {self._current_fn_sig}"
            overlap_type = "function_sig"
        elif (self._prev
                and self._prev.source_file == chunk.source_file
                and self._prev.doc_type == chunk.doc_type):
            # Otherwise reuse the tail lines of the previous same-type chunk.
            context_header = "\n".join(
                self._prev.content.splitlines()[-self.overlap_lines:])
            overlap_type = "line_tail"
        else:
            overlap_type = "none"
        # Remember the *unaugmented* chunk so tails never stack up.
        self._prev = chunk
        if context_header:
            new_content = (context_header + "\n" + chunk.content).strip()
            return Chunk(
                chunk_id=chunk.chunk_id, source_file=chunk.source_file,
                doc_type=chunk.doc_type, block_type=chunk.block_type,
                section=chunk.section, start_line=chunk.start_line,
                end_line=chunk.end_line, content=new_content,
                metadata={**chunk.metadata,
                          "has_overlap": True,
                          "overlap_type": overlap_type},
            )
        return chunk
def _shingles(text, k = MINHASH_SHINGLE_SIZE):
    """Lower-cased word k-shingles of *text*, encoded as bytes for MinHash.

    Texts shorter than *k* words yield a single shingle of all their words.
    """
    words = text.lower().split()
    if len(words) < k:
        return [" ".join(words).encode()]
    windows = range(len(words) - k + 1)
    return [" ".join(words[j:j + k]).encode() for j in windows]
def _build_minhash(text):
    """Build a MinHash signature over the word shingles of *text*."""
    m = MinHash(num_perm=MINHASH_NUM_PERM)
    for s in _shingles(text):
        m.update(s)
    return m
class StreamingDeduplicator:
    """Near-duplicate filter backed by one MinHash LSH index per doc_type.

    Chunks of different doc_types never deduplicate against each other.
    Becomes a no-op when datasketch is not installed.
    """

    def __init__(self, threshold: float = DEDUP_THRESHOLD):
        self.threshold = threshold
        # One LSH index per doc_type, created lazily on first use.
        self._lsh: dict[str, "MinHashLSH"] = {}
        self.removed = 0  # count of chunks rejected as duplicates

    def _get_lsh(self, doc_type):
        """Return (creating on first use) the LSH index for *doc_type*."""
        if doc_type not in self._lsh:
            self._lsh[doc_type] = MinHashLSH(
                threshold=self.threshold, num_perm=MINHASH_NUM_PERM)
        return self._lsh[doc_type]

    def is_duplicate(self, chunk):
        """Return True when *chunk* is a near-duplicate of one already seen;
        otherwise register it in the index and return False."""
        if not MINHASH_AVAILABLE:
            return False
        lsh = self._get_lsh(chunk.doc_type)
        m = _build_minhash(chunk.content)
        try:
            if lsh.query(m):
                self.removed += 1
                return True
        except Exception as exc:
            # Best-effort: a failed query means "not a duplicate", but the
            # failure is surfaced instead of being silently swallowed.
            py_warnings.warn(f"LSH query failed: {exc}", stacklevel=2)
        try:
            lsh.insert(chunk.chunk_id, m)
        except Exception as exc:
            # BUGFIX: was a bare print() left in library code (datasketch
            # raises ValueError on duplicate keys, among others). Warn so the
            # condition is visible without polluting stdout.
            py_warnings.warn(f"LSH insert failed for {chunk.chunk_id}: {exc}",
                             stacklevel=2)
        return False
class JsonlWriter:
    """Streaming writer of chunks to a JSON Lines file.

    A ".json" suffix on *path* is rewritten to ".jsonl" because the output
    format is one JSON object per line. Also usable as a context manager.
    """

    def __init__(self, path):
        out = Path(path)
        if out.suffix.lower() == ".json":
            out = out.with_suffix(".jsonl")
        out.parent.mkdir(parents=True, exist_ok=True)
        self.path = out  # final (possibly suffix-corrected) output path
        self._handle: Optional[IO] = open(out, "w", encoding="utf-8")
        self.written = 0  # number of rows written so far

    def write(self, chunk):
        """Serialize one chunk (anything with .to_dict()) as a JSONL row."""
        self._handle.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n")
        self.written += 1

    def close(self):
        """Close the underlying file; explicitly safe to call repeatedly.

        The old guard (`if self._handle:`) tested a handle that was never
        None; the handle is now dropped so repeated close() is a clear no-op.
        """
        if self._handle is not None:
            self._handle.close()
            self._handle = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False
def validate_syntax(lines, filepath, cfg):
    """Single-pass open/close balance check over *lines*.

    Returns a list of human-readable warning strings: one per closer with no
    matching opener, and one per block left open at end of file.
    """
    problems = []
    open_blocks = []  # stack of (block_name, line_no) still awaiting a closer
    lexer = GenericLexer(cfg)
    for idx, raw in enumerate(lines, start=1):
        is_code, clean = lexer.process_line(raw)
        if not (is_code and clean):
            continue
        opened = cfg.match_opener(clean)
        if opened is not None:
            open_blocks.append((opened.name, idx))
        elif cfg.match_closer(clean):
            if open_blocks:
                open_blocks.pop()
            else:
                problems.append(f"{filepath.name}:{idx} — close without open")
    problems.extend(
        f"{filepath.name}:{ln} — not closed block '{bt}'"
        for bt, ln in open_blocks)
    return problems
def iter_code_chunks(filepath, cfg, overlap_buf):
    """Yield Chunk objects for one source file of the configured language.

    Block constructs (functions, loops, ...) become one chunk each, tracked
    with a nesting depth counter; consecutive loose statements of the same
    classified type are coalesced into a single chunk. After all chunks, a
    final ``(None, warnings)`` sentinel tuple is yielded when the syntax
    validator reported problems — callers must check for it.
    """
    lines = filepath.read_text(encoding="utf-8").splitlines()
    warnings = validate_syntax(lines, filepath, cfg)
    overlap_buf.notify_file_change(str(filepath))
    lexer = GenericLexer(cfg)
    i = 0
    pending_raw = []    # comment/blank lines awaiting attachment to the next chunk
    loose_buffer = []   # (line_no, raw) pairs of coalesced loose statements
    loose_type = None   # classified type of the statements in loose_buffer

    def flush_loose():
        # Emit the accumulated loose statements as one chunk (if any).
        nonlocal loose_buffer, loose_type
        if not loose_buffer:
            return
        start = loose_buffer[0][0]
        end = loose_buffer[-1][0]
        content = "\n".join(t for _, t in loose_buffer)
        chunk = make_chunk(filepath, "code", loose_type or "statement",
                           "", start, end, content, cfg)
        chunk = overlap_buf.apply(chunk)
        loose_buffer.clear(); loose_type = None
        yield chunk

    while i < len(lines):
        raw = lines[i]
        line_no = i + 1
        is_code, clean = lexer.process_line(raw)
        if not is_code or not clean:
            # Comments/blank lines ride along with whatever chunk comes next.
            pending_raw.append(raw); i += 1; continue
        block_def = cfg.match_opener(clean)
        if block_def:
            yield from flush_loose()
            block_start = line_no
            # Leading comments become part of the block chunk.
            block_lines = list(pending_raw) + [raw]
            pending_raw.clear()
            sig = block_def.extract_sig(clean)
            if sig:
                overlap_buf.notify_function(sig, str(filepath))
            # Consume lines until the matching closer, counting nesting.
            depth = 1; i += 1
            while i < len(lines) and depth > 0:
                inner_raw = lines[i]
                _, inner_clean = lexer.process_line(inner_raw)
                block_lines.append(inner_raw)
                if inner_clean:
                    if cfg.match_opener(inner_clean):
                        depth += 1
                    elif cfg.match_closer(inner_clean):
                        depth -= 1
                i += 1
            chunk = make_chunk(filepath, block_def.doc_type, block_def.name, "", block_start, i, "\n".join(block_lines), cfg)
            chunk = overlap_buf.apply(chunk)
            yield chunk
            if sig:
                # Additionally emit a tiny signature-only chunk pointing back
                # at the full block's line span.
                yield make_chunk(
                    filepath, "function_signature", "function_signature", "", block_start, block_start, sig, cfg,
                    extra_meta={"full_block_start": block_start,
                                "full_block_end": i}
                )
            continue
        stmt_type = cfg.classify_statement(clean)
        if loose_type and stmt_type != loose_type:
            # Statement type changed: close the current loose chunk.
            yield from flush_loose()
        if pending_raw and not loose_buffer:
            # NOTE(review): pending comment lines are recorded with the
            # *current* line number, not their original ones — confirm this
            # imprecision in start_line is acceptable.
            for pc in pending_raw:
                loose_buffer.append((line_no, pc))
            pending_raw.clear()
        loose_type = stmt_type
        loose_buffer.append((line_no, raw))
        i += 1
    yield from flush_loose()
    if warnings:
        # Sentinel: consumers distinguish this tuple from Chunk objects.
        yield (None, warnings)
# Markdown structure patterns used by iter_markdown_chunks.
RE_MD_H1 = re.compile(r"^# (.+)")         # level-1 heading
RE_MD_H2 = re.compile(r"^## (.+)")        # level-2 heading
RE_MD_H3 = re.compile(r"^### (.+)")       # level-3 heading
RE_FENCE_OPEN = re.compile(r"^```(\w*)")  # opening code fence, optional language tag
RE_FENCE_CLOSE = re.compile(r"^```\s*$")  # bare closing code fence
RE_TABLE_ROW = re.compile(r"^\|")         # table row (pipe-prefixed line)
def split_narrative_by_tokens(text, max_tokens):
    """Split *text* on blank lines, then greedily pack paragraphs into
    pieces of at most *max_tokens* tokens.

    A single paragraph larger than the budget is kept whole; empty pieces
    are dropped from the result.
    """
    pieces = []
    bucket = []
    bucket_tokens = 0
    for paragraph in re.split(r"\n\s*\n", text):
        size = count_tokens(paragraph)
        if bucket and bucket_tokens + size > max_tokens:
            pieces.append("\n\n".join(bucket))
            bucket = [paragraph]
            bucket_tokens = size
        else:
            bucket.append(paragraph)
            bucket_tokens += size
    if bucket:
        pieces.append("\n\n".join(bucket))
    return [piece for piece in pieces if piece.strip()]
def iter_markdown_chunks(filepath, cfg, max_tokens = MAX_NARRATIVE_TOKENS):
    """Yield Chunk objects for a Markdown file.

    Headings (H1-H3) update a hierarchical section label; fenced code blocks
    and tables become dedicated chunks; everything else accumulates as
    narrative prose and is flushed in token-bounded pieces.
    """
    lines = filepath.read_text(encoding="utf-8").splitlines()
    current_h1 = current_h2 = current_h3 = ""

    def section_label() -> str:
        # "H1 > H2 > H3" path of the headings currently in effect.
        return " > ".join(p for p in [current_h1, current_h2, current_h3] if p)

    def make_md_chunk(doc_type, block_type, start, end, content) -> Chunk:
        return make_chunk(filepath, doc_type, block_type,
                          section_label(), start, end, content, cfg)

    i = 0
    narrative_start = 1; narrative_lines: list[str] = []

    def flush_narrative() -> Generator:
        # Emit accumulated prose, split into token-bounded pieces.
        # NOTE(review): end lines are derived from each piece's own line
        # count, so spans of later pieces are approximate — confirm.
        nonlocal narrative_lines, narrative_start
        text = "\n".join(narrative_lines).strip()
        if not text:
            narrative_lines.clear(); return
        for sub in split_narrative_by_tokens(text, max_tokens):
            sl = sub.count("\n") + 1
            yield make_md_chunk("spec", "narrative",
                                narrative_start, narrative_start + sl - 1, sub)
        narrative_lines.clear()

    while i < len(lines):
        raw = lines[i]; line_no = i + 1
        m1 = RE_MD_H1.match(raw); m2 = RE_MD_H2.match(raw); m3 = RE_MD_H3.match(raw)
        if m1:
            # New H1 resets the deeper heading levels.
            yield from flush_narrative()
            current_h1 = m1.group(1).strip(); current_h2 = current_h3 = ""
            narrative_start = line_no + 1; i += 1; continue
        if m2:
            yield from flush_narrative()
            current_h2 = m2.group(1).strip(); current_h3 = ""
            narrative_start = line_no + 1; i += 1; continue
        if m3:
            yield from flush_narrative()
            current_h3 = m3.group(1).strip()
            narrative_start = line_no + 1; i += 1; continue
        fm = RE_FENCE_OPEN.match(raw)
        if fm and not RE_FENCE_CLOSE.match(raw):
            # Fenced code block: emitted whole, tagged with its language.
            yield from flush_narrative()
            lang = fm.group(1).lower() or "code"
            doc_type = "bnf" if lang == "bnf" else "code_example"
            fence_start = line_no
            fence_lines = [raw]; i += 1
            while i < len(lines):
                fence_lines.append(lines[i])
                if RE_FENCE_CLOSE.match(lines[i]) and len(fence_lines) > 1:
                    i += 1; break
                i += 1
            yield make_md_chunk(doc_type, lang,
                                fence_start, fence_start + len(fence_lines) - 1,
                                "\n".join(fence_lines))
            narrative_start = i + 1
            continue
        if RE_TABLE_ROW.match(raw):
            # A run of pipe-prefixed lines becomes one table chunk.
            yield from flush_narrative()
            ts = line_no; tl = []
            while i < len(lines) and RE_TABLE_ROW.match(lines[i]):
                tl.append(lines[i]); i += 1
            yield make_md_chunk("spec", "table", ts, ts + len(tl) - 1, "\n".join(tl))
            narrative_start = i + 1
            continue
        if not narrative_lines:
            narrative_start = line_no
        narrative_lines.append(raw)
        i += 1
    yield from flush_narrative()
def _worker(args):
    """Process-pool worker: chunk a batch of files into a private temp JSONL.

    *args* is a (paths, config_path, overlap_lines, max_tokens) tuple, packed
    so the call pickles cleanly across processes. Returns
    (tmp_path, stats, warnings); the parent process merges the temp files.
    """
    paths, config_path, overlap_lines, max_tokens = args
    # Each worker re-parses the config (compiled regexes don't pickle).
    cfg = LanguageConfig(config_path)
    overlap_buf = SemanticOverlapBuffer(overlap_lines)
    stats = {t: 0 for t in ["code", "function_signature", "spec", "bnf", "code_example", "unknown", "total"]}
    all_warnings = []
    fd, tmp_path = tempfile.mkstemp(suffix=".jsonl", prefix="worker_")
    os.close(fd)  # the raw fd is not needed; we reopen by path below
    with open(tmp_path, "w", encoding="utf-8") as f:
        for path in paths:
            ext = path.suffix.lower()
            if ext in cfg.extensions:
                for item in iter_code_chunks(path, cfg, overlap_buf):
                    # iter_code_chunks ends with a (None, warnings) sentinel.
                    if isinstance(item, tuple) and item[0] is None:
                        all_warnings.extend(item[1])
                        continue
                    chunk = item
                    f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n")
                    stats[chunk.doc_type] = stats.get(chunk.doc_type, 0) + 1
                    stats["total"] += 1
            elif ext == ".md":
                for chunk in iter_markdown_chunks(path, cfg, max_tokens):
                    f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n")
                    stats[chunk.doc_type] = stats.get(chunk.doc_type, 0) + 1
                    stats["total"] += 1
            else:
                # Unknown extension: emit the whole file as one raw chunk.
                content = path.read_text(encoding="utf-8")
                chunk = make_chunk(path, "unknown", "raw", "", 1,
                                   content.count("\n") + 1, content, cfg)
                f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n")
                stats["unknown"] += 1; stats["total"] += 1
    return tmp_path, stats, all_warnings
def fetch_documents(docs_path, cfg, extra_extensions):
    """Recursively collect files under *docs_path* whose suffix is one of the
    language's extensions or of *extra_extensions*, sorted for determinism.

    Raises FileNotFoundError when *docs_path* does not exist.
    """
    root = Path(docs_path)
    if not root.exists():
        raise FileNotFoundError(f"PATH not found: {root}")
    wanted = cfg.extensions | set(extra_extensions)
    candidates = (p for p in root.rglob("*") if p.is_file())
    return sorted(p for p in candidates if p.suffix.lower() in wanted)
def _partition(paths, n):
k = max(1, len(paths) // n)
return [paths[i:i+k] for i in range(0, len(paths), k)]
def run_pipeline(paths,
                 config_path,
                 writer,
                 deduplicator,
                 overlap_lines,
                 max_tokens,
                 workers):
    """Fan out chunking across worker processes, then merge and deduplicate.

    Phase 1: partition *paths* and let each worker write a private temp
    JSONL (results collected as futures complete). Phase 2: stream each temp
    file through the optional *deduplicator* into *writer*, deleting the
    temp file afterwards. Returns (total_stats, all_warnings).
    """
    total_stats = {t: 0 for t in ["code", "function_signature", "spec", "bnf", "code_example", "unknown", "total", "dedup_removed"]}
    all_warnings = []
    tmp_files = []
    partitions = _partition(paths, workers)
    worker_args = [(part, config_path, overlap_lines, max_tokens) for part in partitions]
    print(f"{len(paths)} Files in {len(partitions)} workers...\n")
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(_worker, arg): i
                   for i, arg in enumerate(worker_args)}
        for future in tqdm(as_completed(futures), total=len(futures),
                           desc=" Workers", unit="worker"):
            tmp_path, stats, warns = future.result()
            tmp_files.append(tmp_path)
            all_warnings.extend(warns)
            for k, v in stats.items():
                total_stats[k] = total_stats.get(k, 0) + v
    print(f"\n Mergin {len(tmp_files)} partial files...")
    for tmp_path in tqdm(tmp_files, desc=" Merge + dedup", unit="file"):
        with open(tmp_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    cd = json.loads(line)
                    if deduplicator:
                        # Rehydrate a Chunk only for the dedup check.
                        c = Chunk(
                            chunk_id=cd["chunk_id"], source_file=cd["source_file"],
                            doc_type=cd["doc_type"], block_type=cd["block_type"],
                            section=cd["section"], start_line=cd["start_line"],
                            end_line=cd["end_line"], content=cd["content"],
                            metadata=cd.get("metadata", {}),
                        )
                        if deduplicator.is_duplicate(c):
                            total_stats["dedup_removed"] = \
                                total_stats.get("dedup_removed", 0) + 1
                            continue
                    # NOTE(review): writes through the writer's private handle
                    # to avoid re-serializing; this couples the loop to
                    # JsonlWriter internals — consider a write_raw() method.
                    writer._handle.write(line + "\n")
                    writer.written += 1
                except json.JSONDecodeError as e:
                    # Skip malformed lines from a partial worker write.
                    print(e)
                    pass
        Path(tmp_path).unlink(missing_ok=True)
    return total_stats, all_warnings
def print_report(stats, warnings, output_path, token_backend, workers, language):
    """Print the final run summary: backends, per-type counts, totals and
    the first twenty syntax warnings."""
    print(f" RESULT — [{language}]")
    print(f" Tokenizer : {token_backend}")
    dedup_backend = "MinHash LSH (RAM)" if MINHASH_AVAILABLE else "desactivada"
    print(f" Dedup backend : {dedup_backend}")
    print(f" Workers : {workers}")
    print()
    for doc_type in ["code", "function_signature", "spec", "bnf", "code_example", "unknown"]:
        count = stats.get(doc_type, 0)
        if count:
            print(f" {doc_type:<25}: {count:>6} chunks")
    print(f"\n Total written : {stats.get('total', 0)}")
    print(f" Erased (dedup) : {stats.get('dedup_removed', 0)}")
    if not warnings:
        print("\n Ok")
    else:
        print(f"\n Warnings ({len(warnings)}):")
        for warning in warnings[:20]:
            print(warning)
        if len(warnings) > 20:
            print(f" ... and {len(warnings) - 20} more")
    print(f"\n OUTPUT File {output_path}")
def main():
    """CLI entry point: parse arguments, discover documents, run the
    chunking pipeline and print a summary report."""
    parser = argparse.ArgumentParser(
        description="Generic chunker"  # BUGFIX: "GEneric" typo
    )
    parser.add_argument("--lang-config", required=True,
                        help="(ej: avap_config.json)")
    parser.add_argument("--docs-path", default="docs/samples")
    parser.add_argument("--output", default="ingestion/chunks.jsonl")
    parser.add_argument("--overlap", type=int, default=OVERLAP_LINES)
    parser.add_argument("--max-tokens", type=int, default=MAX_NARRATIVE_TOKENS)
    parser.add_argument("--dedup-threshold", type=float, default=DEDUP_THRESHOLD)
    parser.add_argument("--no-dedup", action="store_true")
    parser.add_argument("--no-overlap", action="store_true")
    parser.add_argument("--workers", type=int, default=DEFAULT_WORKERS)
    args = parser.parse_args()

    cfg = LanguageConfig(args.lang_config)
    overlap = 0 if args.no_overlap else args.overlap

    print(f" Lenguaje : {cfg.language} v{cfg.version}")
    print(f" Config : {args.lang_config}")
    print(f" Extensiones : {cfg.extensions | {'.md'}}")
    print(f" Docs path : {args.docs_path}")
    print(f" Output : {args.output}")
    print(f" Workers : {args.workers}")
    print(f" Tokenizador : {TOKEN_BACKEND}")
    print(f" Overlap : {overlap} líneas (semántico)")
    print(f" Max tokens : {args.max_tokens}")
    # BUGFIX: the old banner reported MinHash dedup even when datasketch was
    # missing (dedup silently disabled), and appended a pointless `(f" RAM")`
    # f-string. Reflect the actual dedup state instead.
    if args.no_dedup or not MINHASH_AVAILABLE:
        dedup_info = "deactivated"
    else:
        dedup_info = f"MinHash LSH threshold={args.dedup_threshold} RAM"
    print(f" Dedup : {dedup_info}")
    print()

    paths = fetch_documents(args.docs_path, cfg, [".md"])
    if not paths:
        print("No files found.")
        return
    print(f"{len(paths)} files found\n")

    deduplicator = None
    if not args.no_dedup and MINHASH_AVAILABLE:
        deduplicator = StreamingDeduplicator(
            threshold=args.dedup_threshold,
        )
    writer = JsonlWriter(args.output)
    try:
        stats, warnings = run_pipeline(
            paths, args.lang_config, writer, deduplicator,
            overlap, args.max_tokens, args.workers
        )
    finally:
        # Always release the output handle, even when the pipeline fails.
        writer.close()
    print_report(stats, warnings, str(writer.path),
                 TOKEN_BACKEND, args.workers, cfg.language)


if __name__ == "__main__":
    main()