# Scripts/Additional/duplicate_PDF/check_duplicate.py
"""
Scan each PDF in this folder and find duplicate connection tags (inside the SAME PDF)
based on user-provided wildcard patterns (using '*' as "anything").
Output: one Excel file per PDF: <PDF_NAME>_duplicates.xlsx
"""
from __future__ import annotations
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Set, Tuple
# -----------------------------
# USER PATTERNS (wildcards)
# -----------------------------
# '*' means "any characters" (including none). Everything else is literal.
# Each pattern is matched case-insensitively against a FULL token
# (see _compile_wildcard_patterns, which anchors with ^...$).
# Tokens look like "PS3_2_TPE1": underscore-separated [A-Z0-9] parts.
WILDCARD_PATTERNS: List[str] = [
"*_*_TPE*",
"*_*_S*_PB",
"*_*_S*_PB_LT",
"*_*_JR*_PB",
"*_*_JR*_PB_LT",
"*_*_SS*_SPB",
"*_*_SS*_STPB",
"*_*_SS*_SPB_LT",
"*_*_EN*_PB",
"*_*_EN*_PB_LT",
"*_*_PE*",
"*_*_LPE*",
"*_*_FPE*",
"*_*_BCN*_R",
"*_*_BCN*_B",
"*_*_BCN*_A",
"*_*_BCN*_G",
"*_*_BCN*_H",
"*_*_EPC*_1",
"*_*_EPC*_2",
"*_*_VFD1_DISC",
"*_*_*_STO1",
"*_*_*_ESTOP1",
"*_*_LS*",
"*_*_ENC*",
"*_*_ENW*",
"*_*_ENS*",
"*_*_PX*",
"*_*_SOL*",
"*_*_DIV*",
"*_*_PS*",
"*_*_BDS*",
"*_*_TS*",
]
# -----------------------------
# CABLE PATTERNS (separate check)
# -----------------------------
# Rule: a cable label is normally printed once per cable end (i.e. twice);
# if it appears more than 2 times in the SAME PDF => duplicated/overused.
# Same wildcard syntax as WILDCARD_PATTERNS above.
CABLE_WILDCARD_PATTERNS: List[str] = [
"*_*_VFD*_I0",
"*_*_VFD*_I1",
"*_*_VFD*_I2",
"*_*_VFD*_I3",
"*_*_VFD*_IO0",
"*_*_VFD*_IO1",
"*_*_VFD*_SI0",
"*_*_VFD*_SI1",
"*_*_VFD*_SI2",
"*_*_VFD*_SI3",
"*_*_VFD*_SO0",
"*_FIO*_P0_C0",
"*_FIO*_P0_C1",
"*_FIO*_P1_C2",
"*_FIO*_P1_C3",
"*_FIO*_P2_C4",
"*_FIO*_P2_C5",
"*_FIO*_P3_C6",
"*_FIO*_P3_C7",
"*_FIO*_P4_C8",
"*_FIO*_P4_C9",
"*_FIO*_P5_C10",
"*_FIO*_P5_C11",
"*_FIO*_P6_C12",
"*_FIO*_P6_C13",
"*_FIO*_P7_C14",
"*_FIO*_P7_C15",
"*_FIOH*_C7_A",
"*_FIOH*_C7_B",
"*_FIOH*_C5_A",
"*_FIOH*_C5_B",
"*_FIOH*_C3_A",
"*_FIOH*_C3_B",
"*_FIOH*_C1_A",
"*_FIOH*_C1_B",
"*_FIOH*_C8_A",
"*_FIOH*_C8_B",
"*_FIOH*_C6_A",
"*_FIOH*_C6_B",
"*_FIOH*_C4_A",
"*_FIOH*_C4_B",
"*_FIOH*_C2_A",
"*_FIOH*_C2_B",
]
# Candidate token: something like "PS3_2_VFD1_DISC" — at least two
# underscore-joined runs of [A-Z0-9]. \b keeps matches whole-word;
# IGNORECASE lets lowercase PDF text through (tokens are uppercased later).
TOKEN_RE = re.compile(r"\b[A-Z0-9]+(?:_[A-Z0-9]+)+\b", re.IGNORECASE)
def _compile_wildcard_patterns(patterns: Iterable[str]) -> List[re.Pattern]:
compiled: List[re.Pattern] = []
for p in patterns:
# Treat everything literally except '*' which becomes '.*'
parts = [re.escape(x) for x in p.split("*")]
regex = ".*".join(parts)
# Match full token
compiled.append(re.compile(rf"^{regex}$", re.IGNORECASE))
return compiled
def _tokenize(text: str) -> List[str]:
# Normalize common oddities
text = text.replace("\u00ad", "") # soft hyphen
# PDFs sometimes insert whitespace/newlines around underscores; normalize that.
# Example: "PS3_2_VFD1_\nDISC" -> "PS3_2_VFD1_DISC"
text = re.sub(r"\s*_\s*", "_", text)
return [m.group(0).upper() for m in TOKEN_RE.finditer(text)]
def _ensure_deps() -> Tuple[object, object]:
    """Import and return the (fitz, pandas) modules.

    When either dependency is missing, a pip install hint is printed to
    stderr and the original import error is re-raised. Python caches
    imports, so calling this repeatedly is cheap.
    """
    pymupdf_hint = (
        "Missing dependency: PyMuPDF\n"
        "Install with:\n"
        " python -m pip install --upgrade pip\n"
        " python -m pip install pymupdf\n"
    )
    pandas_hint = (
        "Missing dependency: pandas (and openpyxl for Excel)\n"
        "Install with:\n"
        " python -m pip install --upgrade pip\n"
        " python -m pip install pandas openpyxl\n"
    )
    try:
        import fitz  # PyMuPDF
    except Exception:
        print(pymupdf_hint, file=sys.stderr)
        raise
    try:
        import pandas as pd
    except Exception:
        print(pandas_hint, file=sys.stderr)
        raise
    return fitz, pd
@dataclass
class Occurrence:
    """Where and how often one token appears within a single PDF."""
    token: str       # normalized (uppercased) token text
    pages: Set[int]  # 1-based page numbers the token appears on
    count: int       # total occurrences across all pages
def find_duplicates_in_pdf(pdf_path: Path, compiled_patterns: List[re.Pattern]) -> List[Occurrence]:
    """Find pattern-matching tokens that appear on more than one page of a PDF.

    Returns Occurrence records sorted by distinct-page count (desc), total
    occurrence count (desc), then token text (asc).
    """
    fitz, _ = _ensure_deps()
    seen: Dict[str, Occurrence] = {}
    with fitz.open(pdf_path) as doc:
        for page_no in range(1, doc.page_count + 1):
            raw = doc.load_page(page_no - 1).get_text("text") or ""
            for token in _tokenize(raw):
                if not any(rx.match(token) for rx in compiled_patterns):
                    continue
                occ = seen.setdefault(token, Occurrence(token=token, pages=set(), count=0))
                occ.pages.add(page_no)
                occ.count += 1
    # A "duplicate" is a token seen on at least two distinct pages.
    duplicates = [occ for occ in seen.values() if len(occ.pages) > 1]
    duplicates.sort(key=lambda occ: (-len(occ.pages), -occ.count, occ.token))
    return duplicates
def find_cable_overuse_in_pdf(
    pdf_path: Path,
    compiled_patterns: List[re.Pattern],
    *,
    allowed_occurrences: int = 2,
) -> List[Occurrence]:
    """Flag cable labels whose TOTAL occurrence count exceeds the allowance.

    Separate check from the page-duplicate logic: cable labels are often
    printed twice (once per cable end), so up to ``allowed_occurrences``
    (default 2) is fine; anything beyond that is flagged. Results are
    sorted by total count (desc), distinct-page count (desc), label (asc).
    """
    fitz, _ = _ensure_deps()
    tally: Dict[str, Occurrence] = {}
    with fitz.open(pdf_path) as doc:
        for idx in range(doc.page_count):
            text = doc.load_page(idx).get_text("text") or ""
            matching = (t for t in _tokenize(text) if any(rx.match(t) for rx in compiled_patterns))
            for label in matching:
                entry = tally.get(label)
                if entry is None:
                    entry = tally[label] = Occurrence(token=label, pages=set(), count=0)
                entry.pages.add(idx + 1)
                entry.count += 1
    flagged = [occ for occ in tally.values() if occ.count > allowed_occurrences]
    flagged.sort(key=lambda occ: (-occ.count, -len(occ.pages), occ.token))
    return flagged
def write_excel_for_pdf(
    pdf_path: Path,
    duplicates: List[Occurrence],
    cable_overuse: List[Occurrence],
) -> Path:
    """Write <PDF_STEM>_duplicates.xlsx next to the PDF and return its path.

    Sheets: "Duplicates" (page-duplicate tokens), "CableOveruse" (overused
    cable labels), and a one-row "Summary".
    """
    _, pd = _ensure_deps()
    out_path = pdf_path.with_name(pdf_path.stem + "_duplicates.xlsx")

    def _to_frame(items: List[Occurrence], label_col: str):
        # Both sheets share the same shape; only the label column name differs.
        records = [
            {
                label_col: occ.token,
                "Pages": ", ".join(map(str, sorted(occ.pages))),
                "UniquePagesCount": len(occ.pages),
                "TotalOccurrences": occ.count,
            }
            for occ in items
        ]
        return pd.DataFrame(records, columns=[label_col, "Pages", "UniquePagesCount", "TotalOccurrences"])

    dup_df = _to_frame(duplicates, "Token")
    cable_df = _to_frame(cable_overuse, "CableLabel")
    summary_df = pd.DataFrame(
        [
            {
                "PDF": pdf_path.name,
                "DuplicateTokens": len(duplicates),
                "CableOverusedLabels": len(cable_overuse),
            },
        ]
    )
    with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
        dup_df.to_excel(writer, index=False, sheet_name="Duplicates")
        cable_df.to_excel(writer, index=False, sheet_name="CableOveruse")
        summary_df.to_excel(writer, index=False, sheet_name="Summary")
    return out_path
def main() -> int:
    """Scan every PDF beside this script and write one report per PDF.

    Exit codes: 0 = success, 1 = no PDFs found, 2 = error while processing
    a PDF (processing stops at the first failure).
    """
    base_dir = Path(__file__).resolve().parent
    pdfs = sorted(base_dir.glob("*.pdf"))
    if not pdfs:
        print(f"No PDFs found in: {base_dir}")
        return 1

    token_patterns = _compile_wildcard_patterns(WILDCARD_PATTERNS)
    cable_patterns = _compile_wildcard_patterns(CABLE_WILDCARD_PATTERNS)

    print(f"Found {len(pdfs)} PDF(s). Checking duplicates INSIDE each PDF only...")
    for pdf in pdfs:
        print(f"\n--- {pdf.name} ---")
        try:
            dups = find_duplicates_in_pdf(pdf, token_patterns)
            overuse = find_cable_overuse_in_pdf(pdf, cable_patterns, allowed_occurrences=2)
            report = write_excel_for_pdf(pdf, dups, overuse)
            print(f"Duplicate tokens (appear on >1 page): {len(dups)}")
            print(f"Cable labels overused (total occurrences > 2): {len(overuse)}")
            print(f"Excel written: {report.name}")
        except Exception as exc:
            print(f"ERROR processing {pdf.name}: {exc}", file=sys.stderr)
            return 2
    print("\nDone.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())