""" Scan each PDF in this folder and find duplicate connection tags (inside the SAME PDF) based on user-provided wildcard patterns (using '*' as "anything"). Output: one Excel file per PDF: _duplicates.xlsx """ from __future__ import annotations import re import sys from dataclasses import dataclass from pathlib import Path from typing import Dict, Iterable, List, Set, Tuple # ----------------------------- # USER PATTERNS (wildcards) # ----------------------------- # '*' means "any characters". Everything else is treated literally. WILDCARD_PATTERNS: List[str] = [ "*_*_TPE*", "*_*_S*_PB", "*_*_S*_PB_LT", "*_*_JR*_PB", "*_*_JR*_PB_LT", "*_*_SS*_SPB", "*_*_SS*_STPB", "*_*_SS*_SPB_LT", "*_*_EN*_PB", "*_*_EN*_PB_LT", "*_*_PE*", "*_*_LPE*", "*_*_FPE*", "*_*_BCN*_R", "*_*_BCN*_B", "*_*_BCN*_A", "*_*_BCN*_G", "*_*_BCN*_H", "*_*_EPC*_1", "*_*_EPC*_2", "*_*_VFD1_DISC", "*_*_*_STO1", "*_*_*_ESTOP1", "*_*_LS*", "*_*_ENC*", "*_*_ENW*", "*_*_ENS*", "*_*_PX*", "*_*_SOL*", "*_*_DIV*", "*_*_PS*", "*_*_BDS*", "*_*_TS*", ] # ----------------------------- # CABLE PATTERNS (separate check) # ----------------------------- # Rule: if a cable label appears more than 2 times in the SAME PDF => duplicated/overused. CABLE_WILDCARD_PATTERNS: List[str] = [ "*_*_VFD*_I0", "*_*_VFD*_I1", "*_*_VFD*_I2", "*_*_VFD*_I3", "*_*_VFD*_IO0", "*_*_VFD*_IO1", "*_*_VFD*_SI0", "*_*_VFD*_SI1", "*_*_VFD*_SI2", "*_*_VFD*_SI3", "*_*_VFD*_SO0", "*_FIO*_P0_C0", "*_FIO*_P0_C1", "*_FIO*_P1_C2", "*_FIO*_P1_C3", "*_FIO*_P2_C4", "*_FIO*_P2_C5", "*_FIO*_P3_C6", "*_FIO*_P3_C7", "*_FIO*_P4_C8", "*_FIO*_P4_C9", "*_FIO*_P5_C10", "*_FIO*_P5_C11", "*_FIO*_P6_C12", "*_FIO*_P6_C13", "*_FIO*_P7_C14", "*_FIO*_P7_C15", "*_FIOH*_C7_A", "*_FIOH*_C7_B", "*_FIOH*_C5_A", "*_FIOH*_C5_B", "*_FIOH*_C3_A", "*_FIOH*_C3_B", "*_FIOH*_C1_A", "*_FIOH*_C1_B", "*_FIOH*_C8_A", "*_FIOH*_C8_B", "*_FIOH*_C6_A", "*_FIOH*_C6_B", "*_FIOH*_C4_A", "*_FIOH*_C4_B", "*_FIOH*_C2_A", "*_FIOH*_C2_B", ] # Candidate token: something like "PS3_2_VFD1_DISC" (>= 2 underscore-separated parts) TOKEN_RE = re.compile(r"\b[A-Z0-9]+(?:_[A-Z0-9]+)+\b", re.IGNORECASE) def _compile_wildcard_patterns(patterns: Iterable[str]) -> List[re.Pattern]: compiled: List[re.Pattern] = [] for p in patterns: # Treat everything literally except '*' which becomes '.*' parts = [re.escape(x) for x in p.split("*")] regex = ".*".join(parts) # Match full token compiled.append(re.compile(rf"^{regex}$", re.IGNORECASE)) return compiled def _tokenize(text: str) -> List[str]: # Normalize common oddities text = text.replace("\u00ad", "") # soft hyphen # PDFs sometimes insert whitespace/newlines around underscores; normalize that. # Example: "PS3_2_VFD1_\nDISC" -> "PS3_2_VFD1_DISC" text = re.sub(r"\s*_\s*", "_", text) return [m.group(0).upper() for m in TOKEN_RE.finditer(text)] def _ensure_deps() -> Tuple[object, object]: """ Returns (fitz_module, pandas_module). Exits with helpful message if missing. 
""" try: import fitz # PyMuPDF except Exception: print( "Missing dependency: PyMuPDF\n" "Install with:\n" " python -m pip install --upgrade pip\n" " python -m pip install pymupdf\n", file=sys.stderr, ) raise try: import pandas as pd except Exception: print( "Missing dependency: pandas (and openpyxl for Excel)\n" "Install with:\n" " python -m pip install --upgrade pip\n" " python -m pip install pandas openpyxl\n", file=sys.stderr, ) raise return fitz, pd @dataclass class Occurrence: token: str pages: Set[int] # 1-based page numbers count: int def find_duplicates_in_pdf(pdf_path: Path, compiled_patterns: List[re.Pattern]) -> List[Occurrence]: fitz, _ = _ensure_deps() occurrences: Dict[str, Occurrence] = {} with fitz.open(pdf_path) as doc: for page_index in range(doc.page_count): page = doc.load_page(page_index) text = page.get_text("text") or "" tokens = _tokenize(text) if not tokens: continue for t in tokens: if not any(r.match(t) for r in compiled_patterns): continue if t not in occurrences: occurrences[t] = Occurrence(token=t, pages=set(), count=0) occurrences[t].pages.add(page_index + 1) occurrences[t].count += 1 # "Duplicate" = appears on more than one page (what you asked for) dups = [o for o in occurrences.values() if len(o.pages) > 1] dups.sort(key=lambda o: (-len(o.pages), -o.count, o.token)) return dups def find_cable_overuse_in_pdf( pdf_path: Path, compiled_patterns: List[re.Pattern], *, allowed_occurrences: int = 2, ) -> List[Occurrence]: """ Separate check from page-duplicate logic: - Cable labels are often printed twice (both ends) => OK up to allowed_occurrences (default 2). - If total occurrences > allowed_occurrences, flag it. """ fitz, _ = _ensure_deps() occurrences: Dict[str, Occurrence] = {} with fitz.open(pdf_path) as doc: for page_index in range(doc.page_count): page = doc.load_page(page_index) text = page.get_text("text") or "" tokens = _tokenize(text) if not tokens: continue for t in tokens: if not any(r.match(t) for r in compiled_patterns): continue if t not in occurrences: occurrences[t] = Occurrence(token=t, pages=set(), count=0) occurrences[t].pages.add(page_index + 1) occurrences[t].count += 1 overused = [o for o in occurrences.values() if o.count > allowed_occurrences] overused.sort(key=lambda o: (-o.count, -len(o.pages), o.token)) return overused def write_excel_for_pdf( pdf_path: Path, duplicates: List[Occurrence], cable_overuse: List[Occurrence], ) -> Path: _, pd = _ensure_deps() out_path = pdf_path.with_name(pdf_path.stem + "_duplicates.xlsx") rows = [] for d in duplicates: rows.append( { "Token": d.token, "Pages": ", ".join(map(str, sorted(d.pages))), "UniquePagesCount": len(d.pages), "TotalOccurrences": d.count, } ) df = pd.DataFrame(rows, columns=["Token", "Pages", "UniquePagesCount", "TotalOccurrences"]) cable_rows = [] for c in cable_overuse: cable_rows.append( { "CableLabel": c.token, "Pages": ", ".join(map(str, sorted(c.pages))), "UniquePagesCount": len(c.pages), "TotalOccurrences": c.count, } ) cable_df = pd.DataFrame( cable_rows, columns=["CableLabel", "Pages", "UniquePagesCount", "TotalOccurrences"] ) with pd.ExcelWriter(out_path, engine="openpyxl") as writer: df.to_excel(writer, index=False, sheet_name="Duplicates") cable_df.to_excel(writer, index=False, sheet_name="CableOveruse") summary = pd.DataFrame( [ { "PDF": pdf_path.name, "DuplicateTokens": len(duplicates), "CableOverusedLabels": len(cable_overuse), }, ] ) summary.to_excel(writer, index=False, sheet_name="Summary") return out_path def main() -> int: base_dir = 

def main() -> int:
    base_dir = Path(__file__).resolve().parent
    pdfs = sorted(base_dir.glob("*.pdf"))
    if not pdfs:
        print(f"No PDFs found in: {base_dir}")
        return 1

    compiled_patterns = _compile_wildcard_patterns(WILDCARD_PATTERNS)
    compiled_cable_patterns = _compile_wildcard_patterns(CABLE_WILDCARD_PATTERNS)

    print(f"Found {len(pdfs)} PDF(s). Checking duplicates INSIDE each PDF only...")
    for pdf in pdfs:
        print(f"\n--- {pdf.name} ---")
        try:
            dups = find_duplicates_in_pdf(pdf, compiled_patterns)
            cable_overuse = find_cable_overuse_in_pdf(
                pdf, compiled_cable_patterns, allowed_occurrences=2
            )
            out_xlsx = write_excel_for_pdf(pdf, dups, cable_overuse)
            print(f"Duplicate tokens (appear on >1 page): {len(dups)}")
            print(f"Cable labels overused (total occurrences > 2): {len(cable_overuse)}")
            print(f"Excel written: {out_xlsx.name}")
        except Exception as e:
            print(f"ERROR processing {pdf.name}: {e}", file=sys.stderr)
            return 2

    print("\nDone.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
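
# Note: cable labels printed at both cable ends count as two legitimate
# occurrences, which is why main() passes allowed_occurrences=2. If a drawing
# set legitimately prints each label three times (a hypothetical layout),
# loosen the threshold there:
#
#   find_cable_overuse_in_pdf(pdf, compiled_cable_patterns, allowed_occurrences=3)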