# Scripts/Additional/duplicate_PDF/check_duplicate.py
"""
Scan each PDF in this folder and find duplicate connection tags (inside the SAME PDF)
based on user-provided wildcard patterns (using '*' as "anything").
Output: one Excel file per PDF: <PDF_NAME>_duplicates.xlsx
"""
from __future__ import annotations
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Set, Tuple
# -----------------------------
# USER PATTERNS (wildcards)
# -----------------------------
# '*' means "any characters" (including none). Everything else is literal.
# Each pattern is matched case-insensitively against a FULL token
# (see _compile_wildcard_patterns, which anchors with ^...$).
# Tokens look like "PS3_2_TPE1": underscore-separated [A-Z0-9] parts.
WILDCARD_PATTERNS: List[str] = [
"*_*_TPE*",
"*_*_S*_PB",
"*_*_S*_PB_LT",
"*_*_JR*_PB",
"*_*_JR*_PB_LT",
"*_*_SS*_SPB",
"*_*_SS*_STPB",
"*_*_SS*_SPB_LT",
"*_*_EN*_PB",
"*_*_EN*_PB_LT",
"*_*_PE*",
"*_*_LPE*",
"*_*_FPE*",
"*_*_BCN*_R",
"*_*_BCN*_B",
"*_*_BCN*_A",
"*_*_BCN*_G",
"*_*_BCN*_H",
"*_*_EPC*_1",
"*_*_EPC*_2",
"*_*_VFD1_DISC",
"*_*_*_STO1",
"*_*_*_ESTOP1",
"*_*_LS*",
"*_*_ENC*",
"*_*_ENW*",
"*_*_ENS*",
"*_*_PX*",
"*_*_SOL*",
"*_*_DIV*",
"*_*_PS*",
"*_*_BDS*",
"*_*_TS*",
]
# -----------------------------
# CABLE PATTERNS (separate check)
# -----------------------------
# Rule: a cable label is normally printed once per cable end (i.e. twice);
# if it appears more than 2 times in the SAME PDF => duplicated/overused.
# Same wildcard syntax as WILDCARD_PATTERNS above.
CABLE_WILDCARD_PATTERNS: List[str] = [
"*_*_VFD*_I0",
"*_*_VFD*_I1",
"*_*_VFD*_I2",
"*_*_VFD*_I3",
"*_*_VFD*_IO0",
"*_*_VFD*_IO1",
"*_*_VFD*_SI0",
"*_*_VFD*_SI1",
"*_*_VFD*_SI2",
"*_*_VFD*_SI3",
"*_*_VFD*_SO0",
"*_FIO*_P0_C0",
"*_FIO*_P0_C1",
"*_FIO*_P1_C2",
"*_FIO*_P1_C3",
"*_FIO*_P2_C4",
"*_FIO*_P2_C5",
"*_FIO*_P3_C6",
"*_FIO*_P3_C7",
"*_FIO*_P4_C8",
"*_FIO*_P4_C9",
"*_FIO*_P5_C10",
"*_FIO*_P5_C11",
"*_FIO*_P6_C12",
"*_FIO*_P6_C13",
"*_FIO*_P7_C14",
"*_FIO*_P7_C15",
"*_FIOH*_C7_A",
"*_FIOH*_C7_B",
"*_FIOH*_C5_A",
"*_FIOH*_C5_B",
"*_FIOH*_C3_A",
"*_FIOH*_C3_B",
"*_FIOH*_C1_A",
"*_FIOH*_C1_B",
"*_FIOH*_C8_A",
"*_FIOH*_C8_B",
"*_FIOH*_C6_A",
"*_FIOH*_C6_B",
"*_FIOH*_C4_A",
"*_FIOH*_C4_B",
"*_FIOH*_C2_A",
"*_FIOH*_C2_B",
]
# Candidate token: something like "PS3_2_VFD1_DISC" — at least two
# underscore-joined runs of [A-Z0-9]. \b keeps matches whole-word;
# IGNORECASE lets lowercase PDF text through (tokens are uppercased later).
TOKEN_RE = re.compile(r"\b[A-Z0-9]+(?:_[A-Z0-9]+)+\b", re.IGNORECASE)
def _compile_wildcard_patterns(patterns: Iterable[str]) -> List[re.Pattern]:
compiled: List[re.Pattern] = []
for p in patterns:
# Treat everything literally except '*' which becomes '.*'
parts = [re.escape(x) for x in p.split("*")]
regex = ".*".join(parts)
# Match full token
compiled.append(re.compile(rf"^{regex}$", re.IGNORECASE))
return compiled
def _tokenize(text: str) -> List[str]:
# Normalize common oddities
text = text.replace("\u00ad", "") # soft hyphen
# PDFs sometimes insert whitespace/newlines around underscores; normalize that.
# Example: "PS3_2_VFD1_\nDISC" -> "PS3_2_VFD1_DISC"
text = re.sub(r"\s*_\s*", "_", text)
return [m.group(0).upper() for m in TOKEN_RE.finditer(text)]
def _ensure_deps() -> Tuple[object, object]:
    """Import and return the (fitz, pandas) modules.

    When either dependency is missing, a pip install hint is printed to
    stderr and the original import error is re-raised. Python caches
    imports, so calling this repeatedly is cheap.
    """
    pymupdf_hint = (
        "Missing dependency: PyMuPDF\n"
        "Install with:\n"
        " python -m pip install --upgrade pip\n"
        " python -m pip install pymupdf\n"
    )
    pandas_hint = (
        "Missing dependency: pandas (and openpyxl for Excel)\n"
        "Install with:\n"
        " python -m pip install --upgrade pip\n"
        " python -m pip install pandas openpyxl\n"
    )
    try:
        import fitz  # PyMuPDF
    except Exception:
        print(pymupdf_hint, file=sys.stderr)
        raise
    try:
        import pandas as pd
    except Exception:
        print(pandas_hint, file=sys.stderr)
        raise
    return fitz, pd
@dataclass
class Occurrence:
    """Where and how often one token appears within a single PDF."""
    token: str       # normalized (uppercased) token text
    pages: Set[int]  # 1-based page numbers the token appears on
    count: int       # total occurrences across all pages
def find_duplicates_in_pdf(pdf_path: Path, compiled_patterns: List[re.Pattern]) -> List[Occurrence]:
    """Find pattern-matching tokens that appear on more than one page of a PDF.

    Returns Occurrence records sorted by distinct-page count (desc), total
    occurrence count (desc), then token text (asc).
    """
    fitz, _ = _ensure_deps()
    seen: Dict[str, Occurrence] = {}
    with fitz.open(pdf_path) as doc:
        for page_no in range(1, doc.page_count + 1):
            raw = doc.load_page(page_no - 1).get_text("text") or ""
            for token in _tokenize(raw):
                if not any(rx.match(token) for rx in compiled_patterns):
                    continue
                occ = seen.setdefault(token, Occurrence(token=token, pages=set(), count=0))
                occ.pages.add(page_no)
                occ.count += 1
    # A "duplicate" is a token seen on at least two distinct pages.
    duplicates = [occ for occ in seen.values() if len(occ.pages) > 1]
    duplicates.sort(key=lambda occ: (-len(occ.pages), -occ.count, occ.token))
    return duplicates
def find_cable_overuse_in_pdf(
    pdf_path: Path,
    compiled_patterns: List[re.Pattern],
    *,
    allowed_occurrences: int = 2,
) -> List[Occurrence]:
    """Flag cable labels whose TOTAL occurrence count exceeds the allowance.

    Separate check from the page-duplicate logic: cable labels are often
    printed twice (once per cable end), so up to ``allowed_occurrences``
    (default 2) is fine; anything beyond that is flagged. Results are
    sorted by total count (desc), distinct-page count (desc), label (asc).
    """
    fitz, _ = _ensure_deps()
    tally: Dict[str, Occurrence] = {}
    with fitz.open(pdf_path) as doc:
        for idx in range(doc.page_count):
            text = doc.load_page(idx).get_text("text") or ""
            matching = (t for t in _tokenize(text) if any(rx.match(t) for rx in compiled_patterns))
            for label in matching:
                entry = tally.get(label)
                if entry is None:
                    entry = tally[label] = Occurrence(token=label, pages=set(), count=0)
                entry.pages.add(idx + 1)
                entry.count += 1
    flagged = [occ for occ in tally.values() if occ.count > allowed_occurrences]
    flagged.sort(key=lambda occ: (-occ.count, -len(occ.pages), occ.token))
    return flagged
def write_excel_for_pdf(
    pdf_path: Path,
    duplicates: List[Occurrence],
    cable_overuse: List[Occurrence],
) -> Path:
    """Write <PDF_STEM>_duplicates.xlsx next to the PDF and return its path.

    Sheets: "Duplicates" (page-duplicate tokens), "CableOveruse" (overused
    cable labels), and a one-row "Summary".
    """
    _, pd = _ensure_deps()
    out_path = pdf_path.with_name(pdf_path.stem + "_duplicates.xlsx")

    def _to_frame(items: List[Occurrence], label_col: str):
        # Both sheets share the same shape; only the label column name differs.
        records = [
            {
                label_col: occ.token,
                "Pages": ", ".join(map(str, sorted(occ.pages))),
                "UniquePagesCount": len(occ.pages),
                "TotalOccurrences": occ.count,
            }
            for occ in items
        ]
        return pd.DataFrame(records, columns=[label_col, "Pages", "UniquePagesCount", "TotalOccurrences"])

    dup_df = _to_frame(duplicates, "Token")
    cable_df = _to_frame(cable_overuse, "CableLabel")
    summary_df = pd.DataFrame(
        [
            {
                "PDF": pdf_path.name,
                "DuplicateTokens": len(duplicates),
                "CableOverusedLabels": len(cable_overuse),
            },
        ]
    )
    with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
        dup_df.to_excel(writer, index=False, sheet_name="Duplicates")
        cable_df.to_excel(writer, index=False, sheet_name="CableOveruse")
        summary_df.to_excel(writer, index=False, sheet_name="Summary")
    return out_path
def main() -> int:
    """Scan every PDF beside this script and write one report per PDF.

    Exit codes: 0 = success, 1 = no PDFs found, 2 = error while processing
    a PDF (processing stops at the first failure).
    """
    base_dir = Path(__file__).resolve().parent
    pdfs = sorted(base_dir.glob("*.pdf"))
    if not pdfs:
        print(f"No PDFs found in: {base_dir}")
        return 1

    token_patterns = _compile_wildcard_patterns(WILDCARD_PATTERNS)
    cable_patterns = _compile_wildcard_patterns(CABLE_WILDCARD_PATTERNS)

    print(f"Found {len(pdfs)} PDF(s). Checking duplicates INSIDE each PDF only...")
    for pdf in pdfs:
        print(f"\n--- {pdf.name} ---")
        try:
            dups = find_duplicates_in_pdf(pdf, token_patterns)
            overuse = find_cable_overuse_in_pdf(pdf, cable_patterns, allowed_occurrences=2)
            report = write_excel_for_pdf(pdf, dups, overuse)
            print(f"Duplicate tokens (appear on >1 page): {len(dups)}")
            print(f"Cable labels overused (total occurrences > 2): {len(overuse)}")
            print(f"Excel written: {report.name}")
        except Exception as exc:
            print(f"ERROR processing {pdf.name}: {exc}", file=sys.stderr)
            return 2
    print("\nDone.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())