317 lines
8.9 KiB
Python
317 lines
8.9 KiB
Python
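"""
Scan each PDF in this folder and find duplicate connection tags (inside the SAME PDF)
based on user-provided wildcard patterns (using '*' as "anything").

A tag is reported as a duplicate when it appears on more than one page of the same PDF.
Cable labels are checked separately and reported when they occur more than 2 times in a PDF.

Output: one Excel file per PDF: <PDF_NAME>_duplicates.xlsx
(sheets: Duplicates, CableOveruse, Summary)
"""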
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Set, Tuple
|
|
|
|
|
|
# ---------------------------------------------------------------
# USER PATTERNS (wildcards)
# ---------------------------------------------------------------
# Edit this list to choose which tags take part in the duplicate check.
# A '*' stands for any run of characters; every other character has to
# match literally.
WILDCARD_PATTERNS: List[str] = [
    "*_*_TPE*",
    # ..._PB / ..._PB_LT style tags
    "*_*_S*_PB",
    "*_*_S*_PB_LT",
    "*_*_JR*_PB",
    "*_*_JR*_PB_LT",
    "*_*_SS*_SPB",
    "*_*_SS*_STPB",
    "*_*_SS*_SPB_LT",
    "*_*_EN*_PB",
    "*_*_EN*_PB_LT",
    # ...PE* tags
    "*_*_PE*",
    "*_*_LPE*",
    "*_*_FPE*",
    # BCN tags with a one-letter suffix
    "*_*_BCN*_R",
    "*_*_BCN*_B",
    "*_*_BCN*_A",
    "*_*_BCN*_G",
    "*_*_BCN*_H",
    # EPC tags with a numeric suffix
    "*_*_EPC*_1",
    "*_*_EPC*_2",
    # Fixed suffixes
    "*_*_VFD1_DISC",
    "*_*_*_STO1",
    "*_*_*_ESTOP1",
    # Open-ended prefixes
    "*_*_LS*",
    "*_*_ENC*",
    "*_*_ENW*",
    "*_*_ENS*",
    "*_*_PX*",
    "*_*_SOL*",
    "*_*_DIV*",
    "*_*_PS*",
    "*_*_BDS*",
    "*_*_TS*",
]
|
|
|
|
# ---------------------------------------------------------------
# CABLE PATTERNS (separate check)
# ---------------------------------------------------------------
# These labels are counted independently of the page-duplicate check.
# Rule: a cable label that occurs more than 2 times in the SAME PDF is
# reported as duplicated/overused.
CABLE_WILDCARD_PATTERNS: List[str] = [
    # VFD labels
    "*_*_VFD*_I0",
    "*_*_VFD*_I1",
    "*_*_VFD*_I2",
    "*_*_VFD*_I3",
    "*_*_VFD*_IO0",
    "*_*_VFD*_IO1",
    "*_*_VFD*_SI0",
    "*_*_VFD*_SI1",
    "*_*_VFD*_SI2",
    "*_*_VFD*_SI3",
    "*_*_VFD*_SO0",
    # FIO labels (P0..P7, C0..C15)
    "*_FIO*_P0_C0",
    "*_FIO*_P0_C1",
    "*_FIO*_P1_C2",
    "*_FIO*_P1_C3",
    "*_FIO*_P2_C4",
    "*_FIO*_P2_C5",
    "*_FIO*_P3_C6",
    "*_FIO*_P3_C7",
    "*_FIO*_P4_C8",
    "*_FIO*_P4_C9",
    "*_FIO*_P5_C10",
    "*_FIO*_P5_C11",
    "*_FIO*_P6_C12",
    "*_FIO*_P6_C13",
    "*_FIO*_P7_C14",
    "*_FIO*_P7_C15",
    # FIOH labels (C1..C8, suffix A/B)
    "*_FIOH*_C7_A",
    "*_FIOH*_C7_B",
    "*_FIOH*_C5_A",
    "*_FIOH*_C5_B",
    "*_FIOH*_C3_A",
    "*_FIOH*_C3_B",
    "*_FIOH*_C1_A",
    "*_FIOH*_C1_B",
    "*_FIOH*_C8_A",
    "*_FIOH*_C8_B",
    "*_FIOH*_C6_A",
    "*_FIOH*_C6_B",
    "*_FIOH*_C4_A",
    "*_FIOH*_C4_B",
    "*_FIOH*_C2_A",
    "*_FIOH*_C2_B",
]
|
|
|
|
|
|
# A candidate token is two or more runs of letters/digits joined by single
# underscores, e.g. "PS3_2_VFD1_DISC". Matching is case-insensitive.
TOKEN_RE = re.compile(r"\b[A-Z0-9]+(?:_[A-Z0-9]+)+\b", re.IGNORECASE)
|
|
|
|
|
|
def _compile_wildcard_patterns(patterns: Iterable[str]) -> List[re.Pattern]:
|
|
compiled: List[re.Pattern] = []
|
|
for p in patterns:
|
|
# Treat everything literally except '*' which becomes '.*'
|
|
parts = [re.escape(x) for x in p.split("*")]
|
|
regex = ".*".join(parts)
|
|
# Match full token
|
|
compiled.append(re.compile(rf"^{regex}$", re.IGNORECASE))
|
|
return compiled
|
|
|
|
|
|
def _tokenize(text: str) -> List[str]:
    """Return every candidate token found in *text*, upper-cased, in order of appearance."""
    # Drop soft hyphens before matching.
    cleaned = text.replace("\u00ad", "")
    # PDF text extraction can leave whitespace/newlines next to an underscore;
    # close those gaps, e.g. "PS3_2_VFD1_\nDISC" -> "PS3_2_VFD1_DISC".
    cleaned = re.sub(r"\s*_\s*", "_", cleaned)

    tokens: List[str] = []
    for found in TOKEN_RE.finditer(cleaned):
        tokens.append(found.group(0).upper())
    return tokens
|
|
|
|
|
|
def _ensure_deps() -> Tuple[object, object]:
|
|
"""
|
|
Returns (fitz_module, pandas_module). Exits with helpful message if missing.
|
|
"""
|
|
try:
|
|
import fitz # PyMuPDF
|
|
except Exception:
|
|
print(
|
|
"Missing dependency: PyMuPDF\n"
|
|
"Install with:\n"
|
|
" python -m pip install --upgrade pip\n"
|
|
" python -m pip install pymupdf\n",
|
|
file=sys.stderr,
|
|
)
|
|
raise
|
|
|
|
try:
|
|
import pandas as pd
|
|
except Exception:
|
|
print(
|
|
"Missing dependency: pandas (and openpyxl for Excel)\n"
|
|
"Install with:\n"
|
|
" python -m pip install --upgrade pip\n"
|
|
" python -m pip install pandas openpyxl\n",
|
|
file=sys.stderr,
|
|
)
|
|
raise
|
|
|
|
return fitz, pd
|
|
|
|
|
|
@dataclass
class Occurrence:
    """Running tally for one token: which pages it was seen on and how often."""

    # The token text this record belongs to.
    token: str
    # Page numbers (1-based) on which the token was seen.
    pages: Set[int]
    # Total number of times the token was seen.
    count: int
|
|
|
|
|
|
def _collect_occurrences(
    pdf_path: Path,
    compiled_patterns: List[re.Pattern],
) -> Dict[str, Occurrence]:
    """Scan every page of a PDF and tally the tokens that match any pattern.

    Args:
        pdf_path: PDF file to read.
        compiled_patterns: Patterns as produced by ``_compile_wildcard_patterns``;
            a token is kept when ``pattern.match(token)`` succeeds for at
            least one of them.

    Returns:
        Mapping of token -> ``Occurrence`` holding the 1-based page numbers
        and the total number of appearances of that token.
    """
    fitz, _ = _ensure_deps()

    occurrences: Dict[str, Occurrence] = {}
    # Pattern verdict per distinct token, so a token that is repeated many
    # times is only tested against the pattern list once.
    verdicts: Dict[str, bool] = {}

    with fitz.open(pdf_path) as doc:
        for page_index in range(doc.page_count):
            text = doc.load_page(page_index).get_text("text") or ""
            for token in _tokenize(text):
                wanted = verdicts.get(token)
                if wanted is None:
                    wanted = any(p.match(token) for p in compiled_patterns)
                    verdicts[token] = wanted
                if not wanted:
                    continue

                entry = occurrences.get(token)
                if entry is None:
                    entry = occurrences[token] = Occurrence(token=token, pages=set(), count=0)
                entry.pages.add(page_index + 1)
                entry.count += 1

    return occurrences


def find_duplicates_in_pdf(pdf_path: Path, compiled_patterns: List[re.Pattern]) -> List[Occurrence]:
    """Return the matching tokens that appear on more than one page of a PDF.

    Args:
        pdf_path: PDF file to scan.
        compiled_patterns: Compiled wildcard patterns selecting the tokens
            that take part in the check.

    Returns:
        One ``Occurrence`` per duplicated token, ordered by number of pages
        (descending), then total occurrences (descending), then token text.
    """
    occurrences = _collect_occurrences(pdf_path, compiled_patterns)

    # "Duplicate" = appears on more than one page; repeats on a single page
    # do not count.
    dups = [o for o in occurrences.values() if len(o.pages) > 1]
    dups.sort(key=lambda o: (-len(o.pages), -o.count, o.token))
    return dups


def find_cable_overuse_in_pdf(
    pdf_path: Path,
    compiled_patterns: List[re.Pattern],
    *,
    allowed_occurrences: int = 2,
) -> List[Occurrence]:
    """Return the matching cable labels that occur too many times in a PDF.

    Separate check from the page-duplicate logic:
    - Cable labels are often printed twice (both ends) => OK up to
      ``allowed_occurrences`` (default 2).
    - If total occurrences > ``allowed_occurrences``, the label is flagged.

    Args:
        pdf_path: PDF file to scan.
        compiled_patterns: Compiled wildcard patterns selecting cable labels.
        allowed_occurrences: Highest total count that is still acceptable.

    Returns:
        One ``Occurrence`` per flagged label, ordered by total occurrences
        (descending), then number of pages (descending), then label text.
    """
    occurrences = _collect_occurrences(pdf_path, compiled_patterns)

    overused = [o for o in occurrences.values() if o.count > allowed_occurrences]
    overused.sort(key=lambda o: (-o.count, -len(o.pages), o.token))
    return overused
|
|
|
|
|
|
def write_excel_for_pdf(
    pdf_path: Path,
    duplicates: List[Occurrence],
    cable_overuse: List[Occurrence],
) -> Path:
    """Write the findings for one PDF to ``<PDF stem>_duplicates.xlsx`` next to it.

    The workbook contains three sheets, written in this order:
    "Duplicates" (one row per entry of *duplicates*), "CableOveruse" (one row
    per entry of *cable_overuse*) and "Summary" (a single row with the PDF
    name and both counts).

    Returns:
        Path of the workbook that was written.
    """
    _, pd = _ensure_deps()

    def _as_row(label_column: str, item: Occurrence) -> Dict[str, object]:
        # Both detail sheets share one layout; only the first column's name differs.
        return {
            label_column: item.token,
            "Pages": ", ".join(str(number) for number in sorted(item.pages)),
            "UniquePagesCount": len(item.pages),
            "TotalOccurrences": item.count,
        }

    duplicates_frame = pd.DataFrame(
        [_as_row("Token", entry) for entry in duplicates],
        columns=["Token", "Pages", "UniquePagesCount", "TotalOccurrences"],
    )
    cable_frame = pd.DataFrame(
        [_as_row("CableLabel", entry) for entry in cable_overuse],
        columns=["CableLabel", "Pages", "UniquePagesCount", "TotalOccurrences"],
    )
    summary_frame = pd.DataFrame(
        [
            {
                "PDF": pdf_path.name,
                "DuplicateTokens": len(duplicates),
                "CableOverusedLabels": len(cable_overuse),
            },
        ]
    )

    target = pdf_path.with_name(pdf_path.stem + "_duplicates.xlsx")
    with pd.ExcelWriter(target, engine="openpyxl") as writer:
        duplicates_frame.to_excel(writer, index=False, sheet_name="Duplicates")
        cable_frame.to_excel(writer, index=False, sheet_name="CableOveruse")
        summary_frame.to_excel(writer, index=False, sheet_name="Summary")

    return target
|
|
|
|
|
|
def main() -> int:
    """Process every ``*.pdf`` next to this script and write one Excel report per PDF.

    Returns:
        Exit code for the process: 0 when every PDF was processed, 1 when the
        folder contains no PDFs, 2 when at least one PDF could not be
        processed (or a required dependency is missing).
    """
    base_dir = Path(__file__).resolve().parent
    pdfs = sorted(base_dir.glob("*.pdf"))
    if not pdfs:
        print(f"No PDFs found in: {base_dir}")
        return 1

    compiled_patterns = _compile_wildcard_patterns(WILDCARD_PATTERNS)
    compiled_cable_patterns = _compile_wildcard_patterns(CABLE_WILDCARD_PATTERNS)

    print(f"Found {len(pdfs)} PDF(s). Checking duplicates INSIDE each PDF only...")
    failed: List[str] = []
    for pdf in pdfs:
        print(f"\n--- {pdf.name} ---")
        try:
            dups = find_duplicates_in_pdf(pdf, compiled_patterns)
            cable_overuse = find_cable_overuse_in_pdf(pdf, compiled_cable_patterns, allowed_occurrences=2)
            out_xlsx = write_excel_for_pdf(pdf, dups, cable_overuse)
        except ImportError as e:
            # A missing dependency would fail identically for every remaining
            # PDF, so stop immediately instead of repeating the same error.
            print(f"ERROR processing {pdf.name}: {e}", file=sys.stderr)
            return 2
        except Exception as e:
            # One unreadable PDF (or a locked output file) must not prevent
            # the remaining PDFs from being checked; remember it and go on.
            print(f"ERROR processing {pdf.name}: {e}", file=sys.stderr)
            failed.append(pdf.name)
        else:
            print(f"Duplicate tokens (appear on >1 page): {len(dups)}")
            print(f"Cable labels overused (total occurrences > 2): {len(cable_overuse)}")
            print(f"Excel written: {out_xlsx.name}")

    if failed:
        print(
            f"\nFinished with errors in {len(failed)} of {len(pdfs)} PDF(s): {', '.join(failed)}",
            file=sys.stderr,
        )
        return 2

    print("\nDone.")
    return 0
|
|
|
|
|
|
# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
|