MonitorProgress/pdf_manifest_checker.py
ilia.gurielidze@autStand.com d51d597e87 No idea
2025-04-09 19:09:01 +04:00

219 lines
10 KiB
Python

# pdf_manifest_checker.py
import csv
import os
from pypdf import PdfReader
import sys
import re # Import the regex module
def normalize(text):
    """Normalize a string for loose comparison.

    The result is lowercased, has every '-' replaced by '_', and has all
    whitespace characters removed.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or "" when *text* is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = text.replace('-', '_')  # Treat hyphens and underscores the same
    # In a raw string the whitespace class is a single backslash + 's'.
    # A doubled backslash would match a literal '\' followed by 's' characters
    # and leave real whitespace untouched.
    text = re.sub(r'\s+', '', text)  # Remove ALL whitespace characters
    return text
def read_aliases_from_manifest(csv_filepath, alias_column_name='Alias'):
    """Read one column of a CSV manifest into a set of stripped, non-empty values.

    Args:
        csv_filepath: Path of the CSV file to read.
        alias_column_name: Header name of the column holding the aliases.

    Returns:
        A set of alias strings with surrounding whitespace removed, or None
        when the file is missing, cannot be read, or lacks the column
        (an error message is printed in each of those cases).
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading
        # byte-order mark, which would otherwise become part of the first
        # header name and make the column lookup fail.
        with open(csv_filepath, mode='r', newline='', encoding='utf-8-sig') as infile:
            reader = csv.DictReader(infile)
            # fieldnames is None for a completely empty file.
            fieldnames = reader.fieldnames or []
            if alias_column_name not in fieldnames:
                print(f"Error: Column '{alias_column_name}' not found in {csv_filepath}")
                print(f"Available columns: {', '.join(fieldnames)}")
                return None
            aliases = set()
            for row in reader:
                # Short rows yield None for missing cells; strip BEFORE the
                # emptiness test so whitespace-only cells are not stored as "".
                alias = (row[alias_column_name] or '').strip()
                if alias:
                    aliases.add(alias)
            return aliases
    except FileNotFoundError:
        print(f"Error: Manifest file not found at {csv_filepath}")
        return None
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading CSV file {csv_filepath}: {e}")
        return None
def extract_text_from_pdfs(pdf_folder, text_output_folder):
    """
    Extract text from all PDF files in a folder and return the combined text.

    For each ``*.pdf`` file (case-insensitive extension) a sibling ``.txt``
    file with the same stem is looked up in *text_output_folder*. If it exists
    and is readable its content is used instead of re-parsing the PDF.
    Otherwise the PDF is read with ``PdfReader`` and the extracted text is
    written to that ``.txt`` file for later runs.

    Args:
        pdf_folder: Folder that is scanned (non-recursively) for PDF files.
        text_output_folder: Folder holding the cached ``.txt`` files; created
            if it does not exist.

    Returns:
        The concatenated text of all PDFs, in sorted filename order; "" when
        the folder contains no PDF files; None when *pdf_folder* is not a
        directory.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: PDF folder not found at {pdf_folder}")
        return None

    # Ensure the output directory for text files exists
    os.makedirs(text_output_folder, exist_ok=True)
    print(f"Saving/reading extracted text files to/from: '{text_output_folder}'")
    print(f"\nScanning PDF files in '{pdf_folder}'...")

    # os.listdir() returns entries in arbitrary order; sort so that the log
    # output and the combined text are reproducible between runs.
    pdf_filenames = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')
    )
    if not pdf_filenames:
        print(f"Warning: No PDF files found in the folder '{pdf_folder}'.")
        return ""  # Return empty string if no PDFs found

    text_chunks = []  # joined once at the end instead of repeated '+='
    processed_count = 0
    skipped_count = 0

    for filename in pdf_filenames:
        filepath = os.path.join(pdf_folder, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_filepath = os.path.join(text_output_folder, txt_filename)

        # --- Reuse a previously extracted .txt file when possible ---
        if os.path.exists(txt_filepath):
            try:
                with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
                    cached_text = txt_file.read()
            except (OSError, UnicodeError) as read_e:
                # Fall through to re-processing the PDF below.
                print(f" Warning: Could not read existing text file {txt_filename}. Will re-process PDF. Error: {read_e}")
            else:
                print(f" Skipping PDF processing for {filename}, using existing text from {txt_filename}.")
                text_chunks.append(cached_text)
                skipped_count += 1
                continue

        # --- Process PDF if .txt doesn't exist or couldn't be read ---
        print(f" Processing PDF: {filename}...")
        processed_count += 1
        page_texts = []
        try:
            reader = PdfReader(filepath)
            for page_num, page in enumerate(reader.pages, start=1):
                try:
                    page_text = page.extract_text()
                except Exception as page_e:
                    # One unreadable page should not discard the whole file.
                    print(f" Warning: Could not extract text from page {page_num} of {filename}. Error: {page_e}")
                    continue
                if page_text:  # Append only if text was extracted
                    page_texts.append(page_text + "\n")
        except Exception as e:
            # Best-effort: the PDF parser can fail in many ways on malformed
            # files, so any failure skips this file and the scan continues.
            print(f" Error processing {filename}. Skipping this file. Error: {e}")
            continue

        current_pdf_text = "".join(page_texts)
        if not current_pdf_text:
            print(f" Warning: No text extracted from {filename}.")
            continue

        # Cache the newly extracted text for subsequent runs.
        try:
            with open(txt_filepath, 'w', encoding='utf-8') as txt_file:
                txt_file.write(current_pdf_text)
            print(f" Saved extracted text to {txt_filename}")
        except (OSError, UnicodeError) as write_e:
            print(f" Warning: Could not write text file {txt_filename}. Error: {write_e}")
        # The text is kept even if saving failed, so comparison can proceed.
        text_chunks.append(current_pdf_text)

    print(f"\nPDF processing summary: Processed {processed_count} new PDFs, skipped {skipped_count} (used existing text files).")
    return "".join(text_chunks)
def find_missing_aliases(aliases, pdf_text):
    """Report which manifest aliases do not occur in the combined PDF text.

    Both the full PDF text and each alias are passed through ``normalize()``;
    an alias counts as present when its normalized form is a substring of the
    normalized text. Aliases whose normalized form is empty are ignored.

    Args:
        aliases: Iterable of alias strings taken from the manifest.
        pdf_text: Combined text of all PDFs.

    Returns:
        A sorted list of the ORIGINAL (un-normalized) aliases that were not
        found, or None when either input is None.
    """
    if aliases is None or pdf_text is None:
        print("Error: Missing aliases or PDF text input.")
        return None  # Propagate error state

    # The text blob is normalized a single time and reused for every alias.
    print("Normalizing the combined PDF text...")
    haystack = normalize(pdf_text)
    print("Normalization complete.")

    print("\nComparing normalized manifest aliases against normalized PDF text blob (substring check)...")
    not_found = set()
    for original_alias in aliases:
        needle = normalize(original_alias)
        # Skip aliases that normalize to nothing; record the rest only when
        # the normalized form is absent from the normalized text.
        if needle and needle not in haystack:
            not_found.add(original_alias)

    return sorted(not_found)  # Sorted for consistent output
if __name__ == "__main__":
    # --- Configuration ---
    # Defaults live next to this script; two positional arguments override
    # the manifest path and the PDF folder. The text cache folder always
    # stays at its default location.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    manifest_file_path = os.path.join(base_dir, 'MTN6 Equipment Manifest REV6(Conveyor List).csv')
    pdf_folder_path = os.path.join(base_dir, 'pdfs')
    text_output_folder_path = os.path.join(base_dir, 'extracted_texts')

    arg_count = len(sys.argv)
    if arg_count == 1:
        # No arguments provided, use defaults
        print("Using default paths:")
        print(f" Manifest: {manifest_file_path}")
        print(f" PDF Folder: {pdf_folder_path}")
        print(f" Text Output Folder: {text_output_folder_path}")
    elif arg_count == 3:
        manifest_file_path, pdf_folder_path = sys.argv[1:3]
        print("Using command-line paths for Manifest and PDFs.")
        print(f" Manifest: {manifest_file_path}")
        print(f" PDF Folder: {pdf_folder_path}")
        print(f" Text Output Folder: {text_output_folder_path} (Default)")
    else:
        # Any other argument count is a usage error.
        print("Usage: python pdf_manifest_checker.py [<path_to_manifest.csv> <path_to_pdf_folder>]")
        print("If no arguments are provided, default paths will be used.")
        sys.exit(1)
    # ---------------------

    # Each stage signals failure by returning None; the first failure prints
    # its message and the remaining stages are skipped.
    manifest_aliases = read_aliases_from_manifest(manifest_file_path)
    if manifest_aliases is None:
        print("\nManifest reading failed. Cannot proceed.")
    else:
        combined_text = extract_text_from_pdfs(pdf_folder_path, text_output_folder_path)
        if combined_text is None:
            print("\nText extraction failed or produced no text. Cannot proceed with comparison.")
        else:
            missing_names = find_missing_aliases(manifest_aliases, combined_text)
            if missing_names is None:
                print("\nComparison could not be completed due to previous errors.")
            else:
                print("\n--- Report ---")
                if missing_names:
                    print("The following aliases from the manifest were NOT found in the combined PDF text:")
                    for missing_name in missing_names:
                        print(f"- {missing_name}")
                else:
                    print("All aliases from the manifest were found in the combined PDF text.")
                print(f"\nTotal aliases in manifest: {len(manifest_aliases)}")
                print(f"Total missing aliases: {len(missing_names)}")

    print("\nScript finished.")