# Listing metadata: 206 lines, 9.6 KiB, Python
# pdf_manifest_checker.py
|
|
|
|
import csv
|
|
import os
|
|
from pypdf import PdfReader
|
|
import sys
|
|
import re # Import the regex module
|
|
import argparse # Import argparse
|
|
|
|
def normalize(text):
    """Normalize a string for loose comparison.

    Lowercases the text, treats '-' and '_' as the same character (both
    become '_'), and removes every whitespace character.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or "" when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower().replace('-', '_')
    # The pattern must be r'\s+'. The earlier r'\\s+' matched a literal
    # backslash followed by 's' characters, so whitespace was never removed.
    return re.sub(r'\s+', '', text)
|
|
|
|
def read_aliases_from_manifest(csv_filepath, alias_column_name='Alias'):
    """Read one column of a CSV manifest into a set of aliases.

    Each value is stripped of surrounding whitespace; values that are
    missing, empty, or whitespace-only are ignored.

    Args:
        csv_filepath: Path to the manifest CSV file (must have a header row).
        alias_column_name: Header of the column holding the aliases.

    Returns:
        A set of non-empty alias strings, or None if the file cannot be
        read, has no header row, or lacks the requested column. Errors are
        reported on stdout rather than raised.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM
        # so the first header is not seen as '\ufeffAlias'.
        with open(csv_filepath, mode='r', newline='', encoding='utf-8-sig') as infile:
            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames
            if fieldnames is None:
                # An empty file has no header row at all.
                print(f"Error: No header row found in {csv_filepath}")
                return None
            if alias_column_name not in fieldnames:
                print(f"Error: Column '{alias_column_name}' not found in {csv_filepath}")
                print(f"Available columns: {', '.join(fieldnames)}")
                return None
            # Short rows yield None for the missing cell; strip BEFORE the
            # emptiness test so whitespace-only cells are not added as ''.
            stripped = ((row[alias_column_name] or '').strip() for row in reader)
            return {alias for alias in stripped if alias}
    except FileNotFoundError:
        print(f"Error: Manifest file not found at {csv_filepath}")
        return None
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error reading CSV file {csv_filepath}: {e}")
        return None
|
|
|
|
def _read_cached_text(txt_filepath, txt_filename):
    """Return the text cached for a PDF, or None if absent or unreadable."""
    if not os.path.exists(txt_filepath):
        return None
    try:
        with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
            return txt_file.read()
    except (OSError, UnicodeError) as read_e:
        print(f" Warning: Could not read existing text file {txt_filename}. Will re-process PDF. Error: {read_e}")
        return None


def _extract_pdf_text(filepath, filename):
    """Return the text of every page of one PDF, each followed by a newline."""
    reader = PdfReader(filepath)
    page_texts = []
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            page_text = page.extract_text()
        except Exception as page_e:
            # Best effort: the PDF library can fail in many ways on a bad
            # page; skip that page and keep the rest of the document.
            print(f" Warning: Could not extract text from page {page_num} of {filename}. Error: {page_e}")
            continue
        if page_text:  # Keep only pages that produced text
            page_texts.append(page_text + "\n")
    return "".join(page_texts)


def _write_text_cache(txt_filepath, txt_filename, text):
    """Save extracted text to its cache file; report (do not raise) failures."""
    # Write to a temporary file and rename it into place, so a failed write
    # never leaves a truncated .txt that a later run would trust as a cache.
    tmp_filepath = txt_filepath + '.tmp'
    try:
        with open(tmp_filepath, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)
        os.replace(tmp_filepath, txt_filepath)
    except (OSError, UnicodeError) as write_e:
        print(f" Warning: Could not write text file {txt_filename}. Error: {write_e}")
        try:
            os.remove(tmp_filepath)
        except OSError:
            pass  # Nothing to clean up (or cleanup impossible); already warned.
        return
    print(f" Saved extracted text to {txt_filename}")


def extract_text_from_pdfs(pdf_folder, text_output_folder):
    """Collect the text of every PDF in a folder, caching it as .txt files.

    For each ``*.pdf`` file (case-insensitive extension) directly inside
    ``pdf_folder``, a file with the same stem and a ``.txt`` extension is
    looked up in ``text_output_folder``. If it exists and is readable, its
    content is used and the PDF is not parsed. Otherwise the PDF is parsed
    page by page and, when any text was extracted, the text is saved to that
    .txt file. Files are handled in sorted name order so the combined text
    and the log output are the same on every run.

    Args:
        pdf_folder: Folder containing the PDF files.
        text_output_folder: Folder for the cached .txt files; created if it
            does not exist.

    Returns:
        The concatenated text of all PDFs, "" if the folder contains no PDF
        files, or None if ``pdf_folder`` is not a directory. Per-file
        problems are reported on stdout and that file (or page) is skipped.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: PDF folder not found at {pdf_folder}")
        return None

    # Ensure the output directory for text files exists
    os.makedirs(text_output_folder, exist_ok=True)
    print(f"Saving/reading extracted text files to/from: '{text_output_folder}'")

    print(f"\nScanning PDF files in '{pdf_folder}'...")
    # os.listdir() order is arbitrary; sort for reproducible results.
    pdf_filenames = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')
    )
    if not pdf_filenames:
        print(f"Warning: No PDF files found in the folder '{pdf_folder}'.")
        return ""

    collected_texts = []  # Joined once at the end instead of repeated +=
    processed_count = 0
    skipped_count = 0

    for filename in pdf_filenames:
        filepath = os.path.join(pdf_folder, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_filepath = os.path.join(text_output_folder, txt_filename)

        cached_text = _read_cached_text(txt_filepath, txt_filename)
        if cached_text is not None:
            print(f" Skipping PDF processing for {filename}, using existing text from {txt_filename}.")
            collected_texts.append(cached_text)
            skipped_count += 1
            continue

        print(f" Processing PDF: {filename}...")
        processed_count += 1
        try:
            current_pdf_text = _extract_pdf_text(filepath, filename)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f" Error processing {filename}. Skipping this file. Error: {e}")
            continue

        if not current_pdf_text:
            print(f" Warning: No text extracted from {filename}.")
            continue

        _write_text_cache(txt_filepath, txt_filename, current_pdf_text)
        # Use the text even if saving failed, so the comparison can proceed.
        collected_texts.append(current_pdf_text)

    print(f"\nPDF processing summary: Processed {processed_count} new PDFs, skipped {skipped_count} (used existing text files).")
    return "".join(collected_texts)
|
|
|
|
def find_missing_aliases(aliases, pdf_text):
    """Report which manifest aliases do not occur in the combined PDF text.

    Both the PDF text and each alias are passed through ``normalize`` and the
    normalized alias is then looked up as a substring of the normalized text.
    Aliases whose normalized form is empty are ignored.

    Args:
        aliases: Iterable of alias strings from the manifest.
        pdf_text: Combined text of all PDFs.

    Returns:
        A sorted list of the ORIGINAL (un-normalized) aliases that were not
        found, or None if either argument is None.
    """
    if aliases is None or pdf_text is None:
        print("Error: Missing aliases or PDF text input.")
        return None  # Propagate error state

    # The large text blob is normalized exactly once, up front.
    print("Normalizing the combined PDF text...")
    haystack = normalize(pdf_text)
    print("Normalization complete.")

    print("\nComparing normalized manifest aliases against normalized PDF text blob (substring check)...")
    normalized_pairs = ((original, normalize(original)) for original in aliases)
    not_found = {
        original
        for original, needle in normalized_pairs
        if needle and needle not in haystack
    }

    # Sorted so the report is stable from run to run.
    return sorted(not_found)
|
|
|
|
def _run_check(manifest_file_path, pdf_folder_path, text_output_folder_path, alias_column_name):
    """Run manifest -> PDF text -> comparison and print the report.

    Returns:
        0 when the report was produced, 1 when any stage failed.
    """
    aliases_from_manifest = read_aliases_from_manifest(
        manifest_file_path, alias_column_name=alias_column_name
    )
    if aliases_from_manifest is None:
        print("\nManifest reading failed. Cannot proceed.")
        return 1

    extracted_text = extract_text_from_pdfs(pdf_folder_path, text_output_folder_path)
    if extracted_text is None:
        print("\nText extraction failed or produced no text. Cannot proceed with comparison.")
        return 1

    missing_names = find_missing_aliases(aliases_from_manifest, extracted_text)
    if missing_names is None:
        print("\nComparison could not be completed due to previous errors.")
        return 1

    print("\n--- Report ---")
    if not missing_names:
        print("All aliases from the manifest were found in the combined PDF text.")
    else:
        print("The following aliases from the manifest were NOT found in the combined PDF text:")
        for name in missing_names:
            print(f"- {name}")
    print(f"\nTotal aliases in manifest: {len(aliases_from_manifest)}")
    print(f"Total missing aliases: {len(missing_names)}")
    return 0


def main(argv=None):
    """Parse command-line arguments, run the check, and return an exit status.

    Args:
        argv: Argument list to parse; None means use ``sys.argv[1:]``.

    Returns:
        0 if the comparison ran and a report was printed (even when aliases
        are missing), 1 if the manifest, the PDF folder, or the comparison
        failed.
    """
    parser = argparse.ArgumentParser(description='Check if aliases from a manifest CSV exist in text extracted from PDFs.')
    parser.add_argument('manifest_file', help='Path to the manifest CSV file.')
    parser.add_argument('pdf_folder', help='Path to the folder containing PDF files.')
    parser.add_argument('text_output_folder', help='Path to the folder where extracted text files should be saved/read from.')
    # Optional, so existing three-argument invocations keep working unchanged.
    parser.add_argument('--alias-column', default='Alias', help="Name of the manifest column holding the aliases (default: 'Alias').")
    args = parser.parse_args(argv)

    print("Using provided paths:")
    print(f" Manifest: {args.manifest_file}")
    print(f" PDF Folder: {args.pdf_folder}")
    print(f" Text Output Folder: {args.text_output_folder}")

    exit_code = _run_check(
        args.manifest_file,
        args.pdf_folder,
        args.text_output_folder,
        args.alias_column,
    )

    print("\nScript finished.")
    return exit_code


if __name__ == "__main__":
    # A non-zero status lets shells and CI detect a failed run.
    sys.exit(main())