# MonitorProgress/pdf_manifest_checker.py

# pdf_manifest_checker.py

import csv
import os
from pypdf import PdfReader
import sys
import re # Import the regex module
import argparse # Import argparse

def normalize(text):
    """Normalize a string for loose comparison.

    The result is lowercased, has every '-' replaced by '_', and has all
    whitespace (spaces, tabs, newlines, ...) removed.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or "" when `text` is not a `str`.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.replace('-', '_')
    # A single backslash is required here: in a raw string r'\\s+' would match
    # a literal backslash followed by the letter "s", not whitespace.
    return re.sub(r'\s+', '', text)

def read_aliases_from_manifest(csv_filepath, alias_column_name='Alias'):
    """Read one column of a CSV manifest into a set of stripped, non-empty values.

    Args:
        csv_filepath: Path to the manifest CSV; its first row must be a header.
        alias_column_name: Header name of the column holding the aliases.

    Returns:
        A set with the distinct, whitespace-stripped, non-empty values of the
        column, or None if the file is missing, has no header row, lacks the
        requested column, or cannot be read (an error message is printed).
    """
    aliases = set()
    try:
        # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise become part of the first header name and make the
        # column lookup fail.
        with open(csv_filepath, mode='r', newline='', encoding='utf-8-sig') as infile:
            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames
            if fieldnames is None:
                # DictReader reports no field names when the file has no rows at all.
                print(f"Error: Manifest file {csv_filepath} is empty or has no header row.")
                return None
            if alias_column_name not in fieldnames:
                print(f"Error: Column '{alias_column_name}' not found in {csv_filepath}")
                print(f"Available columns: {', '.join(fieldnames)}")
                return None
            for row in reader:
                # Short rows yield None for missing cells; strip before testing so
                # whitespace-only cells are not stored as an empty alias.
                alias = (row[alias_column_name] or "").strip()
                if alias:
                    aliases.add(alias)
    except FileNotFoundError:
        print(f"Error: Manifest file not found at {csv_filepath}")
        return None
    except Exception as e:
        print(f"Error reading CSV file {csv_filepath}: {e}")
        return None
    return aliases

def _extract_text_from_single_pdf(filepath, filename):
    """Return the text of one PDF, with a newline appended after each page's text.

    Pages whose extraction raises are reported and skipped. Errors raised while
    opening the PDF or iterating its pages propagate to the caller.
    """
    page_texts = []
    reader = PdfReader(filepath)
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            page_text = page.extract_text()
            if page_text:  # Pages without extractable text contribute nothing
                page_texts.append(page_text + "\n")
        except Exception as page_e:
            print(f"    Warning: Could not extract text from page {page_num} of {filename}. Error: {page_e}")
    return "".join(page_texts)


def extract_text_from_pdfs(pdf_folder, text_output_folder):
    """Extract and combine the text of every PDF directly inside a folder.

    For each `*.pdf` file (case-insensitive extension), the text is taken from
    `<text_output_folder>/<stem>.txt` if that file already exists and is
    readable; otherwise it is extracted from the PDF and, when non-empty,
    written to that .txt file for later runs. Files are handled in sorted
    name order so the combined text and the log output are the same on every
    run.

    Args:
        pdf_folder: Folder to scan for PDF files (not recursive).
        text_output_folder: Folder holding the per-PDF .txt files; created if
            it does not exist.

    Returns:
        None if `pdf_folder` is not a directory, "" if it contains no PDF
        files, otherwise the concatenated text of all PDFs that yielded text.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: PDF folder not found at {pdf_folder}")
        return None

    # Ensure the output directory for text files exists
    os.makedirs(text_output_folder, exist_ok=True)
    print(f"Saving/reading extracted text files to/from: '{text_output_folder}'")

    print(f"\nScanning PDF files in '{pdf_folder}'...")
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    pdf_filenames = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')
    )
    if not pdf_filenames:
        print(f"Warning: No PDF files found in the folder '{pdf_folder}'.")
        return ""

    text_chunks = []  # Joined once at the end instead of repeated string +=
    processed_count = 0
    skipped_count = 0

    for filename in pdf_filenames:
        filepath = os.path.join(pdf_folder, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_filepath = os.path.join(text_output_folder, txt_filename)

        # Reuse previously extracted text when a readable .txt file exists.
        if os.path.exists(txt_filepath):
            try:
                with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
                    cached_text = txt_file.read()
            except Exception as read_e:
                print(f"    Warning: Could not read existing text file {txt_filename}. Will re-process PDF. Error: {read_e}")
            else:
                print(f"  Skipping PDF processing for {filename}, using existing text from {txt_filename}.")
                text_chunks.append(cached_text)
                skipped_count += 1
                continue

        # No usable .txt file: extract from the PDF itself.
        print(f"  Processing PDF: {filename}...")
        processed_count += 1
        try:
            current_pdf_text = _extract_text_from_single_pdf(filepath, filename)
        except Exception as e:
            print(f"    Error processing {filename}. Skipping this file. Error: {e}")
            continue

        if not current_pdf_text:
            print(f"    Warning: No text extracted from {filename}.")
            continue

        try:
            with open(txt_filepath, 'w', encoding='utf-8') as txt_file:
                txt_file.write(current_pdf_text)
            print(f"    Saved extracted text to {txt_filename}")
        except Exception as write_e:
            print(f"    Warning: Could not write text file {txt_filename}. Error: {write_e}")
        # Keep the text even if saving failed, so the comparison can proceed.
        text_chunks.append(current_pdf_text)

    print(f"\nPDF processing summary: Processed {processed_count} new PDFs, skipped {skipped_count} (used existing text files).")
    return "".join(text_chunks)

def find_missing_aliases(aliases, pdf_text):
    """Report which manifest aliases do not occur in the combined PDF text.

    Both the PDF text and each alias are passed through `normalize`, and an
    alias counts as present when its normalized form is a substring of the
    normalized text. Aliases whose normalized form is empty are ignored.

    Args:
        aliases: Iterable of alias strings from the manifest, or None.
        pdf_text: Combined text of all PDFs, or None.

    Returns:
        A sorted list of the ORIGINAL (un-normalized) aliases that were not
        found, or None when either argument is None.
    """
    if aliases is None or pdf_text is None:
        print("Error: Missing aliases or PDF text input.")
        return None  # Propagate error state

    # The text blob is normalized a single time and reused for every alias.
    print("Normalizing the combined PDF text...")
    haystack = normalize(pdf_text)
    print("Normalization complete.")

    print("\nComparing normalized manifest aliases against normalized PDF text blob (substring check)...")
    not_found = set()
    for original_alias in aliases:
        needle = normalize(original_alias)
        # Empty needles (blank or non-string aliases) are skipped rather than reported.
        if needle and needle not in haystack:
            not_found.add(original_alias)

    # Sorted so the report is stable between runs.
    report = list(not_found)
    report.sort()
    return report

def _run_manifest_check(manifest_file_path, pdf_folder_path, text_output_folder_path):
    """Run manifest reading, PDF text extraction and comparison, printing the report.

    Each stage returns None on failure; the first failing stage prints its
    message and ends the run early.
    """
    aliases_from_manifest = read_aliases_from_manifest(manifest_file_path)
    if aliases_from_manifest is None:
        print("\nManifest reading failed. Cannot proceed.")
        return

    # The text output folder doubles as a cache of previously extracted text.
    extracted_text = extract_text_from_pdfs(pdf_folder_path, text_output_folder_path)
    if extracted_text is None:
        print("\nText extraction failed or produced no text. Cannot proceed with comparison.")
        return

    missing_names = find_missing_aliases(aliases_from_manifest, extracted_text)
    if missing_names is None:
        print("\nComparison could not be completed due to previous errors.")
        return

    print("\n--- Report ---")
    if missing_names:
        print("The following aliases from the manifest were NOT found in the combined PDF text:")
        for name in missing_names:
            print(f"- {name}")
    else:
        print("All aliases from the manifest were found in the combined PDF text.")
    print(f"\nTotal aliases in manifest: {len(aliases_from_manifest)}")
    print(f"Total missing aliases: {len(missing_names)}")


if __name__ == "__main__":
    # Command-line interface: three required positional paths.
    cli_parser = argparse.ArgumentParser(description='Check if aliases from a manifest CSV exist in text extracted from PDFs.')
    cli_parser.add_argument('manifest_file', help='Path to the manifest CSV file.')
    cli_parser.add_argument('pdf_folder', help='Path to the folder containing PDF files.')
    cli_parser.add_argument('text_output_folder', help='Path to the folder where extracted text files should be saved/read from.')
    cli_args = cli_parser.parse_args()

    # Echo the inputs so the log shows what the run operated on.
    print("Using provided paths:")
    print(f"  Manifest: {cli_args.manifest_file}")
    print(f"  PDF Folder: {cli_args.pdf_folder}")
    print(f"  Text Output Folder: {cli_args.text_output_folder}")

    _run_manifest_check(
        cli_args.manifest_file,
        cli_args.pdf_folder,
        cli_args.text_output_folder,
    )

    print("\nScript finished.")