# pdf_manifest_checker.py
"""Check that every alias listed in a manifest CSV appears in a set of PDFs.

The script reads one column of aliases from a CSV manifest, extracts the text
of every PDF in a folder (caching each PDF's text in a sibling ``.txt`` file),
and reports the aliases that cannot be found in the combined text.

Matching is deliberately loose: both sides are lowercased, ``-`` is treated as
``_`` and all whitespace is removed before a plain substring check.
"""
import argparse
import csv
import os
import re
import sys

try:
    from pypdf import PdfReader
except ImportError:
    # Keep the text-only helpers importable without pypdf; the absence is
    # reported by extract_text_from_pdfs() when a PDF actually has to be parsed.
    PdfReader = None

# Compiled once: normalize() runs on every alias and on the whole PDF text.
_WHITESPACE_RE = re.compile(r'\s+')


def normalize(text):
    """Return *text* in the canonical form used for comparison.

    The canonical form is lowercase, has every ``-`` replaced by ``_`` and has
    all whitespace (spaces, tabs, newlines, ...) removed.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or ``""`` when *text* is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # NOTE: the pattern must be a single-backslash r'\s+'. A doubled backslash
    # in a raw string matches a literal "\" followed by "s", not whitespace.
    return _WHITESPACE_RE.sub('', text.lower().replace('-', '_'))


def read_aliases_from_manifest(csv_filepath, alias_column_name='Alias'):
    """Read one column of a CSV file into a set of stripped, non-empty values.

    The file is decoded as latin-1 and parsed with ``csv.DictReader``, so the
    first row is taken as the header.

    Args:
        csv_filepath: Path of the manifest CSV file.
        alias_column_name: Header of the column that holds the aliases.

    Returns:
        A set of alias strings with surrounding whitespace removed, or ``None``
        when the file is missing, unreadable, or lacks the requested column
        (an error message is printed in each of those cases).
    """
    aliases = set()
    try:
        with open(csv_filepath, mode='r', newline='', encoding='latin-1') as infile:
            reader = csv.DictReader(infile)
            # fieldnames is None for a completely empty file.
            fieldnames = reader.fieldnames or []
            if alias_column_name not in fieldnames:
                print(f"Error: Column '{alias_column_name}' not found in {csv_filepath}")
                print(f"Available columns: {', '.join(fieldnames)}")
                return None
            for row in reader:
                # Short rows yield None for the missing cells.
                alias = (row[alias_column_name] or "").strip()
                if alias:
                    aliases.add(alias)
    except FileNotFoundError:
        print(f"Error: Manifest file not found at {csv_filepath}")
        return None
    except Exception as e:
        # Best-effort boundary: any other read/parse failure is reported, not raised.
        print(f"Error reading CSV file {csv_filepath}: {e}")
        return None
    return aliases


def _read_cached_text(txt_filepath, txt_filename):
    """Return the cached text stored at *txt_filepath*, or ``None`` on failure."""
    try:
        with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
            return txt_file.read()
    except (OSError, UnicodeError) as read_e:
        print(f" Warning: Could not read existing text file {txt_filename}. Will re-process PDF. Error: {read_e}")
        return None


def _extract_pdf_text(filepath, filename):
    """Return the text of every page of one PDF, each page followed by a newline.

    Pages that fail to extract are reported and skipped. Errors raised while
    opening the PDF or iterating its pages propagate to the caller.
    """
    page_texts = []
    reader = PdfReader(filepath)
    for page_num, page in enumerate(reader.pages):
        try:
            page_text = page.extract_text()
        except Exception as page_e:
            # One bad page must not discard the rest of the document.
            print(f" Warning: Could not extract text from page {page_num + 1} of {filename}. Error: {page_e}")
            continue
        if page_text:
            page_texts.append(page_text + "\n")
    return "".join(page_texts)


def extract_text_from_pdfs(pdf_folder, text_output_folder):
    """Extract and combine the text of every PDF in *pdf_folder*.

    For each ``*.pdf`` file (case-insensitive, processed in sorted order) the
    text is taken from ``<name>.txt`` in *text_output_folder* when that file
    exists and is readable. Otherwise the PDF is parsed with pypdf and the
    result is written to that ``.txt`` file for later runs.

    Args:
        pdf_folder: Folder containing the PDF files.
        text_output_folder: Folder for the per-PDF ``.txt`` caches; created if
            it does not exist.

    Returns:
        The concatenated text of all PDFs, ``""`` when the folder contains no
        PDF files, or ``None`` when *pdf_folder* is not a directory or a PDF
        has to be parsed but pypdf is not installed.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: PDF folder not found at {pdf_folder}")
        return None

    os.makedirs(text_output_folder, exist_ok=True)
    print(f"Saving/reading extracted text files to/from: '{text_output_folder}'")
    print(f"\nScanning PDF files in '{pdf_folder}'...")

    text_parts = []  # joined once at the end instead of repeated string +=
    pdf_files_found = False
    processed_count = 0
    skipped_count = 0

    # Sorted so that console output and the combined text are deterministic.
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_files_found = True
        filepath = os.path.join(pdf_folder, filename)
        txt_filename = os.path.splitext(filename)[0] + '.txt'
        txt_filepath = os.path.join(text_output_folder, txt_filename)

        # Reuse a previously extracted text file when one is available.
        if os.path.exists(txt_filepath):
            cached_text = _read_cached_text(txt_filepath, txt_filename)
            if cached_text is not None:
                print(f" Skipping PDF processing for {filename}, using existing text from {txt_filename}.")
                text_parts.append(cached_text)
                skipped_count += 1
                continue

        if PdfReader is None:
            # Abort rather than report every alias as missing.
            print(f"Error: pypdf is not installed; cannot extract text from {filename}.")
            return None

        print(f" Processing PDF: {filename}...")
        processed_count += 1
        try:
            current_pdf_text = _extract_pdf_text(filepath, filename)
        except Exception as e:
            # pypdf can raise many exception types on malformed files.
            print(f" Error processing {filename}. Skipping this file. Error: {e}")
            continue

        if not current_pdf_text:
            print(f" Warning: No text extracted from {filename}.")
            continue

        try:
            with open(txt_filepath, 'w', encoding='utf-8') as txt_file:
                txt_file.write(current_pdf_text)
        except (OSError, UnicodeError) as write_e:
            print(f" Warning: Could not write text file {txt_filename}. Error: {write_e}")
            # Do not leave a truncated cache file that a later run would trust.
            try:
                os.remove(txt_filepath)
            except OSError:
                pass
        else:
            print(f" Saved extracted text to {txt_filename}")
        # The text is used for the comparison even if caching it failed.
        text_parts.append(current_pdf_text)

    if not pdf_files_found:
        print(f"Warning: No PDF files found in the folder '{pdf_folder}'.")
        return ""

    print(f"\nPDF processing summary: Processed {processed_count} new PDFs, skipped {skipped_count} (used existing text files).")
    return "".join(text_parts)


def find_missing_aliases(aliases, pdf_text):
    """Return the aliases that do not occur in *pdf_text*.

    Both the aliases and the whole text are passed through ``normalize`` and
    compared with a substring check, so an alias also matches when the PDF
    splits it across spaces or line breaks. Aliases that normalize to an empty
    string are ignored.

    Args:
        aliases: Iterable of alias strings from the manifest.
        pdf_text: Combined text of all PDFs.

    Returns:
        A sorted list of the ORIGINAL (un-normalized) aliases that were not
        found, or ``None`` when either argument is ``None``.
    """
    if pdf_text is None or aliases is None:
        print("Error: Missing aliases or PDF text input.")
        return None

    # Normalize the large text blob once, not once per alias.
    print("Normalizing the combined PDF text...")
    normalized_pdf_text_blob = normalize(pdf_text)
    print("Normalization complete.")

    print("\nComparing normalized manifest aliases against normalized PDF text blob (substring check)...")
    missing = set()
    for alias in aliases:
        normalized_alias = normalize(alias)
        if normalized_alias and normalized_alias not in normalized_pdf_text_blob:
            missing.add(alias)
    return sorted(missing)


def _run_check(manifest_file_path, pdf_folder_path, text_output_folder_path):
    """Run the manifest-versus-PDF comparison and print the report."""
    aliases_from_manifest = read_aliases_from_manifest(manifest_file_path)
    if aliases_from_manifest is None:
        print("\nManifest reading failed. Cannot proceed.")
        return

    extracted_text = extract_text_from_pdfs(pdf_folder_path, text_output_folder_path)
    if extracted_text is None:
        print("\nText extraction failed or produced no text. Cannot proceed with comparison.")
        return

    missing_names = find_missing_aliases(aliases_from_manifest, extracted_text)
    if missing_names is None:
        print("\nComparison could not be completed due to previous errors.")
        return

    print("\n--- Report ---")
    if not missing_names:
        print("All aliases from the manifest were found in the combined PDF text.")
    else:
        print("The following aliases from the manifest were NOT found in the combined PDF text:")
        for name in missing_names:
            print(f"- {name}")
    print(f"\nTotal aliases in manifest: {len(aliases_from_manifest)}")
    print(f"Total missing aliases: {len(missing_names)}")


def main(argv=None):
    """Parse the command line, run the check and print the results.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Check if aliases from a manifest CSV exist in text extracted from PDFs.')
    parser.add_argument('manifest_file', help='Path to the manifest CSV file.')
    parser.add_argument('pdf_folder', help='Path to the folder containing PDF files.')
    parser.add_argument('text_output_folder', help='Path to the folder where extracted text files should be saved/read from.')
    args = parser.parse_args(argv)

    print("Using provided paths:")
    print(f" Manifest: {args.manifest_file}")
    print(f" PDF Folder: {args.pdf_folder}")
    print(f" Text Output Folder: {args.text_output_folder}")

    _run_check(args.manifest_file, args.pdf_folder, args.text_output_folder)
    print("\nScript finished.")


if __name__ == "__main__":
    main()