# MonitorProgress/drawing_checker.py

import os
# Assume utils contains the necessary helper functions
import utils
from utils import normalize, get_text_output_dir_path
from concurrent.futures import ProcessPoolExecutor, as_completed

# The worker is defined at module level so ProcessPoolExecutor can pickle it for its worker processes.
def extract_worker(pdf_path, txt_path, pdf_filename):
    """Extract text from a single PDF; intended to run in a worker process.

    Delegates to ``utils.extract_text_from_pdf(pdf_path, txt_path)``. This
    function never raises: every failure is reported through the returned
    tuple so the parent process can count and print it (output printed from
    a worker process may not be visible).

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path the extracted text should be written to.
        pdf_filename: Name of the PDF, echoed back so the caller can
            identify which task the result belongs to.

    Returns:
        A ``(success, pdf_filename, error)`` tuple. ``success`` is whatever
        ``utils.extract_text_from_pdf`` returned, or ``False`` on failure.
        ``error`` is ``None`` when the call completed, a string starting
        with ``"AttributeError: "`` when the helper is missing or raised an
        AttributeError, and the caught exception object otherwise.
    """
    try:
        # Resolve the helper explicitly so that "helper is missing" is not
        # confused with an AttributeError raised inside the extraction.
        extract = getattr(utils, "extract_text_from_pdf", None)
        if extract is None:
            return False, pdf_filename, "AttributeError: utils.extract_text_from_pdf not found"

        success = extract(pdf_path, txt_path)
        return success, pdf_filename, None
    except AttributeError as attr_err:
        # Keep the string form used for the missing-helper case, but carry
        # the real message instead of claiming the helper does not exist.
        return False, pdf_filename, f"AttributeError: {attr_err}"
    except Exception as extract_err:
        # Returned rather than raised: the caller decides how to report it.
        return False, pdf_filename, extract_err

def check_drawings(project_name, manifest_data):
    """
    Flag each manifest item whose alias appears in the project's drawing text.

    The check runs in two phases:

    1. For every ``.pdf`` in the project's PDF directory that has no matching
       ``.txt`` in the text output directory, text extraction is attempted in
       parallel worker processes via ``extract_worker``.
    2. Every ``.txt`` file in the text output directory is read, the combined
       text is normalized once with ``utils.normalize``, and each item's
       normalized ``'alias'`` is searched for as a substring of that text.

    Args:
        project_name: Project identifier. Passed to ``utils`` to resolve the
            PDF and text directories and used as a prefix in progress output.
        manifest_data: Collection of dict-like items. Each item's
            ``'found_drawing'`` key is set in place to ``True`` or ``False``.
            If it is empty or ``None`` the function returns immediately
            without touching anything.

    Returns:
        None. Results are written into ``manifest_data``; progress and
        warnings are printed to stdout. If the main phase fails or no text
        is available, every item's ``'found_drawing'`` is set to ``False``.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary Step: Ensure TXT exists for every PDF ---
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    extraction_attempts = 0
    successful_extractions = 0
    failed_extractions = 0

    if not os.path.isdir(pdf_source_dir):
        print(f"  Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
    else:
        print(f"[{project_name}] Identifying PDFs needing text extraction...")
        pdfs_to_extract = _find_pdfs_missing_text(pdf_source_dir, text_output_dir)

        if not pdfs_to_extract:
            print(f"[{project_name}] No missing TXT files found. Text extraction step skipped.")
        else:
            print(f"[{project_name}] Attempting parallel extraction for {len(pdfs_to_extract)} PDF(s)...")
            successful_extractions, failed_extractions = _extract_in_parallel(pdfs_to_extract)

        extraction_attempts = len(pdfs_to_extract)

    print(f"[{project_name}] Parallel text extraction complete. Attempted: {extraction_attempts}, Succeeded: {successful_extractions}, Failed: {failed_extractions}.")
    # --- End Preliminary Step ---

    # --- Main Check: Scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")

    try:
        if not os.path.isdir(text_output_dir):
            print(f"  Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        # Sorted so the combined text is built in a reproducible order;
        # os.listdir makes no ordering guarantee.
        txt_files = sorted(
            name for name in os.listdir(text_output_dir)
            if name.lower().endswith('.txt')
        )
        if not txt_files:
            print("  Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        contents = _read_text_files(text_output_dir, txt_files)
        print(f"  Read content from {len(contents)} out of {len(txt_files)} total .txt files found.")

        # Each file's text is followed by a newline separator. A single join
        # avoids repeatedly copying a growing string.
        all_raw_content = "".join(f"{text}\n" for text in contents)

        # Normalize the entire combined content ONCE
        print("  Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f"  Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        # Check each manifest alias against the normalized combined content
        found_count = 0
        checked_count = 0
        for item in manifest_data:
            # Every item gets an explicit flag, even when it has no alias.
            item['found_drawing'] = False

            alias = item.get('alias')  # Use lowercase 'alias' key
            if not alias:
                continue

            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty value would match any
            # text, so it is treated as not found.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")

    except Exception as e:
        print(f"  Error during drawings check main phase: {e}")
        # Ensure flags are false on error
        _mark_all_not_found(manifest_data)


def _find_pdfs_missing_text(pdf_source_dir, text_output_dir):
    """Return (pdf_path, txt_path, pdf_filename) for each PDF with no TXT file yet."""
    pending = []
    for pdf_filename in os.listdir(pdf_source_dir):
        if not pdf_filename.lower().endswith('.pdf'):
            continue
        txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
        txt_path = os.path.join(text_output_dir, txt_filename)
        if not os.path.exists(txt_path):
            pdf_path = os.path.join(pdf_source_dir, pdf_filename)
            pending.append((pdf_path, txt_path, pdf_filename))
    return pending


def _extract_in_parallel(tasks):
    """Run extract_worker for each task in a process pool; return (succeeded, failed)."""
    # Never start more workers than there are tasks. On Windows the executor
    # rejects max_workers above 61, so the CPU count is capped there.
    worker_limit = os.cpu_count() or 1
    if os.name == 'nt':
        worker_limit = min(worker_limit, 61)
    worker_count = max(1, min(worker_limit, len(tasks)))

    succeeded = 0
    failed = 0
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(extract_worker, pdf_p, txt_p, pdf_fn): pdf_fn
            for pdf_p, txt_p, pdf_fn in tasks
        }

        for future in as_completed(futures):
            pdf_fn = futures[future]
            try:
                success, filename, error = future.result()
            except Exception as exc:
                # Raised when the worker process itself failed or its result
                # could not be transferred back to this process.
                failed += 1
                print(f"    Exception processing result for '{pdf_fn}': {exc}")
                continue

            if success:
                succeeded += 1
            else:
                failed += 1
                # The worker reports errors instead of printing them, so any
                # error it returned is shown here.
                if error:
                    print(f"    Extraction failed for '{filename}': {error}")

    return succeeded, failed


def _read_text_files(text_output_dir, txt_files):
    """Return the contents of each readable file in txt_files; warn about and skip the rest."""
    contents = []
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                contents.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable or badly encoded file must not
            # prevent the remaining files from being checked.
            print(f"  Warning: Could not read or process text file {filepath}: {e}")
    return contents


def _mark_all_not_found(manifest_data):
    """Set 'found_drawing' to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False