# MonitorProgress/drawing_checker.py

import os
# Assume utils contains the necessary helper functions
import utils
from utils import normalize, get_text_output_dir_path

def _mark_all_not_found(manifest_data):
    """Set the 'found_drawing' flag to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _extract_missing_texts(pdf_source_dir, text_output_dir):
    """Extract text for each PDF lacking a TXT file; return (attempted, succeeded, failed)."""
    attempted = succeeded = failed = 0

    # Resolve the helper once. Looking it up here (instead of catching
    # AttributeError around the call) keeps an AttributeError raised *inside*
    # the extraction routine from being misreported as "function not found".
    extract = getattr(utils, 'extract_text_from_pdf', None)

    for pdf_filename in os.listdir(pdf_source_dir):
        if not pdf_filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_source_dir, pdf_filename)
        txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
        txt_path = os.path.join(text_output_dir, txt_filename)
        if os.path.exists(txt_path):
            continue

        attempted += 1
        print(f"  TXT file '{txt_filename}' missing for PDF '{pdf_filename}'. Attempting extraction...")

        if extract is None:
            print("    ERROR: utils.extract_text_from_pdf function not found! Cannot extract text.")
            break  # No point trying the remaining PDFs without the helper

        try:
            success = extract(pdf_path, txt_path)
        except Exception as extract_err:
            print(f"    Error during text extraction for '{pdf_filename}': {extract_err}")
            failed += 1
        else:
            # A falsy result is counted as a failed extraction.
            if success:
                succeeded += 1
            else:
                failed += 1

    return attempted, succeeded, failed


def _read_combined_text(text_output_dir, txt_files):
    """Read the given TXT files as UTF-8; return (combined_text, number_of_files_read)."""
    pieces = []
    read_count = 0
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            # Best effort: an unreadable file is skipped, not fatal.
            print(f"  Warning: Could not read or process text file {filepath}: {e}")
            continue
        pieces.append(content)
        pieces.append("\n")  # Newline separator after every file
        read_count += 1
    # Join once instead of growing a string with += inside the loop.
    return "".join(pieces), read_count


def check_drawings(project_name, manifest_data):
    """
    Flag manifest items whose alias appears in the project's extracted drawing text.

    The text of every .txt file in the project's text output directory is
    concatenated, passed through utils.normalize once, and each item's
    normalized 'alias' is searched for as a substring of that combined text.
    Before that, every PDF in the project's PDF directory that has no matching
    .txt file gets an extraction attempt via utils.extract_text_from_pdf.

    Args:
        project_name: Project identifier; passed to the utils path helpers and
            used as a prefix in progress messages.
        manifest_data: Iterable of dict-like items. Each item's 'found_drawing'
            key is set in place to True or False. Items without a truthy
            'alias' are left as False and not counted as checked.

    Returns:
        None. Results are written into manifest_data; progress is printed.
        If manifest_data is empty/None the function returns immediately
        without touching anything. If the text directory is missing, holds no
        .txt files, or the main phase raises, every item is flagged False.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary step: ensure a TXT exists for every PDF ---
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    extraction_attempts = successful_extractions = failed_extractions = 0
    if not os.path.isdir(pdf_source_dir):
        print(f"  Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
    else:
        extraction_attempts, successful_extractions, failed_extractions = _extract_missing_texts(
            pdf_source_dir, text_output_dir
        )
    print(f"[{project_name}] Text extraction check complete. Attempted: {extraction_attempts}, Succeeded: {successful_extractions}, Failed: {failed_extractions}.")

    # --- Main check: scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")

    try:
        if not os.path.isdir(text_output_dir):
            print(f"  Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        txt_files = [f for f in os.listdir(text_output_dir) if f.lower().endswith('.txt')]
        if not txt_files:
            print("  Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        all_raw_content, read_count = _read_combined_text(text_output_dir, txt_files)
        print(f"  Read content from {read_count} out of {len(txt_files)} total .txt files found.")

        # Normalize the entire combined content ONCE, not once per alias
        print("  Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f"  Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        # Check each manifest alias against the normalized combined content
        found_count = 0
        checked_count = 0
        for item in manifest_data:
            # Always initialize the flag so stale values never survive a run
            item['found_drawing'] = False

            alias = item.get('alias')  # Use lowercase 'alias' key
            if not alias:
                continue

            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty value must not match:
            # the empty string is a substring of everything.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")

    except Exception as e:
        print(f"  Error during drawings check main phase: {e}")
        # Ensure flags are false on error so no partial results leak out
        _mark_all_not_found(manifest_data)