import os
# Assume utils contains the necessary helper functions
import utils
from utils import normalize, get_text_output_dir_path


def _reset_found_flags(manifest_data):
    """Set 'found_drawing' to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _ensure_text_files(project_name, pdf_source_dir, text_output_dir):
    """Try to create a TXT file for every PDF that does not have one yet.

    Progress and a final summary are printed; nothing is returned. Failures
    for individual PDFs are counted and reported but never raised.
    """
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    attempted = 0
    succeeded = 0
    failed = 0

    if not os.path.isdir(pdf_source_dir):
        print(f" Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
    else:
        # os.listdir returns entries in arbitrary order; sort so the log
        # output is reproducible between runs.
        for pdf_filename in sorted(os.listdir(pdf_source_dir)):
            if not pdf_filename.lower().endswith('.pdf'):
                continue

            txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
            txt_path = os.path.join(text_output_dir, txt_filename)
            if os.path.exists(txt_path):
                continue

            attempted += 1
            print(f" TXT file '{txt_filename}' missing for PDF '{pdf_filename}'. Attempting extraction...")

            # Look the helper up explicitly instead of catching AttributeError
            # around the call: an AttributeError raised *inside* the extractor
            # is a failure of this one PDF, not proof that the helper is
            # missing, and must not abort the remaining PDFs.
            extract = getattr(utils, 'extract_text_from_pdf', None)
            if extract is None:
                print(" ERROR: utils.extract_text_from_pdf function not found! Cannot extract text.")
                break  # No point trying further PDFs without the helper.

            pdf_path = os.path.join(pdf_source_dir, pdf_filename)
            try:
                success = extract(pdf_path, txt_path)
            except Exception as extract_err:
                # Best effort: the extractor is opaque here, so any error is
                # reported and counted rather than propagated.
                print(f" Error during text extraction for '{pdf_filename}': {extract_err}")
                failed += 1
            else:
                if success:
                    succeeded += 1
                else:
                    failed += 1

    print(f"[{project_name}] Text extraction check complete. Attempted: {attempted}, Succeeded: {succeeded}, Failed: {failed}.")


def _read_combined_text(text_output_dir, txt_files):
    """Read the given TXT files and return (combined_text, files_read).

    Each file's content is followed by a newline separator. Files that
    cannot be read or decoded as UTF-8 are skipped with a warning.
    """
    contents = []
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                contents.append(f.read())
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            print(f" Warning: Could not read or process text file {filepath}: {e}")

    # Join once instead of growing a string with += inside the loop.
    combined = "".join(content + "\n" for content in contents)
    return combined, len(contents)


def check_drawings(project_name, manifest_data):
    """Flag manifest items whose alias appears in the project's drawing text.

    The text of all ``.txt`` files in the project's text output directory is
    concatenated, normalized once with ``utils.normalize``, and each item's
    normalized ``'alias'`` is searched for as a substring of that text.
    Before the search, an extraction is attempted for every PDF in the
    project's PDF directory that has no corresponding TXT file.

    Args:
        project_name: Project identifier passed to the ``utils`` path helpers
            and used as a prefix in log output.
        manifest_data: Iterable of dict-like items. Each item's
            ``'found_drawing'`` key is set in place to True or False. Items
            without a truthy ``'alias'`` are set to False.

    Returns:
        None. Results are written into ``manifest_data``. If ``manifest_data``
        is empty or None the function returns immediately. If the text
        directory is missing, contains no TXT files, or an error occurs in the
        main phase, every item is flagged False.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary step: ensure a TXT exists for every PDF ---
    _ensure_text_files(project_name, pdf_source_dir, text_output_dir)

    # --- Main check: scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")
    try:
        if not os.path.isdir(text_output_dir):
            print(f" Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _reset_found_flags(manifest_data)
            return

        # Sorted so the combined text is built in the same order on every run
        # (os.listdir order is arbitrary).
        txt_files = sorted(
            name for name in os.listdir(text_output_dir)
            if name.lower().endswith('.txt')
        )
        if not txt_files:
            print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _reset_found_flags(manifest_data)
            return

        all_raw_content, files_read = _read_combined_text(text_output_dir, txt_files)
        print(f" Read content from {files_read} out of {len(txt_files)} total .txt files found.")

        # Normalize the entire combined content ONCE, not per alias.
        print(" Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f" Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        # Check each manifest alias against the normalized combined content.
        found_count = 0
        checked_count = 0
        for item in manifest_data:
            item['found_drawing'] = False
            alias = item.get('alias')  # Lowercase 'alias' key
            if not alias:
                continue
            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty value would match any
            # text, so it is treated as not found.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")
    except Exception as e:
        # Top-level best-effort boundary: report the error and leave the
        # manifest in a consistent "nothing found" state.
        print(f" Error during drawings check main phase: {e}")
        _reset_found_flags(manifest_data)