import os
from concurrent.futures import ProcessPoolExecutor, as_completed

# Assume utils contains the necessary helper functions
from . import utils
from .utils import normalize, get_text_output_dir_path

# ProcessPoolExecutor rejects max_workers > 61 on Windows (documented limit).
_MAX_POOL_WORKERS = 61


# Defined at module level so ProcessPoolExecutor can reference it from
# worker processes.
def extract_worker(pdf_path, txt_path, pdf_filename):
    """Extract text from a single PDF into a TXT file.

    Runs inside a worker process, so nothing is printed here; the outcome is
    returned to the parent instead.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the TXT file the extractor should write.
        pdf_filename: Bare filename of the PDF, echoed back for reporting.

    Returns:
        A ``(success, pdf_filename, error)`` tuple. ``success`` is whatever
        ``utils.extract_text_from_pdf`` returned (or ``False`` on failure).
        ``error`` is ``None`` when no exception occurred, otherwise a
        ``"ExceptionType: message"`` string.
    """
    # Look the helper up explicitly so that only a genuinely missing function
    # is reported as "not found"; an AttributeError raised *inside* the
    # extractor is reported with its real message below.
    extract = getattr(utils, 'extract_text_from_pdf', None)
    if extract is None:
        return False, pdf_filename, "AttributeError: utils.extract_text_from_pdf not found"

    try:
        success = extract(pdf_path, txt_path)
    except Exception as extract_err:
        # Return a string rather than the exception object: arbitrary
        # exception instances are not guaranteed to survive pickling on the
        # way back to the parent process.
        return False, pdf_filename, f"{type(extract_err).__name__}: {extract_err}"
    return success, pdf_filename, None


def _mark_all_not_found(manifest_data):
    """Set 'found_drawing' to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _find_pdfs_missing_text(pdf_source_dir, text_output_dir):
    """Return (pdf_path, txt_path, pdf_filename) for each PDF with no TXT file yet."""
    pending = []
    for pdf_filename in sorted(os.listdir(pdf_source_dir)):
        if not pdf_filename.lower().endswith('.pdf'):
            continue
        txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
        txt_path = os.path.join(text_output_dir, txt_filename)
        if not os.path.exists(txt_path):
            pdf_path = os.path.join(pdf_source_dir, pdf_filename)
            pending.append((pdf_path, txt_path, pdf_filename))
    return pending


def _extract_missing_text(pdfs_to_extract):
    """Run extract_worker over the tasks in parallel; return (succeeded, failed) counts."""
    succeeded = 0
    failed = 0
    # Never start more workers than there are tasks or CPUs, and stay within
    # the Windows pool-size limit.
    max_workers = min(len(pdfs_to_extract), os.cpu_count() or 1, _MAX_POOL_WORKERS)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(extract_worker, pdf_p, txt_p, pdf_fn): pdf_fn
            for pdf_p, txt_p, pdf_fn in pdfs_to_extract
        }
        for future in as_completed(futures):
            pdf_fn = futures[future]
            try:
                success, filename, error = future.result()
            except Exception as exc:
                # The worker itself died or its result could not be delivered.
                failed += 1
                print(f" Exception processing result for '{pdf_fn}': {exc}")
                continue
            if success:
                succeeded += 1
            else:
                failed += 1
                # error is None when the extractor simply reported failure
                # without raising; there is nothing more to say in that case.
                if error:
                    print(f" Extraction failed for '{filename}': {error}")
    return succeeded, failed


def _read_combined_text(text_output_dir, txt_files):
    """Read the given TXT files and return (combined_text, files_read_count).

    Unreadable files are reported and skipped. Each file's content is
    followed by a newline separator.
    """
    contents = []
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                contents.append(f.read())
        except (OSError, UnicodeError) as e:
            print(f" Warning: Could not read or process text file {filepath}: {e}")
    # Join once instead of growing a string inside the loop.
    combined = "".join(text + "\n" for text in contents)
    return combined, len(contents)


def check_drawings(project_name, manifest_data):
    """Flag manifest items whose alias appears in the project's drawing text.

    Checks if aliases from manifest exist ANYWHERE within the combined text
    extracted from all available drawings for a project. Attempts to extract
    text from PDFs if the corresponding TXT file is missing.

    Updates the 'found_drawing' flag in the manifest_data items directly.

    Args:
        project_name: Project identifier passed to the ``utils`` path helpers
            and used as a prefix in progress messages.
        manifest_data: Iterable of dict-like items; each item's 'alias' value
            (if present and non-empty) is normalized with ``utils.normalize``
            and searched for in the normalized combined text.

    Returns:
        None. Results are written to ``item['found_drawing']``. If the
        manifest is empty nothing is modified; if the text cannot be read or
        the main phase raises, every item is flagged False.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary Step: Ensure TXT exists for every PDF ---
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    pdfs_to_extract = []
    if not os.path.isdir(pdf_source_dir):
        print(f" Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
    else:
        print(f"[{project_name}] Identifying PDFs needing text extraction...")
        pdfs_to_extract = _find_pdfs_missing_text(pdf_source_dir, text_output_dir)

    if not pdfs_to_extract:
        print(f"[{project_name}] No missing TXT files found. Text extraction step skipped.")
    else:
        print(f"[{project_name}] Attempting parallel extraction for {len(pdfs_to_extract)} PDF(s)...")
        successful_extractions, failed_extractions = _extract_missing_text(pdfs_to_extract)
        extraction_attempts = len(pdfs_to_extract)
        print(f"[{project_name}] Parallel text extraction complete. Attempted: {extraction_attempts}, Succeeded: {successful_extractions}, Failed: {failed_extractions}.")
    # --- End Preliminary Step ---

    # --- Main Check: Scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")
    try:
        if not os.path.isdir(text_output_dir):
            print(f" Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        # Sorted so the combined text is the same from run to run.
        txt_files = sorted(
            f for f in os.listdir(text_output_dir) if f.lower().endswith('.txt')
        )
        if not txt_files:
            print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        all_raw_content, files_read = _read_combined_text(text_output_dir, txt_files)
        print(f" Read content from {files_read} out of {len(txt_files)} total .txt files found.")

        # Step 2: Normalize the entire combined content ONCE
        print(f" Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f" Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        # Step 3: Check each manifest alias against the normalized combined content
        found_count = 0
        checked_count = 0
        for item in manifest_data:
            # Always start from False so stale flags from earlier runs are cleared.
            item['found_drawing'] = False
            alias = item.get('alias')  # Use lowercase 'alias' key
            if not alias:
                continue
            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty value is never a match
            # (an empty string would otherwise be "in" any text).
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")
    except Exception as e:
        print(f" Error during drawings check main phase: {e}")
        # Ensure flags are false on error
        _mark_all_not_found(manifest_data)