# MonitorProgress/drawing_checker.py
#
# 156 lines
# 7.9 KiB
# Python

import os
# Assume utils contains the necessary helper functions
import utils
from utils import normalize, get_text_output_dir_path
from concurrent.futures import ProcessPoolExecutor, as_completed
# The worker is defined at module level so that ProcessPoolExecutor can pickle a reference to it for the worker processes.
def extract_worker(pdf_path, txt_path, pdf_filename):
    """Extract the text of one PDF and report the outcome as a value.

    The function is submitted to a process pool, so failures are returned
    instead of raised or printed: the caller decides how to report them.

    Args:
        pdf_path: Path of the source PDF, forwarded to
            ``utils.extract_text_from_pdf``.
        txt_path: Destination path, forwarded to
            ``utils.extract_text_from_pdf``.
        pdf_filename: Name echoed back unchanged so the caller can match the
            result to its task.

    Returns:
        A ``(success, pdf_filename, error)`` tuple. On the normal path
        ``success`` is whatever ``utils.extract_text_from_pdf`` returned and
        ``error`` is ``None``. On failure ``success`` is ``False`` and
        ``error`` is a string describing the problem.
    """
    # Resolve the helper separately from calling it, so that only a genuinely
    # missing function is reported as "not found". An AttributeError raised
    # *inside* the extractor is a different failure and is handled below.
    try:
        extract = utils.extract_text_from_pdf
    except AttributeError:
        return False, pdf_filename, "AttributeError: utils.extract_text_from_pdf not found"

    try:
        success = extract(pdf_path, txt_path)
    except Exception as extract_err:
        # Return a plain string rather than the exception object: the result
        # is pickled back to the parent process, and an exception class that
        # cannot be unpickled there would break the whole process pool.
        return False, pdf_filename, f"{type(extract_err).__name__}: {extract_err}"
    return success, pdf_filename, None
def _mark_all_not_found(manifest_data):
    """Set ``found_drawing`` to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _find_pdfs_missing_text(project_name, pdf_source_dir, text_output_dir):
    """Return ``(pdf_path, txt_path, pdf_filename)`` for each PDF without a TXT file."""
    if not os.path.isdir(pdf_source_dir):
        print(f" Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
        return []

    print(f"[{project_name}] Identifying PDFs needing text extraction...")
    pending = []
    # Sorted so tasks are submitted in a reproducible order.
    for pdf_filename in sorted(os.listdir(pdf_source_dir)):
        if not pdf_filename.lower().endswith('.pdf'):
            continue
        txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
        txt_path = os.path.join(text_output_dir, txt_filename)
        if not os.path.exists(txt_path):
            pending.append((os.path.join(pdf_source_dir, pdf_filename), txt_path, pdf_filename))
    return pending


def _extract_missing_text(project_name, tasks):
    """Run ``extract_worker`` over ``tasks`` in a process pool and print a summary."""
    print(f"[{project_name}] Attempting parallel extraction for {len(tasks)} PDF(s)...")

    # Never start more workers than there are tasks. On Windows,
    # ProcessPoolExecutor rejects max_workers above 61 with ValueError, so
    # cap the count there instead of passing os.cpu_count() straight through.
    worker_limit = os.cpu_count() or 1
    if os.name == "nt":
        worker_limit = min(worker_limit, 61)
    worker_count = max(1, min(worker_limit, len(tasks)))

    succeeded = 0
    failed = 0
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(extract_worker, pdf_path, txt_path, pdf_filename): pdf_filename
            for pdf_path, txt_path, pdf_filename in tasks
        }
        for future in as_completed(futures):
            try:
                success, filename, error = future.result()
            except Exception as exc:
                # The worker itself died or its result could not be
                # transferred; count it as failed and keep going.
                failed += 1
                print(f" Exception processing result for '{futures[future]}': {exc}")
                continue
            if success:
                succeeded += 1
                continue
            failed += 1
            if error:
                print(f" Extraction failed for '{filename}': {error}")
            else:
                # Name the file so the failure count in the summary can be
                # traced back to a specific PDF.
                print(f" Extraction failed or produced no text for '{filename}'.")

    print(f"[{project_name}] Parallel text extraction complete. Attempted: {len(tasks)}, Succeeded: {succeeded}, Failed: {failed}.")


def _read_combined_text(text_output_dir):
    """Return the text of every readable .txt file joined together, or None if there are none."""
    # Sorted so the combined text is identical from run to run.
    txt_files = sorted(f for f in os.listdir(text_output_dir) if f.lower().endswith('.txt'))
    if not txt_files:
        return None

    chunks = []
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                chunks.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable file is skipped, not fatal.
            print(f" Warning: Could not read or process text file {filepath}: {e}")
    print(f" Read content from {len(chunks)} out of {len(txt_files)} total .txt files found.")
    # Each file is followed by a newline separator; joining once avoids
    # repeatedly copying an ever-growing string.
    return "".join(chunk + "\n" for chunk in chunks)


def check_drawings(project_name, manifest_data):
    """Flag which manifest aliases appear in the project's extracted drawing text.

    For every PDF in the project's PDF directory that has no matching ``.txt``
    file in the text output directory, text extraction is attempted in
    parallel via ``extract_worker``. All ``.txt`` files in the output
    directory are then read, concatenated, and passed through
    ``utils.normalize`` once. Each item's alias is normalized the same way and
    looked up as a substring of that combined text.

    NOTE(review): the lookup runs over the concatenation of all files, so
    depending on what ``utils.normalize`` strips, an alias could match across
    the boundary between two files — confirm this is acceptable.

    Args:
        project_name: Passed to ``utils.get_text_output_dir_path`` and
            ``utils.get_pdf_dir_path``, and used as the log prefix.
        manifest_data: Collection of dict-like items. Each item's ``'alias'``
            value (if truthy) is checked, and its ``'found_drawing'`` key is
            set in place to True or False.

    Returns:
        None. Results are written into ``manifest_data``. If the manifest is
        empty or None, nothing is modified.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary step: make sure a TXT exists for every PDF ---
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    pending = _find_pdfs_missing_text(project_name, pdf_source_dir, text_output_dir)
    if pending:
        _extract_missing_text(project_name, pending)
    else:
        print(f"[{project_name}] No missing TXT files found. Text extraction step skipped.")

    # --- Main check: scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")
    try:
        if not os.path.isdir(text_output_dir):
            print(f" Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        all_raw_content = _read_combined_text(text_output_dir)
        if all_raw_content is None:
            print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        # Normalize the entire combined content once, not once per alias.
        print(" Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f" Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        found_count = 0
        checked_count = 0
        for item in manifest_data:
            # Reset first so items without an alias end up explicitly False.
            item['found_drawing'] = False
            alias = item.get('alias')
            if not alias:
                continue
            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty string would match any
            # text, so it is treated as not found.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1
        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")
    except Exception as e:
        # Top-level boundary for the check: report the problem and leave
        # every flag False rather than a half-updated manifest.
        print(f" Error during drawings check main phase: {e}")
        _mark_all_not_found(manifest_data)