156 lines
7.9 KiB
Python
156 lines
7.9 KiB
Python
import os
|
|
# Assume utils contains the necessary helper functions
|
|
import utils
|
|
from utils import normalize, get_text_output_dir_path
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
|
# Define the worker function at the module level
|
|
def extract_worker(pdf_path, txt_path, pdf_filename):
    """Extract text from a single PDF without ever raising.

    Defined at module level so it can be submitted to a process pool.
    Delegates to ``utils.extract_text_from_pdf(pdf_path, txt_path)`` and
    reports every failure through the returned tuple instead of raising,
    because printing from a worker process may not reach the console.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path handed to the extractor as the text destination.
        pdf_filename: Name echoed back so the caller can label the result.

    Returns:
        A ``(success, pdf_filename, error)`` tuple:

        * ``success`` is the extractor's return value, or ``False`` when the
          call failed.
        * ``error`` is ``None`` when the extractor returned normally, a
          ``str`` starting with ``"AttributeError: "`` when the extractor is
          missing or raised ``AttributeError``, and the caught exception
          object for any other failure.
    """
    try:
        # Check for the function explicitly so that "function missing" is
        # never confused with an AttributeError raised *inside* extraction.
        if not hasattr(utils, "extract_text_from_pdf"):
            return False, pdf_filename, "AttributeError: utils.extract_text_from_pdf not found"

        success = utils.extract_text_from_pdf(pdf_path, txt_path)
        return success, pdf_filename, None
    except AttributeError as attr_err:
        # Kept as a string (like the "not found" case) so callers that filter
        # on AttributeError instances still report it, but with the real cause.
        return False, pdf_filename, f"AttributeError: {attr_err}"
    except Exception as extract_err:
        # Worker boundary: hand any other failure back to the parent process.
        return False, pdf_filename, extract_err
|
|
|
|
def _clear_found_flags(manifest_data):
    """Set ``found_drawing`` to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _run_parallel_extraction(project_name, pending):
    """Run ``extract_worker`` over *pending* in a process pool and print a summary.

    *pending* is a non-empty list of ``(pdf_path, txt_path, pdf_filename)``.
    """
    print(f"[{project_name}] Attempting parallel extraction for {len(pending)} PDF(s)...")

    succeeded = 0
    failed = 0

    # Never start more processes than there are tasks. The cap of 61 is the
    # documented ProcessPoolExecutor limit on Windows; passing a larger
    # explicit value there raises ValueError. os.cpu_count() may be None.
    worker_count = min(len(pending), os.cpu_count() or 1, 61)

    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(extract_worker, pdf_path, txt_path, pdf_filename): pdf_filename
            for pdf_path, txt_path, pdf_filename in pending
        }

        for future in as_completed(futures):
            pdf_filename = futures[future]
            try:
                success, filename, error = future.result()
            except Exception as exc:
                # The worker itself died or its result could not be transferred.
                failed += 1
                print(f" Exception processing result for '{pdf_filename}': {exc}")
                continue

            if success:
                succeeded += 1
                continue

            failed += 1
            # Always say why a file failed; nothing is printed by the worker.
            if error is not None:
                print(f" Extraction failed for '{filename}': {error}")
            else:
                print(f" Extraction failed or produced no text for '{filename}'.")

    print(f"[{project_name}] Parallel text extraction complete. Attempted: {len(pending)}, Succeeded: {succeeded}, Failed: {failed}.")


def _ensure_text_for_pdfs(project_name, pdf_source_dir, text_output_dir):
    """Extract text for every PDF in *pdf_source_dir* that has no matching TXT."""
    if not os.path.isdir(pdf_source_dir):
        print(f" Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
        return

    print(f"[{project_name}] Identifying PDFs needing text extraction...")
    pending = []
    for pdf_filename in os.listdir(pdf_source_dir):
        if not pdf_filename.lower().endswith('.pdf'):
            continue
        txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
        txt_path = os.path.join(text_output_dir, txt_filename)
        if not os.path.exists(txt_path):
            pending.append((os.path.join(pdf_source_dir, pdf_filename), txt_path, pdf_filename))

    if not pending:
        print(f"[{project_name}] No missing TXT files found. Text extraction step skipped.")
        return

    _run_parallel_extraction(project_name, pending)


def _read_combined_text(text_output_dir, txt_files):
    """Return the contents of *txt_files* concatenated, one newline after each file.

    Files that cannot be read or decoded as UTF-8 are skipped with a warning.
    """
    chunks = []
    for filename in txt_files:
        filepath = os.path.join(text_output_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                chunks.append(f.read())
        except (OSError, UnicodeError) as e:
            print(f" Warning: Could not read or process text file {filepath}: {e}")

    print(f" Read content from {len(chunks)} out of {len(txt_files)} total .txt files found.")
    # Single join instead of repeated += keeps this linear in total text size.
    return "".join(chunk + "\n" for chunk in chunks)


def check_drawings(project_name, manifest_data):
    """Flag which manifest aliases appear anywhere in the project's drawing text.

    Steps:

    1. For each ``.pdf`` in the project's PDF directory that has no matching
       ``.txt`` in the text output directory, attempt extraction in a process
       pool via ``extract_worker``.
    2. Read every ``.txt`` file in the text output directory, concatenate the
       contents and pass the result through ``utils.normalize`` once.
    3. For every manifest item, set ``item['found_drawing']`` to True when the
       normalized ``item['alias']`` is a non-empty substring of the normalized
       combined text, otherwise False.

    Args:
        project_name: Project identifier passed to
            ``utils.get_text_output_dir_path`` / ``utils.get_pdf_dir_path``
            and used as a prefix in progress messages.
        manifest_data: Collection of dict-like items; each is read via
            ``item.get('alias')`` and updated in place. When empty or None
            nothing is touched.

    Returns:
        None. Results are written into ``manifest_data`` and progress is
        printed. If the text directory is missing, holds no ``.txt`` files,
        or the comparison phase raises, every item is flagged False.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary step: make sure a TXT exists for every PDF ---
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    _ensure_text_for_pdfs(project_name, pdf_source_dir, text_output_dir)

    # --- Main check: scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")

    try:
        if not os.path.isdir(text_output_dir):
            print(f" Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _clear_found_flags(manifest_data)
            return

        txt_files = [f for f in os.listdir(text_output_dir) if f.lower().endswith('.txt')]
        if not txt_files:
            print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _clear_found_flags(manifest_data)
            return

        all_raw_content = _read_combined_text(text_output_dir, txt_files)

        # Normalize the entire combined content ONCE, not per alias.
        print(" Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f" Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        found_count = 0
        checked_count = 0
        for item in manifest_data:
            item['found_drawing'] = False

            alias = item.get('alias')  # lowercase 'alias' key
            if not alias:
                continue

            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to empty would match everything; skip it.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")

    except Exception as e:
        # Top-level boundary for the comparison phase: report, then make sure
        # no item is left with a stale or partial True flag.
        print(f" Error during drawings check main phase: {e}")
        _clear_found_flags(manifest_data)
|