MonitorProgress/drawing_checker.py
2025-04-10 04:08:55 +04:00

118 lines
5.7 KiB
Python

import os
# Assume utils contains the necessary helper functions
import utils
from utils import normalize, get_text_output_dir_path
def _mark_all_not_found(manifest_data):
    """Set the 'found_drawing' flag to False on every manifest item."""
    for item in manifest_data:
        item['found_drawing'] = False


def _extract_missing_text(project_name, pdf_source_dir, text_output_dir):
    """Try to create a TXT file for every PDF that does not have one yet.

    For each '*.pdf' in pdf_source_dir whose same-named '.txt' is absent from
    text_output_dir, utils.extract_text_from_pdf(pdf_path, txt_path) is
    called. A truthy return value is counted as a success; a falsy return or
    a raised exception is counted as a failure. A summary line is printed at
    the end.
    """
    print(f"[{project_name}] Checking PDF source directory ({pdf_source_dir}) against text output directory ({text_output_dir})...")
    extraction_attempts = 0
    successful_extractions = 0
    failed_extractions = 0

    if not os.path.isdir(pdf_source_dir):
        print(f" Warning: PDF source directory not found at '{pdf_source_dir}'. Skipping extraction check.")
    else:
        # Resolve the helper once. Looking it up here (instead of catching
        # AttributeError around the call) keeps an AttributeError raised
        # *inside* the extractor from being misreported as "function not
        # found" and aborting the remaining PDFs.
        extract_text = getattr(utils, 'extract_text_from_pdf', None)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # log output reproducible between runs.
        for pdf_filename in sorted(os.listdir(pdf_source_dir)):
            if not pdf_filename.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(pdf_source_dir, pdf_filename)
            txt_filename = os.path.splitext(pdf_filename)[0] + '.txt'
            txt_path = os.path.join(text_output_dir, txt_filename)
            if os.path.exists(txt_path):
                continue

            extraction_attempts += 1
            print(f" TXT file '{txt_filename}' missing for PDF '{pdf_filename}'. Attempting extraction...")

            if extract_text is None:
                print(" ERROR: utils.extract_text_from_pdf function not found! Cannot extract text.")
                break  # No point in trying the remaining PDFs.

            try:
                success = extract_text(pdf_path, txt_path)
            except Exception as extract_err:
                # Best effort: one bad PDF must not stop the whole check.
                print(f" Error during text extraction for '{pdf_filename}': {extract_err}")
                failed_extractions += 1
            else:
                if success:
                    successful_extractions += 1
                else:
                    failed_extractions += 1

    print(f"[{project_name}] Text extraction check complete. Attempted: {extraction_attempts}, Succeeded: {successful_extractions}, Failed: {failed_extractions}.")


def check_drawings(project_name, manifest_data):
    """
    Check whether each manifest alias occurs anywhere in the combined drawing text.

    The text of every '.txt' file in the project's text output directory is
    concatenated and normalized once with utils.normalize; each item's
    normalized 'alias' is then searched for as a substring of that text.
    Before the check, text extraction is attempted for every PDF in the
    project's PDF directory that has no corresponding TXT file.

    Args:
        project_name: Project identifier; passed to
            utils.get_text_output_dir_path / utils.get_pdf_dir_path and used
            as a prefix in log messages.
        manifest_data: Iterable of dict-like items. Each item's
            'found_drawing' key is set in place to True or False. Items
            without a truthy 'alias' are left at False and are not counted
            as checked.

    Returns:
        None. Results are reported through the 'found_drawing' flags and
        printed progress messages. If manifest_data is empty or None the
        function returns immediately. If the text directory is missing, holds
        no '.txt' files, or the main phase raises, every item is flagged
        False.
    """
    if not manifest_data:
        print(f"[{project_name}] Drawings Check: No manifest data provided.")
        return

    print(f"[{project_name}] Starting Drawings check...")
    text_output_dir = utils.get_text_output_dir_path(project_name)
    pdf_source_dir = utils.get_pdf_dir_path(project_name)
    os.makedirs(text_output_dir, exist_ok=True)  # Ensure output dir exists

    # --- Preliminary step: ensure a TXT exists for every PDF ---
    _extract_missing_text(project_name, pdf_source_dir, text_output_dir)

    # --- Main check: scan all available TXT files and compare aliases ---
    print(f"[{project_name}] Reading and combining text from all .txt files in: {text_output_dir}...")
    try:
        if not os.path.isdir(text_output_dir):
            print(f" Error: Text output directory not found: {text_output_dir}. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        # Sorted so the combined text (and any match that happens to span a
        # file boundary) does not depend on directory listing order.
        txt_files = sorted(
            name for name in os.listdir(text_output_dir)
            if name.lower().endswith('.txt')
        )
        if not txt_files:
            print(" Warning: No .txt files found in the directory. Cannot perform drawing check.")
            _mark_all_not_found(manifest_data)
            return

        # Step 1: read every file; unreadable files are skipped with a warning.
        contents = []
        for filename in txt_files:
            filepath = os.path.join(text_output_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    contents.append(f.read())
            except (OSError, UnicodeError) as e:
                print(f" Warning: Could not read or process text file {filepath}: {e}")
        print(f" Read content from {len(contents)} out of {len(txt_files)} total .txt files found.")

        # Each file's text is followed by a newline separator; a single join
        # avoids repeated string reallocation.
        all_raw_content = "".join(content + "\n" for content in contents)

        # Step 2: normalize the entire combined content ONCE.
        print(" Normalizing combined text content...")
        all_normalized_content = utils.normalize(all_raw_content)
        print(f" Normalization complete. Total normalized length: {len(all_normalized_content)} chars.")

        # Step 3: check each manifest alias against the normalized content.
        found_count = 0
        checked_count = 0
        for item in manifest_data:
            item['found_drawing'] = False  # Default until proven otherwise.
            alias = item.get('alias')  # Lowercase 'alias' key
            if not alias:
                continue
            checked_count += 1
            normalized_alias = utils.normalize(alias)
            # An alias that normalizes to an empty value would match
            # everything as a substring, so it is treated as not found.
            if normalized_alias and normalized_alias in all_normalized_content:
                item['found_drawing'] = True
                found_count += 1

        print(f"[{project_name}] Drawings check finished. Checked {checked_count} aliases. Found {found_count} aliases within the combined drawing text.")
    except Exception as e:
        # Boundary handler: report the failure and leave no stale True flags.
        print(f" Error during drawings check main phase: {e}")
        _mark_all_not_found(manifest_data)