"""Project discovery, per-project path helpers, and PDF text extraction."""

import glob  # Used for finding CSV files
import os
import re
import traceback
from typing import List, Optional

# pypdf is needed for text extraction
from pypdf import PdfReader

import config


def discover_projects() -> List[str]:
    """Discover projects by listing subdirectories of config.PROJECTS_ROOT_DIR.

    Every immediate subdirectory is assumed to be a project; no check is made
    for specific files or folders inside it.

    Returns:
        The subdirectory names in sorted order, or an empty list when the
        root path is missing or is not a directory.
    """
    root_dir = config.PROJECTS_ROOT_DIR
    # isdir (rather than exists) also rejects a root path that points at a
    # regular file, which would otherwise make os.listdir raise.
    if not os.path.isdir(root_dir):
        print(f"Warning: Projects root directory not found: {config.PROJECTS_ROOT_DIR}")
        return []

    # os.listdir returns entries in arbitrary order; sort for a stable result.
    projects = sorted(
        entry
        for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    )
    print(f"Discovered projects: {projects}")
    return projects


def get_project_base_path(project_name: str) -> str:
    """Return the project's directory: PROJECTS_ROOT_DIR joined with its name.

    The result is absolute only if config.PROJECTS_ROOT_DIR is absolute.
    """
    return os.path.join(config.PROJECTS_ROOT_DIR, project_name)


def get_repo_path(project_name: str) -> str:
    """Return the path to the project's repository directory.

    The repository is assumed to live in a subdirectory named 'repo' inside
    the project base directory.
    """
    return os.path.join(get_project_base_path(project_name), "repo")


def find_csv_path(project_name: str) -> Optional[str]:
    """Find a CSV file directly inside the project's base directory.

    Returns:
        The path of the alphabetically first '*.csv' match (a warning is
        printed when there are several), or None when there is no match.
    """
    project_base = get_project_base_path(project_name)
    # glob.glob returns matches in arbitrary order; sort so that "the first
    # one" is the same file on every run and every platform.
    csv_files = sorted(glob.glob(os.path.join(project_base, '*.csv')))

    if not csv_files:
        print(f"Error: No CSV file found in project directory: {project_base}")
        return None

    if len(csv_files) > 1:
        print(f"Warning: Multiple CSV files found in {project_base}. Using the first one: {csv_files[0]}")
    return csv_files[0]


def get_views_dir_path(project_name: str) -> str:
    """Return the path to the SCADA views directory inside the project's repo.

    The repo directory is searched for a subdirectory whose name ends with
    '_SCADA' (case-insensitive, e.g. 'MTN6_SCADA'); the alphabetically first
    match is used. If none is found, or the repo cannot be listed, the name
    '<project_name>_SCADA' is used as a best guess, so the returned path may
    not exist.

    Returns:
        The SCADA data directory joined with config.VIEWS_DIR_RELATIVE.
    """
    repo_path = get_repo_path(project_name)
    scada_data_dir = None

    try:
        # Sorted so the choice is deterministic when several directories match.
        for item in sorted(os.listdir(repo_path)):
            item_path = os.path.join(repo_path, item)
            if os.path.isdir(item_path) and item.upper().endswith('_SCADA'):
                scada_data_dir = item_path
                print(f"[{project_name}] Found SCADA data directory: {scada_data_dir}")
                break  # Use the first one found
    except FileNotFoundError:
        # Fall through to the best-guess path below.
        print(f"Warning: Repo path not found for project '{project_name}' at '{repo_path}' when searching for SCADA dir.")
    except Exception as e:
        # Deliberately broad: the lookup is best-effort and must not raise.
        print(f"Warning: Error searching for SCADA dir in '{repo_path}': {e}")

    if not scada_data_dir:
        print(f"Warning: Could not automatically find a *_SCADA directory in {repo_path}. Using fallback path structure.")
        # Fallback: guess the folder name from the project name.
        scada_data_dir = os.path.join(repo_path, f"{project_name}_SCADA")

    # Append the common relative path from config.
    return os.path.join(scada_data_dir, config.VIEWS_DIR_RELATIVE)


def get_text_output_dir_path(project_name: str) -> str:
    """Return the path to the project's extracted drawing text directory.

    Built from the project base directory and
    config.TEXT_OUTPUT_FOLDER_RELATIVE.
    """
    return os.path.join(get_project_base_path(project_name), config.TEXT_OUTPUT_FOLDER_RELATIVE)


def get_pdf_dir_path(project_name: str) -> str:
    """Return the path to the project's source PDF directory.

    ASSUMPTION: PDFs are stored in a 'pdfs' subdirectory of the project base
    directory. Adjust 'pdfs' if the actual directory name is different.
    """
    return os.path.join(get_project_base_path(project_name), 'pdfs')


def normalize(text: object) -> str:
    """Normalize a string for comparison.

    Lowercases the text, treats '-' and '_' as the same character, and
    removes all whitespace (including newlines).

    Returns:
        The normalized string, or "" when ``text`` is not a str.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    unified = lowered.replace('-', '_')  # Hyphens and underscores compare equal
    return re.sub(r'\s+', '', unified)  # Drop ALL whitespace characters


def extract_text_from_pdf(pdf_path: str, txt_path: str) -> bool:
    """Extract text from a single PDF file and save it to a TXT file.

    Progress and errors are reported with print(); no exception is
    propagated to the caller.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to write.

    Returns:
        True when the TXT file was written (including an empty file when the
        PDF yielded no text); False when the PDF could not be opened or
        decrypted, the file could not be written, or an unexpected error
        occurred.
    """
    base_filename = os.path.basename(pdf_path)
    print(f" [Extractor] Attempting to process: {base_filename}")
    success = False  # Track overall success

    try:
        # --- Step 1: Open and Decrypt (if necessary) ---
        try:
            print(f" [Extractor] Opening PDF: {base_filename}")
            reader = PdfReader(pdf_path)
            print(f" [Extractor] PDF opened successfully: {base_filename}")
        except Exception as open_err:
            print(f" [Extractor] CRITICAL ERROR opening PDF {base_filename}: {open_err}")
            # Log traceback for detailed debugging
            traceback.print_exc()
            return False  # Cannot proceed

        if reader.is_encrypted:
            print(f" [Extractor] PDF is encrypted: {base_filename}. Attempting decryption...")
            try:
                # Try decrypting with empty password - adjust if needed
                decrypt_result = reader.decrypt('')
            except Exception as decrypt_err:
                print(f" [Extractor] WARNING: Could not decrypt PDF {base_filename}: {decrypt_err}. Skipping.")
                return False  # Treat decryption failure as critical for this file
            # decrypt() reports a rejected password through a falsy return
            # value rather than an exception, so the result must be checked
            # before claiming success.
            if not decrypt_result:
                print(f" [Extractor] WARNING: Could not decrypt PDF {base_filename}: empty password was rejected. Skipping.")
                return False
            print(f" [Extractor] Decryption successful (or not needed) for {base_filename}")

        # --- Step 2: Extract Text Page by Page ---
        print(f" [Extractor] Starting page-by-page text extraction for: {base_filename} ({len(reader.pages)} pages)")
        page_texts = []
        for i, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
            except Exception as page_err:
                # A failing page is logged and skipped; the rest of the file
                # is still processed.
                print(f" [Extractor] WARNING: Error extracting text from page {i+1} in {base_filename}: {page_err}")

        extracted_text = "\n".join(page_texts)
        print(f" [Extractor] Finished text extraction for {base_filename}. Total chars extracted: {len(extracted_text)}")

        # An empty TXT file is still written so the PDF is not re-attempted.
        if not extracted_text:
            print(f" [Extractor] WARNING: No text extracted from {base_filename}. An empty TXT file will be created.")

        # --- Step 3: Write to TXT File ---
        print(f" [Extractor] Attempting to write TXT file: {os.path.basename(txt_path)}")
        try:
            with open(txt_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(extracted_text)
            print(f" [Extractor] Successfully wrote TXT file: {os.path.basename(txt_path)}")
            success = True  # Mark as successful
        except Exception as write_err:
            print(f" [Extractor] ERROR writing text file {os.path.basename(txt_path)}: {write_err}")
            success = False  # Failed to write

    except FileNotFoundError:
        # Normally already handled by the open step above; kept as a safety net.
        print(f" [Extractor] ERROR: PDF file not found at {pdf_path}.")
        success = False
    except Exception as e:
        # Catch-all for unexpected errors during the process
        print(f" [Extractor] UNEXPECTED CRITICAL ERROR processing PDF {base_filename}: {e}")
        traceback.print_exc()
        success = False

    print(f" [Extractor] Finished processing {base_filename}. Result: {'Success' if success else 'Failure'}")
    return success