import os
import threading
import time
import json
import git
import re # Import re for project name validation
from flask import Flask, render_template, jsonify, Response, request # Add request
from werkzeug.utils import secure_filename # For securing filenames
from concurrent.futures import ThreadPoolExecutor # Import ThreadPoolExecutor
# Import configurations and new modules
import config
import utils
from manifest_reader import read_manifest
from scada_checker import check_scada
from drawing_checker import check_drawings
from progress_calculator import calculate_combined_progress
app = Flask(__name__)
# --- Global state (Per-Project) ---
# Dictionaries keyed by project name
project_last_commit = {}
project_progress_data = {}
project_status = {}
all_projects = utils.discover_projects() # Discover projects at startup
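# NOTE: discover_projects() presumably scans config.PROJECTS_ROOT_DIR for project
# subdirectories (an assumption; see utils). Projects added later via the
# /add_project endpoint are only picked up after a server restart.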
# Initialize state for discovered projects
def get_default_progress():
    """Return a fresh copy of the default progress structure."""
    return {
        "overall": {"total_csv": 0, "found_both": 0, "found_scada_only": 0, "found_drawing_only": 0, "missing_both": 0, "percentage_found_both": 0, "missing_list": [], "found_scada_only_list": [], "found_drawing_only_list": [], "found_both_list": []},
        "panels": {}
    }

for proj_name in all_projects:
    project_last_commit[proj_name] = None
    project_progress_data[proj_name] = get_default_progress()
    project_status[proj_name] = "Initializing..."

repo_lock = threading.Lock() # Lock remains global for now, managing access to shared dicts
data_updated_event = threading.Event() # Event signals ANY project update
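# The event is "pulsed": set() immediately followed by clear(). Threads already
# blocked in wait() are released; a thread that starts waiting after the clear()
# simply blocks until the next pulse, so a busy SSE client may coalesce several
# rapid updates into one.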
# Define max workers for thread pools
MAX_INITIAL_CHECK_WORKERS = 5 # Adjust as needed
MAX_PERIODIC_CHECK_WORKERS = 5 # Adjust as needed

# --- Core Logic Orchestration (Per-Project) ---
def update_progress_data(project_name):
    """Reads manifest, runs checks, combines results for a specific project."""
    global project_progress_data, project_status  # Reference the global dicts
    current_status = ""
    new_data_calculated = None
    print(f"[{project_name}] Starting analysis workflow...")

    # 1. Read Manifest
    set_status(project_name, "Reading manifest file...")
    manifest_data = read_manifest(project_name)
    if manifest_data is None:
        current_status = f"[{project_name}] Error: Failed to read or process manifest file."
        print(current_status)
        set_status(project_name, current_status)
        # Reset progress data for this project on manifest error
        with repo_lock:
            project_progress_data[project_name] = get_default_progress()
        data_updated_event.set(); data_updated_event.clear()  # Signal update (error status + reset data)
        return  # Cannot proceed without manifest

    # 2. Check SCADA (JSON files)
    set_status(project_name, "Checking SCADA views...")
    check_scada(project_name, manifest_data)

    # 3. Check Drawings (TXT files)
    set_status(project_name, "Checking drawing text files...")
    check_drawings(project_name, manifest_data)

    # 4. Calculate Combined Progress
    set_status(project_name, "Calculating combined progress...")
    try:
        new_data_calculated = calculate_combined_progress(project_name, manifest_data)
        if new_data_calculated:
            current_status = f"[{project_name}] Analysis complete at {time.strftime('%Y-%m-%d %H:%M:%S')}"
        else:
            current_status = f"[{project_name}] Warning: Progress calculation yielded no results (manifest might be empty)."
            new_data_calculated = get_default_progress()  # Reset to default empty structure
    except Exception as e:
        current_status = f"[{project_name}] Error during progress calculation: {e}"
        print(f"Detailed Calculation Error: {e}")  # Log stack trace (removed exc_info)
        new_data_calculated = None  # Ensure no partial data update

    # Update global state atomically for this project
    with repo_lock:
        print(current_status)
        # Update status first (always)
        project_status[project_name] = current_status
        # Update progress data only if calculation was successful or yielded default empty
        if new_data_calculated is not None:
            project_progress_data[project_name] = new_data_calculated
    # Signal update regardless of calculation success if status changed or data changed
    data_updated_event.set()
    data_updated_event.clear()

def set_status(project_name, message):
    """Helper to update status message for a project and signal change."""
    global project_status
    with repo_lock:
        if project_status.get(project_name) != message:
            print(f"[{project_name}] Status: {message}")
            project_status[project_name] = message
            data_updated_event.set()
            data_updated_event.clear()

# --- Git Repo Handling (Per-Project) ---
def check_and_update_repo(project_name):
    """Checks and updates the Git repository for a specific project, minimizing lock contention."""
    global project_last_commit, project_status  # Reference global dicts
    repo_path = utils.get_repo_path(project_name)
    repo_url = config.REPO_URL  # Assuming global for now
    branch = config.BRANCH  # Assuming global for now
    did_update = False  # Flag to track if files were actually updated
    initial_hash = None
    with repo_lock:  # Briefly lock to get initial hash
        initial_hash = project_last_commit.get(project_name)

    try:
        project_base_path = utils.get_project_base_path(project_name)
        if not os.path.exists(project_base_path):
            # Use set_status which handles locking
            set_status(project_name, f"Error: Project directory not found: {project_base_path}")
            return False  # Cannot proceed

        # Ensure parent directory exists (outside lock)
        os.makedirs(os.path.dirname(repo_path), exist_ok=True)
        repo_existed = os.path.exists(os.path.join(repo_path, ".git"))
        if not repo_existed:
            print(f"[{project_name}] Cloning repository {repo_url} into {repo_path}...")
            set_status(project_name, "Cloning repository...")
            # --- Clone happens OUTSIDE lock ---
            try:
                git.Repo.clone_from(repo_url, repo_path, branch=branch)
                repo = git.Repo(repo_path)
                new_commit_hash = repo.head.commit.hexsha
                with repo_lock:  # Lock ONLY to update shared state
                    project_last_commit[project_name] = new_commit_hash
                print(f"[{project_name}] Initial clone complete. Commit: {new_commit_hash}")
                did_update = True
            except git.GitCommandError as clone_err:
                set_status(project_name, f"Error cloning repository: {clone_err}")
                print(f"[{project_name}] Git clone error: {clone_err}")
                # Ensure commit state reflects error if needed
                with repo_lock:
                    if project_last_commit.get(project_name) is None:
                        project_last_commit[project_name] = "Clone Error"
                return False  # Indicate no update occurred
            # --- End Clone ---
        else:
            # --- Fetch/Pull Logic ---
            repo = git.Repo(repo_path)
            current_local_commit = repo.head.commit.hexsha
            # Ensure initial hash is set if missing (brief lock)
            with repo_lock:
                if project_last_commit.get(project_name) is None:
                    project_last_commit[project_name] = current_local_commit
                    initial_hash = current_local_commit  # Update local var too

            print(f"[{project_name}] Fetching updates from remote...")
            set_status(project_name, "Checking for updates...")
            origin = repo.remotes.origin
            # --- Fetch happens OUTSIDE lock ---
            try:
                fetch_info = origin.fetch()
            except git.GitCommandError as fetch_err:
                set_status(project_name, f"Error fetching remote: {fetch_err}")
                print(f"[{project_name}] Git fetch error: {fetch_err}")
                return False  # No update occurred
            # --- End Fetch ---
            # --- Check commits (brief lock) ---
            current_remote_commit = None
            pull_needed = False
            try:
                # Must read remote commit *after* fetch
                current_remote_commit = repo.commit(f'origin/{branch}').hexsha
                # Check if pull is needed inside the try block after getting remote commit
                if current_local_commit != current_remote_commit:
                    pull_needed = True
            except git.GitCommandError as commit_err:
                set_status(project_name, f"Error accessing remote branch origin/{branch}: {commit_err}")
                print(f"[{project_name}] Error accessing remote branch: {commit_err}")
                return False  # Cannot compare/pull
            # --- End Check commits ---

            print(f"[{project_name}] Local commit: {current_local_commit}, Remote commit (origin/{branch}): {current_remote_commit}")
            if pull_needed:
                print(f"[{project_name}] New commit detected! Pulling changes...")
                set_status(project_name, "Pulling updates...")
                # --- Pull happens OUTSIDE lock ---
                try:
                    pull_info = origin.pull()
                    new_commit_hash = repo.head.commit.hexsha  # Get hash after pull
                    with repo_lock:  # Lock ONLY to update shared state
                        project_last_commit[project_name] = new_commit_hash
                    print(f"[{project_name}] Pull successful. New commit: {new_commit_hash}")
                    did_update = True
                except git.GitCommandError as pull_err:
                    set_status(project_name, f"Error pulling repository: {pull_err}")
                    print(f"[{project_name}] Git pull error: {pull_err}")
                    # Revert shared state hash if pull failed? Safest is to keep the pre-pull local commit.
                    with repo_lock:
                        project_last_commit[project_name] = current_local_commit  # Revert to known local state before pull attempt
                    # Keep did_update = False
                # --- End Pull ---
            else:
                print(f"[{project_name}] No new commits detected.")
                # Update status only if it wasn't an error before (set_status handles lock)
                current_status = project_status.get(project_name, "")
                if not current_status.startswith("Error"):
                    set_status(project_name, f"Checked repo at {time.strftime('%Y-%m-%d %H:%M:%S')}. No changes.")
            # --- End Fetch/Pull Logic ---

        # --- Run analysis IF repo was updated (outside lock) ---
        if did_update:
            print(f"[{project_name}] Repository updated. Triggering analysis...")
            update_progress_data(project_name)  # Calls the orchestrator function
    except git.InvalidGitRepositoryError:
        msg = f"Error: Directory '{repo_path}' exists but is not a valid Git repository. Consider deleting it and restarting."
        set_status(project_name, msg)  # Handles lock
        print(f"[{project_name}] {msg}")
        with repo_lock:  # Lock to update commit state
            project_last_commit[project_name] = "Invalid Repository"
    except git.GitCommandError as e:
        # General Git command error (if not caught above)
        msg = f"Git command error: {e}"
        set_status(project_name, msg)  # Handles lock
        print(f"[{project_name}] {msg}")
        # Try to set commit hash state even on error (brief lock)
        with repo_lock:
            if project_last_commit.get(project_name) is None:  # Only set if not already set (e.g., by failed pull)
                try:
                    if os.path.exists(os.path.join(repo_path, ".git")):
                        repo = git.Repo(repo_path)
                        project_last_commit[project_name] = repo.head.commit.hexsha
                    else:
                        project_last_commit[project_name] = "Error (No repo)"
                except Exception:
                    project_last_commit[project_name] = "Error reading commit"
    except Exception as e:
        # Catch-all for other unexpected errors
        msg = f"Unexpected error checking repository: {e}"
        set_status(project_name, msg)  # Handles lock
        print(f"[{project_name}] {msg}")  # Only the message is logged here; no stack trace
        with repo_lock:  # Lock to update commit state
            if project_last_commit.get(project_name) is None:
                project_last_commit[project_name] = "Error checking repo"

    # Return true if analysis was run (because repo changed), false otherwise
    return did_update

def periodic_repo_check():
    """Runs the check_and_update_repo function periodically for all projects using a thread pool."""
    global all_projects
    # Use a ThreadPoolExecutor to manage periodic checks concurrently
    with ThreadPoolExecutor(max_workers=MAX_PERIODIC_CHECK_WORKERS) as executor:
        while True:
            print(f"\nStarting periodic check cycle for all projects (Interval: {config.CHECK_INTERVAL_SECONDS}s)...")
            current_projects = list(all_projects)  # Copy list in case it changes
            futures = []
            for project_name in current_projects:
                print(f"--- Submitting periodic check for project: {project_name} ---")
                # Submit check_and_update_repo to the thread pool
                futures.append(executor.submit(run_check_and_log_errors, project_name, "periodic"))
            # Wait briefly for tasks to start, but don't block the loop long
            # time.sleep(1)  # Optional: short sleep if needed
            print(f"Periodic check cycle submitted. Sleeping for {config.CHECK_INTERVAL_SECONDS}s...")
            time.sleep(config.CHECK_INTERVAL_SECONDS)
            # Note: We don't explicitly wait for futures to complete here.
            # The pool manages threads, and the loop continues periodically.
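
# Note on periodic_repo_check: futures are intentionally not awaited, so a check that
# takes longer than CHECK_INTERVAL_SECONDS can overlap with the next cycle for the
# same project; concurrency is bounded by MAX_PERIODIC_CHECK_WORKERS.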

def run_check_and_log_errors(project_name, check_type="initial"):
    """Wrapper to run check_and_update_repo and log any exceptions."""
    try:
        print(f"--- [{check_type.capitalize()}] Running check for project: {project_name} ---")
        check_and_update_repo(project_name)
        print(f"--- [{check_type.capitalize()}] Finished check for project: {project_name} ---")
    except Exception as e:
        err_msg = f"Critical error during {check_type} check for {project_name}: {e}"
        print(err_msg)
        # Use set_status which handles locking and event signaling
        set_status(project_name, f"Error during {check_type} check: {e}")

def initial_project_setup_and_analysis(project_name):
    """Performs initial repo check/update AND ensures initial analysis runs."""
    try:
        print(f"--- [Initial Setup] Starting for project: {project_name} ---")
        # Run check_and_update_repo first. It returns True if it triggered an update/analysis.
        update_occurred = check_and_update_repo(project_name)
        # If no update occurred (repo was cloned before or was already up-to-date),
        # we still need to run the analysis once on startup.
        if not update_occurred:
            print(f"--- [Initial Analysis] Repo up-to-date or non-git. Running analysis for project: {project_name} ---")
            update_progress_data(project_name)  # Run the analysis explicitly
        print(f"--- [Initial Setup] Finished for project: {project_name} ---")
    except Exception as e:
        err_msg = f"Critical error during initial setup/analysis for {project_name}: {e}"
        print(err_msg)
        set_status(project_name, f"Error during initial setup: {e}")

# --- Flask Routes ---
@app.route('/')
def index():
    # Pass the list of projects and initial statuses to the template
    with repo_lock:
        initial_statuses = dict(project_status)  # Get a consistent snapshot
        project_list = list(all_projects)
    return render_template('index.html', projects=project_list, initial_statuses=initial_statuses)
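
# index() only renders the initial snapshot; after page load the browser receives
# live updates through the /stream SSE endpoint below.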
# Removed redundant routes for /drawings and /conflicts as index.html handles tabs

@app.route('/stream')
def stream():
    def event_stream():
        # Track state sent to *this specific client* (using a copy of global state)
        last_sent_state = {}
        # Send initial state immediately on connection
        with repo_lock:
            # Send data for all known projects
            current_global_state = {
                "projects": list(all_projects),
                "status": dict(project_status),
                "progress": dict(project_progress_data),
                "last_commit": dict(project_last_commit)
            }
            initial_payload = json.dumps(current_global_state)
        yield f"data: {initial_payload}\n\n"
        last_sent_state = current_global_state  # Store the state sent to this client
        print(f"Sent initial state to new client for projects: {last_sent_state.get('projects')}")

        # Now wait for subsequent updates signaled by the event
        while True:
            data_updated_event.wait()  # Wait for ANY background thread signal
            with repo_lock:  # Re-acquire lock to get the latest state
                current_global_state = {
                    "projects": list(all_projects),
                    "status": dict(project_status),
                    "progress": dict(project_progress_data),
                    "last_commit": dict(project_last_commit)
                }
                # Basic check: Compare entire state dictionaries (can be refined if needed)
                # Using json.dumps for a quick deep comparison, might be slow for huge data
                current_state_json = json.dumps(current_global_state, sort_keys=True)
                last_sent_state_json = json.dumps(last_sent_state, sort_keys=True)
            if current_state_json != last_sent_state_json:
                print("Global state changed. Sending update to client.")
                # print(f"Debug: Old state: {last_sent_state_json}")  # Optional debug
                # print(f"Debug: New state: {current_state_json}")  # Optional debug
                yield f"data: {current_state_json}\n\n"
                last_sent_state = current_global_state  # Update the state sent to this client
            # else:  # Log if event triggered but nothing changed
            #     print("Data update event triggered, but state unchanged for this client.")

    return Response(event_stream(), mimetype="text/event-stream")
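
# Note: with the built-in Flask server, each connected SSE client occupies one request
# thread for the lifetime of its connection; app.run(..., threaded=True) at the bottom
# of this file is what allows several clients (plus normal page loads) at once.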
# --- NEW: Add Project Endpoint ---
ALLOWED_PROJECT_NAME_REGEX = re.compile(r'^[a-zA-Z0-9_-]+$')
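# For example, 'Plant_42-North' is accepted; names containing spaces, dots, slashes
# or any other characters are rejected.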

@app.route('/add_project', methods=['POST'])
def add_project():
    if 'projectName' not in request.form:
        return jsonify(success=False, message="Missing project name."), 400
    if 'repoUrl' not in request.form:  # We receive it but don't use it for cloning yet
        return jsonify(success=False, message="Missing repository URL."), 400
    if 'manifestFile' not in request.files:
        return jsonify(success=False, message="Missing manifest CSV file."), 400

    project_name_raw = request.form['projectName'].strip()
    repo_url = request.form['repoUrl'].strip()
    manifest_file = request.files['manifestFile']
    pdf_files = request.files.getlist('pdfFiles')  # Use getlist for multiple files

    # --- Validation ---
    if not project_name_raw:
        return jsonify(success=False, message="Project name cannot be empty."), 400
    if not ALLOWED_PROJECT_NAME_REGEX.match(project_name_raw):
        return jsonify(success=False, message="Invalid Project Name. Use only letters, numbers, underscores, or hyphens."), 400
    if not manifest_file.filename or not manifest_file.filename.lower().endswith('.csv'):
        return jsonify(success=False, message="Manifest file must be a .csv file."), 400
    if not pdf_files or all(not f.filename for f in pdf_files):  # Check if list is empty or contains only empty filenames
        return jsonify(success=False, message="At least one PDF file must be provided."), 400
    for pdf_file in pdf_files:
        if not pdf_file.filename or not pdf_file.filename.lower().endswith('.pdf'):
            return jsonify(success=False, message=f"Invalid file type uploaded: {pdf_file.filename}. Only PDF files allowed."), 400

    # Use secure_filename for the project name used in paths.
    # Although we validated with regex, this adds another layer against path traversal etc.
    safe_project_name = secure_filename(project_name_raw)
    if safe_project_name != project_name_raw:  # Extra check if secure_filename modified it unexpectedly (e.g., spaces removed)
        print(f"Warning: Project name sanitized from '{project_name_raw}' to '{safe_project_name}'")
        # Optionally reject here, or proceed with the sanitized name

    project_base_path = os.path.join(config.PROJECTS_ROOT_DIR, safe_project_name)
    pdf_dir_path = os.path.join(project_base_path, 'pdfs')
    repo_dir_path = os.path.join(project_base_path, 'repo')  # Create repo dir, but don't clone yet

    # --- Check if project already exists ---
    if os.path.exists(project_base_path):
        return jsonify(success=False, message=f"Project '{safe_project_name}' already exists."), 400
    # --- Create Directories ---
    try:
        print(f"Creating directory structure for project: {safe_project_name}")
        os.makedirs(project_base_path, exist_ok=False)  # Base dir first, fail if exists
        os.makedirs(pdf_dir_path, exist_ok=True)
        os.makedirs(repo_dir_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating directories for {safe_project_name}: {e}")
        return jsonify(success=False, message=f"Server error creating project directories: {e}"), 500

    # --- Save Manifest File ---
    try:
        manifest_filename = secure_filename(manifest_file.filename)
        manifest_save_path = os.path.join(project_base_path, manifest_filename)
        print(f"Saving manifest file to: {manifest_save_path}")
        manifest_file.save(manifest_save_path)
    except Exception as e:
        print(f"Error saving manifest file for {safe_project_name}: {e}")
        # Clean up created directories on error?
        # shutil.rmtree(project_base_path, ignore_errors=True)
        return jsonify(success=False, message=f"Error saving manifest file: {e}"), 500

    # --- Save PDF Files ---
    saved_pdfs = []
    try:
        for pdf_file in pdf_files:
            if pdf_file and pdf_file.filename:  # Check again if file is valid
                pdf_filename = secure_filename(pdf_file.filename)
                pdf_save_path = os.path.join(pdf_dir_path, pdf_filename)
                print(f"Saving PDF file to: {pdf_save_path}")
                pdf_file.save(pdf_save_path)
                saved_pdfs.append(pdf_filename)
    except Exception as e:
        print(f"Error saving PDF files for {safe_project_name}: {e}")
        # Clean up potentially partially saved files and directories?
        # shutil.rmtree(project_base_path, ignore_errors=True)
        return jsonify(success=False, message=f"Error saving PDF files: {e}"), 500

    # --- Store Repo URL (optional, e.g., in a simple info file) ---
    try:
        info_file_path = os.path.join(project_base_path, 'project_info.txt')
        with open(info_file_path, 'w') as f:
            f.write(f"ProjectName: {safe_project_name}\n")
            f.write(f"RepoURL: {repo_url}\n")
        print(f"Saved project info (including repo URL) to: {info_file_path}")
    except Exception as e:
        print(f"Warning: Could not save project_info.txt for {safe_project_name}: {e}")
        # Don't treat this as a fatal error for the add operation itself

    print(f"Successfully added project '{safe_project_name}' with {len(saved_pdfs)} PDF(s).")
    # NOTE: Server needs restart for this new project to be discovered and processed.
    return jsonify(success=True, message=f"Project '{safe_project_name}' created successfully.")
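
# NOTE: the uploaded repoUrl is currently only recorded in project_info.txt;
# cloning in check_and_update_repo() still uses the global config.REPO_URL.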

# --- Main Execution ---
if __name__ == '__main__':
    # Ensure project-specific directories (like text output) exist if needed
    # This is now handled within drawing_checker

    # Perform initial check/clone and data load FOR EACH PROJECT in parallel
    print("--- Performing initial checks and analysis for all discovered projects in background threads ---")
    if not all_projects:
        print("Warning: No projects discovered in projects directory.")
    else:
        # Use a ThreadPoolExecutor for initial setup.
        # NOTE: a 'with' block is deliberately avoided here -- exiting it would call
        # shutdown(wait=True) and block until every initial check finished, delaying
        # server startup. shutdown(wait=False) lets the already-submitted tasks keep
        # running in the background while Flask starts.
        initial_executor = ThreadPoolExecutor(max_workers=MAX_INITIAL_CHECK_WORKERS, thread_name_prefix='InitialCheck')
        for proj_name in all_projects:
            print(f"--- Submitting initial setup for project: {proj_name} ---")
            # Submit the combined setup and analysis function to the pool
            initial_executor.submit(initial_project_setup_and_analysis, proj_name)
        initial_executor.shutdown(wait=False)

    # Start the background thread for PERIODIC checks (now uses its own thread pool internally)
    print("--- Starting background periodic check manager thread ---")
    # This thread now manages submitting tasks to its own pool
    repo_check_thread = threading.Thread(target=periodic_repo_check, daemon=True, name="PeriodicCheckManager")
    repo_check_thread.start()

    # Run the Flask app - This will start *before* initial checks might be complete
    print("--- Starting Flask server on http://0.0.0.0:5050 ... ---")
    # Ensure Flask runs threaded to handle multiple requests (like SSE connections)
    app.run(host='0.0.0.0', port=5050, debug=False, threaded=True)