From dedd02274e35e2d12539c50765c51b9e1d5df00e Mon Sep 17 00:00:00 2001
From: nika fartenadze
Date: Sun, 9 Nov 2025 15:12:55 +0400
Subject: [PATCH] Add local changes: scheduler integration, file locking fixes,
 config updates (preserving coworker's html_generator.py changes)

---
 api_server.py            | 246 ++++++++++++++++++++++++++++++++-------
 config.py                |  10 +-
 data_preprocessor.py     |  16 +++
 scheduler.py             |  88 ++++++++++++--
 sharepoint_downloader.py |  64 ++++++++--
 5 files changed, 365 insertions(+), 59 deletions(-)

diff --git a/api_server.py b/api_server.py
index 6c4f74b..e822155 100644
--- a/api_server.py
+++ b/api_server.py
@@ -30,6 +30,7 @@ logger = logging.getLogger(__name__)
 
 app = None
 config = None
+scheduler_thread = None
 
 
 def cleanup_old_reports(output_dir: Path, reports_dir: Path, max_reports: int = 10):
@@ -219,8 +220,38 @@ def create_app(config_path: Optional[str] = None):
                     'sharepoint_error': True
                 }), 500
             except Exception as e:
-                logger.error(f"Failed to download from SharePoint: {e}", exc_info=True)
-                # Check if we have existing files as fallback
+                error_msg = str(e)
+                logger.error(f"Failed to download from SharePoint: {error_msg}", exc_info=True)
+
+                # Check if this is a locked file error
+                is_locked_file_error = 'locked' in error_msg.lower() or 'cannot access the file' in error_msg.lower() or 'being used by another process' in error_msg.lower()
+
+                if is_locked_file_error:
+                    # Extract filename from error if possible
+                    locked_file_match = None
+                    import re
+                    # Try to find filename in error message
+                    match = re.search(r"['\"]([^'\"]*\.xlsx?)['\"]", error_msg)
+                    if match:
+                        locked_file_match = match.group(1)
+
+                    locked_file_info = f" ({locked_file_match})" if locked_file_match else ""
+                    return jsonify({
+                        'error': f'Cannot download from SharePoint: File is locked{locked_file_info}',
+                        'details': f'A file in the reports directory is being used by another program (likely Excel). Please close Excel and any other programs that might have this file open, then try again. Error: {error_msg}',
+                        'instructions': [
+                            '1. Close Microsoft Excel completely',
+                            '2. Close any file explorer windows showing the reports folder',
+                            '3. Wait a few seconds',
+                            '4. Try generating the report again',
+                            '',
+                            'Alternatively, use manual file upload instead of SharePoint download.'
+                        ],
+                        'sharepoint_error': True,
+                        'locked_file_error': True
+                    }), 500
+
+                # Check if we have existing files as fallback (only for non-locked errors)
                 reports_dir_path = Path(report_config.get('reports_dir', 'reports'))
                 if not reports_dir_path.is_absolute():
                     script_dir = Path(__file__).parent.absolute()
@@ -233,13 +264,13 @@ def create_app(config_path: Optional[str] = None):
                         downloaded_files = []  # Continue with existing files
                     else:
                         return jsonify({
-                            'error': f'SharePoint download failed: {str(e)}',
+                            'error': f'SharePoint download failed: {error_msg}',
                             'details': 'No existing files found. Please use manual file upload or fix SharePoint permissions.',
                             'sharepoint_error': True
                         }), 500
                 else:
                     return jsonify({
-                        'error': f'SharePoint download failed: {str(e)}',
+                        'error': f'SharePoint download failed: {error_msg}',
                        'details': 'Reports directory does not exist. Please use manual file upload or fix SharePoint permissions.',
                         'sharepoint_error': True
                     }), 500
@@ -286,59 +317,53 @@ def create_app(config_path: Optional[str] = None):
                             logger.error(f"Failed to clear unexpected file {file.name}: {e}")
             elif not download_from_sp:
                 # Manual upload was used (download_from_sharepoint=False)
-                # Upload endpoint should have cleared old files, but double-check
-                # Only use files uploaded in the last 10 minutes to avoid combining with old files
+                # Upload endpoint should have cleared old files before saving new ones
+                # Use ALL files in the directory (they should all be from the recent upload)
                 if reports_dir_path.exists():
                     excel_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
                     current_time = datetime.now().timestamp()
                     recent_files = []
+
+                    logger.info(f"Manual upload generation: Found {len(excel_files)} file(s) in reports directory")
+
+                    # Only use files modified in the last 2 minutes (very recent = just uploaded)
+                    # This ensures we don't accidentally use SharePoint-downloaded files
                     for excel_file in excel_files:
                         mtime = excel_file.stat().st_mtime
-                        # Only use files modified in the last 10 minutes (should be the uploaded ones)
-                        # Increased from 5 to 10 minutes to account for upload + generation delay
-                        if current_time - mtime < 600:  # 10 minutes
+                        age_seconds = current_time - mtime
+                        # Only use files uploaded in the last 2 minutes (120 seconds)
+                        # This is tight enough to catch only the most recent upload
+                        if age_seconds < 120:  # 2 minutes
                             recent_files.append(excel_file)
                             mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
-                            logger.info(f"  - {excel_file.name} (modified: {mtime_str}) - will be used for manual upload generation")
+                            logger.info(f"  - {excel_file.name} (modified: {mtime_str}, age: {age_seconds:.1f}s) - will be used for manual upload generation")
                         else:
-                            logger.warning(f"  - {excel_file.name} (modified: {datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')}) - skipping (too old, might be from previous run)")
+                            logger.warning(f"  - {excel_file.name} (modified: {datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')}, age: {age_seconds:.1f}s) - skipping (too old, might be from SharePoint download)")
 
+                    # Clear any files that are too old (likely from SharePoint)
                     if len(recent_files) < len(excel_files):
-                        logger.warning(f"Found {len(excel_files)} total file(s), but only {len(recent_files)} are recent. Clearing old files to avoid combining...")
-                        # Clear old files to ensure we only use the manually uploaded ones
+                        logger.warning(f"Found {len(excel_files)} total file(s), but only {len(recent_files)} are recent (< 2 min old). Clearing old files...")
                         for excel_file in excel_files:
                             if excel_file not in recent_files:
                                 try:
                                     excel_file.unlink()
-                                    logger.info(f"Cleared old file: {excel_file.name}")
+                                    logger.info(f"Cleared old file (likely from SharePoint): {excel_file.name}")
                                 except Exception as e:
                                     logger.warning(f"Failed to clear old file {excel_file.name}: {e}")
 
                     if len(recent_files) == 0:
-                        logger.error("Manual upload was used but no recent files found in reports directory!")
+                        logger.error("Manual upload was used but no recent files (< 2 min old) found in reports directory!")
                         logger.error("This might mean:")
                         logger.error("1. Files were not uploaded successfully")
-                        logger.error("2. Files were uploaded but cleared before generation")
+                        logger.error("2. Upload happened more than 2 minutes ago")
                         logger.error("3. File modification times are incorrect")
+                        logger.error("4. SharePoint download happened after upload")
                         return jsonify({
-                            'error': 'No files found for manual upload generation',
-                            'details': 'Files were uploaded but not found in reports directory. Please try uploading again.',
+                            'error': 'No recent files found for manual upload generation',
+                            'details': 'Files were uploaded but not found or are too old. Please try uploading again and generating immediately.',
                             'manual_upload_error': True
                         }), 400
 
-                    # Verify we only have the recently uploaded files
-                    all_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
-                    if len(all_files) != len(recent_files):
-                        logger.warning(f"WARNING: Found {len(all_files)} file(s) but only {len(recent_files)} are recent!")
-                        logger.warning("Clearing old files to ensure only uploaded files are used...")
-                        for file in all_files:
-                            if file not in recent_files:
-                                try:
-                                    file.unlink()
-                                    logger.info(f"Cleared unexpected old file: {file.name}")
-                                except Exception as e:
-                                    logger.error(f"Failed to clear unexpected file {file.name}: {e}")
-
                     logger.info(f"Will generate report from {len(recent_files)} recently uploaded file(s)")
                 else:
                     logger.error("Manual upload was used but reports directory does not exist!")
@@ -358,6 +383,34 @@ def create_app(config_path: Optional[str] = None):
                     'sharepoint_error': True
                 }), 400
 
+            # FINAL VERIFICATION: Before generation, ensure only expected files exist
+            final_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
+            if len(final_files) > 1:
+                logger.error(f"CRITICAL: Found {len(final_files)} Excel file(s) before generation!")
+                logger.error("This will cause data mixing. Files found:")
+                for f in final_files:
+                    mtime = f.stat().st_mtime
+                    mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
+                    logger.error(f"  - {f.name} (modified: {mtime_str})")
+                logger.error("Attempting to keep only the most recent file...")
+
+                # Keep only the newest file
+                final_files_sorted = sorted(final_files, key=lambda f: f.stat().st_mtime, reverse=True)
+                newest_file = final_files_sorted[0]
+                for old_file in final_files_sorted[1:]:
+                    try:
+                        old_file.unlink()
+                        logger.info(f"Removed older file before generation: {old_file.name}")
+                    except Exception as e:
+                        logger.error(f"Failed to remove {old_file.name}: {e}")
+                        return jsonify({
+                            'error': 'Multiple Excel files found and the old ones could not be removed',
+                            'details': f'Found {len(final_files)} files. Please ensure only one file exists. Files may be locked.',
+                            'files_found': [f.name for f in final_files]
+                        }), 400
+
+                logger.warning(f"Proceeding with only the newest file: {newest_file.name}")
+
             report_data = generate_report(
                 reports_dir=str(reports_dir_path),
                 output_file=output_file,
@@ -400,12 +453,16 @@ def create_app(config_path: Optional[str] = None):
 
     @app.route('/api/upload', methods=['POST'])
     def upload_files():
        """Upload Excel files manually. Clears old files before uploading new ones."""
+        logger.info("=== MANUAL UPLOAD REQUEST RECEIVED ===")
         try:
             if 'files' not in request.files:
+                logger.error("Upload request missing 'files' field")
                 return jsonify({'error': 'No files provided'}), 400
 
             files = request.files.getlist('files')
+            logger.info(f"Received {len(files)} file(s) in upload request")
             if not files or all(f.filename == '' for f in files):
+                logger.error("No valid files in upload request")
                 return jsonify({'error': 'No files selected'}), 400
 
             report_config = app.config['REPORT_CONFIG']
@@ -426,33 +483,70 @@ def create_app(config_path: Optional[str] = None):
 
             for old_file in old_excel_files:
                 try:
-                    # On Windows, files might be locked - try multiple times
-                    max_retries = 3
+                    # On Windows, files might be locked - try multiple times with increasing delays
+                    max_retries = 5
                     retry_count = 0
-                    while retry_count < max_retries:
+                    cleared_this_file = False
+
+                    while retry_count < max_retries and not cleared_this_file:
                         try:
                             old_file.unlink()
                             cleared_count += 1
+                            cleared_this_file = True
                             logger.info(f"Cleared old file before upload: {old_file.name}")
                             break
-                        except PermissionError:
+                        except PermissionError as pe:
                             retry_count += 1
                             if retry_count < max_retries:
+                                # Exponential backoff capped at 3s: 1s, 2s, 3s, 3s
                                 import time
-                                time.sleep(0.5)  # Wait 500ms before retry
+                                delay = min(0.5 * (2 ** retry_count), 3.0)
+                                logger.warning(f"File {old_file.name} is locked (attempt {retry_count}/{max_retries}), waiting {delay}s...")
+                                time.sleep(delay)
                             else:
+                                # Last attempt failed - try renaming instead of deleting
+                                logger.warning(f"Cannot delete {old_file.name}, trying to rename instead...")
+                                try:
+                                    import time
+                                    timestamp = int(time.time())
+                                    backup_name = f"{old_file.stem}_backup_{timestamp}{old_file.suffix}"
+                                    backup_path = old_file.parent / backup_name
+                                    old_file.rename(backup_path)
+                                    cleared_count += 1
+                                    cleared_this_file = True
+                                    logger.info(f"Renamed locked file to backup: {old_file.name} -> {backup_name}")
+                                except Exception as rename_error:
+                                    logger.error(f"Could not rename file either: {rename_error}")
+                                    raise pe  # Raise original PermissionError
+                        except Exception as e:
+                            if retry_count >= max_retries - 1:
+                                raise
+                            retry_count += 1
+                            import time
+                            time.sleep(1)
+
+                    if not cleared_this_file:
+                        failed_to_clear.append(old_file.name)
+                        logger.error(f"Failed to clear old file {old_file.name} after {max_retries} attempts")
                 except Exception as e:
-                    failed_to_clear.append(old_file.name)
+                    if old_file.name not in failed_to_clear:
+                        failed_to_clear.append(old_file.name)
                     logger.error(f"Failed to clear old file {old_file.name}: {e}")
 
             # If any files failed to clear, fail the upload to prevent mixing old and new data
             if failed_to_clear:
                 logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before upload: {failed_to_clear}")
+                locked_files_list = ', '.join(failed_to_clear)
                 return jsonify({
-                    'error': f'Failed to clear {len(failed_to_clear)} old file(s) before upload. Please ensure files are not locked or in use.',
+                    'error': f'Cannot upload: {len(failed_to_clear)} file(s) are locked',
                     'failed_files': failed_to_clear,
-                    'details': 'Old files must be cleared before upload to ensure report generation uses only the new file(s). Files may be locked by Excel or another process.'
+                    'details': f'File(s) {locked_files_list} are being used by another program (likely Excel). Please close Excel and any other programs that might have these files open, then try again.',
+                    'instructions': [
+                        '1. Close Microsoft Excel completely',
+                        '2. Close any file explorer windows showing these files',
+                        '3. Wait a few seconds',
+                        '4. Try uploading again'
+                    ]
                 }), 500
 
             if cleared_count > 0:
@@ -460,6 +554,30 @@ def create_app(config_path: Optional[str] = None):
             else:
                 logger.info("No old Excel files found to clear (reports directory was empty)")
 
+            # VERIFY: Double-check that all Excel files are actually gone
+            remaining_files = list(reports_dir.glob('*.xlsx')) + list(reports_dir.glob('*.xls'))
+            if remaining_files:
+                logger.error(f"CRITICAL: After clearing, {len(remaining_files)} file(s) still exist: {[f.name for f in remaining_files]}")
+                logger.error("These files are likely locked. Attempting force removal...")
+                force_failed = []
+                for remaining_file in remaining_files:
+                    try:
+                        remaining_file.unlink()
+                        logger.info(f"Force-removed locked file: {remaining_file.name}")
+                    except Exception as e:
+                        force_failed.append(remaining_file.name)
+                        logger.error(f"CRITICAL: Cannot remove locked file {remaining_file.name}: {e}")
+
+                if force_failed:
+                    logger.error(f"CRITICAL: {len(force_failed)} file(s) still locked after force removal: {force_failed}")
+                    return jsonify({
+                        'error': f'Cannot upload: {len(force_failed)} file(s) are locked and cannot be deleted',
+                        'failed_files': force_failed,
+                        'details': 'Please close Excel or any other program using these files, then try again.'
+                    }), 500
+
+            logger.info("✓ Verified: All old Excel files cleared successfully before upload")
+
             uploaded_count = 0
             uploaded_files = []
 
@@ -475,10 +593,20 @@ def create_app(config_path: Optional[str] = None):
 
                 # Save file to reports directory
                 file_path = reports_dir / filename
+                logger.info(f"Saving uploaded file: {filename} -> {file_path}")
                 file.save(str(file_path))
-                uploaded_count += 1
-                uploaded_files.append(filename)
-                logger.info(f"Uploaded file: {filename} -> {file_path}")
+
+                # Verify file was saved and get its modification time
+                if file_path.exists():
+                    mtime = file_path.stat().st_mtime
+                    mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
+                    file_size = file_path.stat().st_size
+                    logger.info(f"Successfully saved file: {filename} (size: {file_size} bytes, modified: {mtime_str})")
+                    uploaded_count += 1
+                    uploaded_files.append(filename)
+                else:
+                    logger.error(f"CRITICAL: File was not saved! {file_path} does not exist after save()")
+                    raise Exception(f"Failed to save file {filename}")
 
             if uploaded_count == 0:
                 return jsonify({'error': 'No valid Excel files uploaded'}), 400
@@ -664,10 +792,44 @@ def create_app(config_path: Optional[str] = None):
 
     return app
 
 
+def start_scheduler(config_path: Optional[str] = None):
+    """Start the scheduler in a background thread."""
+    global scheduler_thread
+
+    scheduler_config = config.get('scheduler', {})
+    if not scheduler_config.get('enabled'):
+        logger.info("Scheduler is disabled in configuration")
+        return
+
+    try:
+        from scheduler import ReportScheduler
+        import threading
+
+        def run_scheduler():
+            try:
+                scheduler = ReportScheduler(config_path=config_path)
+                scheduler.start()
+            except Exception as e:
+                logger.error(f"Scheduler error: {e}", exc_info=True)
+
+        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
+        scheduler_thread.start()
+        logger.info("Scheduler started in background thread")
+    except ImportError:
+        logger.warning("Scheduler module not available. Install apscheduler to enable scheduling.")
+    except Exception as e:
+        logger.error(f"Failed to start scheduler: {e}", exc_info=True)
+
+
 def run_server(config_path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None):
     """Run the API server."""
+    global app, config
+
     app = create_app(config_path)
 
+    # Start scheduler if enabled
+    start_scheduler(config_path)
+
     api_config = config.get('api', {})
     server_host = host or api_config.get('host', '0.0.0.0')
     server_port = port or api_config.get('port', 8080)
diff --git a/config.py b/config.py
index ebe4dd9..cfde2fb 100644
--- a/config.py
+++ b/config.py
@@ -34,10 +34,10 @@ DEFAULT_CONFIG = {
     },
     'scheduler': {
         'enabled': False,
-        'schedule_type': 'interval',  # 'interval', 'cron', or 'once'
+        'schedule_type': 'cron',  # 'interval', 'cron', or 'once'
         'interval_hours': 24,  # For interval type
-        'cron_expression': '0 8 * * *',  # For cron type (8 AM daily)
-        'timezone': 'America/New_York'
+        'cron_expression': '0 10 * * *',  # For cron type (10 AM EST/EDT daily)
+        'timezone': 'America/New_York'  # EST/EDT timezone
     },
     'api': {
         'enabled': False,
@@ -186,10 +186,14 @@ def _load_from_env(config: Dict) -> Dict:
     # Scheduler settings
     if os.getenv('SCHEDULER_ENABLED'):
         config['scheduler']['enabled'] = os.getenv('SCHEDULER_ENABLED').lower() == 'true'
+    if os.getenv('SCHEDULER_SCHEDULE_TYPE'):
+        config['scheduler']['schedule_type'] = os.getenv('SCHEDULER_SCHEDULE_TYPE')
     if os.getenv('SCHEDULER_INTERVAL_HOURS'):
         config['scheduler']['interval_hours'] = int(os.getenv('SCHEDULER_INTERVAL_HOURS'))
     if os.getenv('SCHEDULER_CRON'):
         config['scheduler']['cron_expression'] = os.getenv('SCHEDULER_CRON')
+    if os.getenv('SCHEDULER_TIMEZONE'):
+        config['scheduler']['timezone'] = os.getenv('SCHEDULER_TIMEZONE')
 
     # API settings
     if os.getenv('API_ENABLED'):
diff --git a/data_preprocessor.py b/data_preprocessor.py
index d800b9f..459b68a 100644
--- a/data_preprocessor.py
+++ b/data_preprocessor.py
@@ -679,6 +679,22 @@ def preprocess_excel_files(reports_dir: str = "reports", current_date: Optional[
     if not excel_files:
         return f"No Excel files found in '{reports_dir}' directory.", {}
 
+    # Log which files will be processed
+    import logging
+    logger = logging.getLogger(__name__)
+    logger.info(f"Processing {len(excel_files)} Excel file(s) from {reports_dir}:")
+    for excel_file in excel_files:
+        file_size = excel_file.stat().st_size
+        mtime = excel_file.stat().st_mtime
+        mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
+        logger.info(f"  - {excel_file.name} ({file_size} bytes, modified: {mtime_str})")
+
+    # WARNING: If multiple files found, this will combine data from all files
+    if len(excel_files) > 1:
+        logger.warning(f"WARNING: Found {len(excel_files)} Excel file(s). Report will combine data from ALL files!")
+        logger.warning("This may cause incorrect results. Only ONE file should exist in the reports directory.")
+        logger.warning(f"Files found: {[f.name for f in excel_files]}")
+
     # First pass: collect all items with raw vendor names
     all_raw_items = []
     for excel_file in excel_files:
diff --git a/scheduler.py b/scheduler.py
index af34a1b..85d257e 100644
--- a/scheduler.py
+++ b/scheduler.py
@@ -12,6 +12,7 @@ from pathlib import Path
 
 try:
     from apscheduler.schedulers.blocking import BlockingScheduler
+    from apscheduler.schedulers.background import BackgroundScheduler
     from apscheduler.triggers.interval import IntervalTrigger
     from apscheduler.triggers.cron import CronTrigger
     from apscheduler.triggers.date import DateTrigger
@@ -26,6 +27,52 @@
 from sharepoint_downloader import download_from_sharepoint
 
 logger = logging.getLogger(__name__)
 
 
+# Cleanup function (duplicated from api_server to avoid circular import)
+def cleanup_old_reports(output_dir: Path, reports_dir: Path, max_reports: int = 10):
+    """
+    Cleanup old reports and Excel files, keeping only the last max_reports.
+
+    Args:
+        output_dir: Directory containing report HTML/JSON files
+        reports_dir: Directory containing Excel files
+        max_reports: Maximum number of reports to keep
+    """
+    try:
+        # Get all report HTML files sorted by modification time (newest first)
+        html_files = sorted(output_dir.glob('report-*.html'), key=lambda p: p.stat().st_mtime, reverse=True)
+
+        if len(html_files) <= max_reports:
+            return  # No cleanup needed
+
+        # Get reports to delete (oldest ones)
+        reports_to_delete = html_files[max_reports:]
+
+        deleted_count = 0
+        for html_file in reports_to_delete:
+            report_id = html_file.stem
+
+            # Delete HTML file
+            try:
+                html_file.unlink()
+                logger.info(f"Deleted old report HTML: {html_file.name}")
+                deleted_count += 1
+            except Exception as e:
+                logger.warning(f"Failed to delete {html_file.name}: {e}")
+
+            # Delete corresponding JSON file
+            json_file = output_dir / f"{report_id}.json"
+            if json_file.exists():
+                try:
+                    json_file.unlink()
+                    logger.info(f"Deleted old report JSON: {json_file.name}")
+                except Exception as e:
+                    logger.warning(f"Failed to delete {json_file.name}: {e}")
+
+        if deleted_count > 0:
+            logger.info(f"Cleanup completed: deleted {deleted_count} old report(s)")
+    except Exception as e:
+        logger.error(f"Error during cleanup: {e}", exc_info=True)
+
+
 class ReportScheduler:
     """Manages scheduled report generation."""
 
@@ -44,7 +91,14 @@ class ReportScheduler:
         )
 
         self.config = load_config(config_path)
-        self.scheduler = BlockingScheduler(timezone=self.config['scheduler']['timezone'])
+        scheduler_timezone = self.config['scheduler'].get('timezone', 'America/New_York')
+        # Use BackgroundScheduler for thread compatibility (when run from API server)
+        # Use BlockingScheduler when run standalone
+        self.use_background = True  # Set to False if running standalone
+        if self.use_background:
+            self.scheduler = BackgroundScheduler(timezone=scheduler_timezone)
+        else:
+            self.scheduler = BlockingScheduler(timezone=scheduler_timezone)
         self.scheduler_config = self.config['scheduler']
         self.sharepoint_config = self.config.get('sharepoint', {})
         self.report_config = self.config.get('report', {})
@@ -78,10 +132,15 @@ class ReportScheduler:
             logger.error(f"Failed to download from SharePoint: {e}")
             # Continue with report generation even if download fails
 
-        # Generate report
+        # Generate report with timestamp
         logger.info("Generating report...")
         reports_dir = self.report_config.get('reports_dir', 'reports')
-        output_file = Path(self.report_config.get('output_dir', 'output')) / 'report.json'
+        output_dir = Path(self.report_config.get('output_dir', 'output'))
+
+        # Create timestamped filename (same format as API server)
+        timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
+        report_id = f"report-{timestamp}"
+        output_file = output_dir / f"{report_id}.json"
 
         report_data = generate_report(
             reports_dir=reports_dir,
@@ -91,6 +150,12 @@ class ReportScheduler:
 
         if report_data:
             logger.info("✓ Scheduled report generation completed successfully")
+
+            # Cleanup old reports (keep last 10)
+            try:
+                cleanup_old_reports(output_dir, Path(reports_dir), max_reports=10)
+            except Exception as e:
+                logger.warning(f"Failed to cleanup old reports: {e}")
         else:
             logger.error("✗ Scheduled report generation failed")
 
@@ -150,11 +215,18 @@ class ReportScheduler:
             replace_existing=True
         )
 
-        logger.info("Scheduler started. Press Ctrl+C to stop.")
-        try:
+        if self.use_background:
+            # BackgroundScheduler - just start it, don't block
             self.scheduler.start()
-        except KeyboardInterrupt:
-            logger.info("Scheduler stopped by user")
+            logger.info("Scheduler started in background mode")
+        else:
+            # BlockingScheduler - block until interrupted
+            logger.info("Scheduler started. Press Ctrl+C to stop.")
+            try:
+                self.scheduler.start()
+            except KeyboardInterrupt:
+                logger.info("Scheduler stopped by user")
+                self.scheduler.shutdown()
 
 
 if __name__ == "__main__":
@@ -168,5 +240,7 @@ if __name__ == "__main__":
 
     config_path = sys.argv[1] if len(sys.argv) > 1 else None
     scheduler = ReportScheduler(config_path=config_path)
+    scheduler.use_background = False  # Use BlockingScheduler for standalone mode
+    scheduler.scheduler = BlockingScheduler(timezone=scheduler.config['scheduler'].get('timezone', 'America/New_York'))
     scheduler.start()
diff --git a/sharepoint_downloader.py b/sharepoint_downloader.py
index 5ea939c..b9e3b5a 100644
--- a/sharepoint_downloader.py
+++ b/sharepoint_downloader.py
@@ -284,42 +284,92 @@ class SharePointDownloader:
 
         # ALWAYS clear ALL existing Excel files before downloading (to ensure only new files are used)
         # This is critical to prevent combining multiple files
+        # Wait a moment first to allow any previous file operations to complete
+        import time
+        time.sleep(1.0)  # Give file handles time to close
+
         existing_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls'))
         cleared_count = 0
         failed_to_clear = []
 
         for old_file in existing_files:
             try:
-                # On Windows, files might be locked - try multiple times
-                max_retries = 3
+                # On Windows, files might be locked - try multiple times with increasing delays
+                max_retries = 5
                 retry_count = 0
-                while retry_count < max_retries:
+                cleared_this_file = False
+
+                while retry_count < max_retries and not cleared_this_file:
                     try:
                         old_file.unlink()
                         cleared_count += 1
+                        cleared_this_file = True
                         logger.info(f"Cleared existing file before download: {old_file.name}")
                         break
-                    except PermissionError:
+                    except PermissionError as pe:
                         retry_count += 1
                         if retry_count < max_retries:
+                            # Exponential backoff capped at 3s: 1s, 2s, 3s, 3s
                             import time
-                            time.sleep(0.5)  # Wait 500ms before retry
+                            delay = min(0.5 * (2 ** retry_count), 3.0)
+                            logger.warning(f"File {old_file.name} is locked (attempt {retry_count}/{max_retries}), waiting {delay}s...")
+                            time.sleep(delay)
                         else:
+                            # Last attempt failed - try renaming instead of deleting
+                            logger.warning(f"Cannot delete {old_file.name}, trying to rename instead...")
+                            try:
+                                import time
+                                timestamp = int(time.time())
+                                backup_name = f"{old_file.stem}_backup_{timestamp}{old_file.suffix}"
+                                backup_path = old_file.parent / backup_name
+                                old_file.rename(backup_path)
+                                cleared_count += 1
+                                cleared_this_file = True
+                                logger.info(f"Renamed locked file to backup: {old_file.name} -> {backup_name}")
+                            except Exception as rename_error:
+                                logger.error(f"Could not rename file either: {rename_error}")
+                                raise pe  # Raise original PermissionError
+                    except Exception as e:
+                        if retry_count >= max_retries - 1:
+                            raise
+                        retry_count += 1
+                        import time
+                        time.sleep(1)
+
+                if not cleared_this_file:
+                    failed_to_clear.append(old_file.name)
+                    logger.error(f"Failed to clear existing file {old_file.name} after {max_retries} attempts")
             except Exception as e:
-                failed_to_clear.append(old_file.name)
+                if old_file.name not in failed_to_clear:
+                    failed_to_clear.append(old_file.name)
                 logger.error(f"Failed to clear existing file {old_file.name}: {e}")
 
         if failed_to_clear:
             logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}")
             logger.error("This will cause data mixing! Files may be locked by another process.")
-            # Don't fail here - let the download proceed, but log the warning
+            logger.error("ABORTING download to prevent combining multiple files.")
+            raise Exception(f"Cannot download from SharePoint: {len(failed_to_clear)} file(s) could not be cleared. Please close any programs that might have these files open: {failed_to_clear}")
 
         if cleared_count > 0:
             logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint")
         else:
             logger.info("No existing Excel files found to clear (reports directory was empty)")
 
+        # VERIFY: Double-check that all Excel files are actually gone
+        remaining_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls'))
+        if remaining_files:
+            logger.error(f"CRITICAL: After clearing, {len(remaining_files)} file(s) still exist: {[f.name for f in remaining_files]}")
+            logger.error("These files are likely locked. Attempting force removal...")
+            for remaining_file in remaining_files:
+                try:
+                    remaining_file.unlink()
+                    logger.info(f"Force-removed locked file: {remaining_file.name}")
+                except Exception as e:
+                    logger.error(f"CRITICAL: Cannot remove locked file {remaining_file.name}: {e}")
+                    raise Exception(f"Cannot proceed: File {remaining_file.name} is locked and cannot be deleted. Please close Excel or any other program using this file.")
+
+        logger.info("✓ Verified: All old Excel files cleared successfully")
+
         # List files in folder
         files = self.list_files_in_folder(folder_path, file_pattern)
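
Reviewer note (appended after the patch, not part of it): the delete-with-retry, then
rename-aside logic now exists in two nearly identical copies, in api_server.py's upload
handler and in sharepoint_downloader.py. If a follow-up wants to deduplicate it, a minimal
sketch of a shared helper is below; the name clear_or_rename and its module placement are
hypothetical, not something this patch defines.

import logging
import time
from pathlib import Path

logger = logging.getLogger(__name__)


def clear_or_rename(path: Path, max_retries: int = 5) -> bool:
    """Delete a file, backing off on Windows lock errors; rename it aside as a last resort.

    Returns True if the file was deleted or renamed out of the way, False otherwise.
    """
    for attempt in range(1, max_retries + 1):
        try:
            path.unlink()
            logger.info(f"Cleared file: {path.name}")
            return True
        except FileNotFoundError:
            return True  # Already gone - nothing to clear
        except PermissionError:
            if attempt < max_retries:
                # Exponential backoff capped at 3s: 1s, 2s, 3s, 3s
                delay = min(0.5 * (2 ** attempt), 3.0)
                logger.warning(f"{path.name} is locked (attempt {attempt}/{max_retries}), waiting {delay}s...")
                time.sleep(delay)
    # Every delete attempt failed; rename the file so a fresh download/upload can proceed.
    try:
        backup = path.with_name(f"{path.stem}_backup_{int(time.time())}{path.suffix}")
        path.rename(backup)
        logger.info(f"Renamed locked file to backup: {path.name} -> {backup.name}")
        return True
    except OSError as e:
        logger.error(f"Could not delete or rename {path.name}: {e}")
        return False

With a helper like this, both call sites would reduce to appending path.name to
failed_to_clear whenever clear_or_rename(path) returns False, while keeping the
locked-file error responses exactly as the patch defines them.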
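
Reviewer note (appended after the patch, not part of it): config.py now defaults the
scheduler to a five-field cron expression ('0 10 * * *') in America/New_York, and
scheduler.py gains a BackgroundScheduler mode. The standalone sketch below is one way to
sanity-check that APScheduler 3.x parses those defaults as intended; the job function is a
stand-in, and the job id 'report_generation' is an assumption, since the patch does not
show the real add_job call in full.

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger


def fake_report_job():
    print("a real deployment would generate the report here")


scheduler = BackgroundScheduler(timezone="America/New_York")
# Five-field crontab form: minute hour day month day-of-week -> 10:00 ET every day
trigger = CronTrigger.from_crontab("0 10 * * *", timezone="America/New_York")
scheduler.add_job(fake_report_job, trigger, id="report_generation", replace_existing=True)
scheduler.start()
# Should print the next 10:00 America/New_York occurrence
print(scheduler.get_job("report_generation").next_run_time)
scheduler.shutdown()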