Add local changes: scheduler integration, file locking fixes, config updates (preserving coworker's html_generator.py changes)

nika fartenadze 2025-11-09 15:12:55 +04:00
parent 51f618b654
commit dedd02274e
5 changed files with 365 additions and 59 deletions

View File

@@ -30,6 +30,7 @@ logger = logging.getLogger(__name__)
app = None
config = None
+scheduler_thread = None

def cleanup_old_reports(output_dir: Path, reports_dir: Path, max_reports: int = 10):
@@ -219,8 +220,38 @@ def create_app(config_path: Optional[str] = None):
                'sharepoint_error': True
            }), 500
        except Exception as e:
-           logger.error(f"Failed to download from SharePoint: {e}", exc_info=True)
-           # Check if we have existing files as fallback
+           error_msg = str(e)
+           logger.error(f"Failed to download from SharePoint: {error_msg}", exc_info=True)
+
+           # Check if this is a locked file error
+           is_locked_file_error = 'locked' in error_msg.lower() or 'cannot access the file' in error_msg.lower() or 'being used by another process' in error_msg.lower()
+
+           if is_locked_file_error:
+               # Extract filename from error if possible
+               locked_file_match = None
+               import re
+               # Try to find filename in error message
+               match = re.search(r"['\"]([^'\"]*\.xlsx?)['\"]", error_msg)
+               if match:
+                   locked_file_match = match.group(1)
+
+               locked_file_info = f" ({locked_file_match})" if locked_file_match else ""
+               return jsonify({
+                   'error': f'Cannot download from SharePoint: File is locked{locked_file_info}',
+                   'details': f'A file in the reports directory is being used by another program (likely Excel). Please close Excel and any other programs that might have this file open, then try again. Error: {error_msg}',
+                   'instructions': [
+                       '1. Close Microsoft Excel completely',
+                       '2. Close any file explorer windows showing the reports folder',
+                       '3. Wait a few seconds',
+                       '4. Try generating the report again',
+                       '',
+                       'Alternatively, use manual file upload instead of SharePoint download.'
+                   ],
+                   'sharepoint_error': True,
+                   'locked_file_error': True
+               }), 500
+
+           # Check if we have existing files as fallback (only for non-locked errors)
            reports_dir_path = Path(report_config.get('reports_dir', 'reports'))
            if not reports_dir_path.is_absolute():
                script_dir = Path(__file__).parent.absolute()
@@ -233,13 +264,13 @@ def create_app(config_path: Optional[str] = None):
                    downloaded_files = []  # Continue with existing files
                else:
                    return jsonify({
-                       'error': f'SharePoint download failed: {str(e)}',
+                       'error': f'SharePoint download failed: {error_msg}',
                        'details': 'No existing files found. Please use manual file upload or fix SharePoint permissions.',
                        'sharepoint_error': True
                    }), 500
            else:
                return jsonify({
-                   'error': f'SharePoint download failed: {str(e)}',
+                   'error': f'SharePoint download failed: {error_msg}',
                    'details': 'Reports directory does not exist. Please use manual file upload or fix SharePoint permissions.',
                    'sharepoint_error': True
                }), 500
@@ -286,59 +317,53 @@ def create_app(config_path: Optional[str] = None):
                        logger.error(f"Failed to clear unexpected file {file.name}: {e}")
        elif not download_from_sp:
            # Manual upload was used (download_from_sharepoint=False)
-           # Upload endpoint should have cleared old files, but double-check
-           # Only use files uploaded in the last 10 minutes to avoid combining with old files
+           # Upload endpoint should have cleared old files before saving new ones
+           # Use ALL files in the directory (they should all be from the recent upload)
            if reports_dir_path.exists():
                excel_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
                current_time = datetime.now().timestamp()
                recent_files = []
+               logger.info(f"Manual upload generation: Found {len(excel_files)} file(s) in reports directory")
+
+               # Only use files modified in the last 2 minutes (very recent = just uploaded)
+               # This ensures we don't accidentally use SharePoint-downloaded files
                for excel_file in excel_files:
                    mtime = excel_file.stat().st_mtime
-                   # Only use files modified in the last 10 minutes (should be the uploaded ones)
-                   # Increased from 5 to 10 minutes to account for upload + generation delay
-                   if current_time - mtime < 600:  # 10 minutes
+                   age_seconds = current_time - mtime
+                   # Only use files uploaded in the last 2 minutes (120 seconds)
+                   # This is tight enough to catch only the most recent upload
+                   if age_seconds < 120:  # 2 minutes
                        recent_files.append(excel_file)
                        mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
-                       logger.info(f"  - {excel_file.name} (modified: {mtime_str}) - will be used for manual upload generation")
+                       logger.info(f"  - {excel_file.name} (modified: {mtime_str}, age: {age_seconds:.1f}s) - will be used for manual upload generation")
                    else:
-                       logger.warning(f"  - {excel_file.name} (modified: {datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')}) - skipping (too old, might be from previous run)")
+                       logger.warning(f"  - {excel_file.name} (modified: {datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')}, age: {age_seconds:.1f}s) - skipping (too old, might be from SharePoint download)")
+
+               # Clear any files that are too old (likely from SharePoint)
                if len(recent_files) < len(excel_files):
-                   logger.warning(f"Found {len(excel_files)} total file(s), but only {len(recent_files)} are recent. Clearing old files to avoid combining...")
-                   # Clear old files to ensure we only use the manually uploaded ones
+                   logger.warning(f"Found {len(excel_files)} total file(s), but only {len(recent_files)} are recent (< 2 min old). Clearing old files...")
                    for excel_file in excel_files:
                        if excel_file not in recent_files:
                            try:
                                excel_file.unlink()
-                               logger.info(f"Cleared old file: {excel_file.name}")
+                               logger.info(f"Cleared old file (likely from SharePoint): {excel_file.name}")
                            except Exception as e:
                                logger.warning(f"Failed to clear old file {excel_file.name}: {e}")
+
                if len(recent_files) == 0:
-                   logger.error("Manual upload was used but no recent files found in reports directory!")
+                   logger.error("Manual upload was used but no recent files (< 2 min old) found in reports directory!")
                    logger.error("This might mean:")
                    logger.error("1. Files were not uploaded successfully")
-                   logger.error("2. Files were uploaded but cleared before generation")
+                   logger.error("2. Upload happened more than 2 minutes ago")
                    logger.error("3. File modification times are incorrect")
+                   logger.error("4. SharePoint download happened after upload")
                    return jsonify({
-                       'error': 'No files found for manual upload generation',
-                       'details': 'Files were uploaded but not found in reports directory. Please try uploading again.',
+                       'error': 'No recent files found for manual upload generation',
+                       'details': 'Files were uploaded but not found or are too old. Please try uploading again and generating immediately.',
                        'manual_upload_error': True
                    }), 400
-
-               # Verify we only have the recently uploaded files
-               all_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
-               if len(all_files) != len(recent_files):
-                   logger.warning(f"WARNING: Found {len(all_files)} file(s) but only {len(recent_files)} are recent!")
-                   logger.warning("Clearing old files to ensure only uploaded files are used...")
-                   for file in all_files:
-                       if file not in recent_files:
-                           try:
-                               file.unlink()
-                               logger.info(f"Cleared unexpected old file: {file.name}")
-                           except Exception as e:
-                               logger.error(f"Failed to clear unexpected file {file.name}: {e}")
+
                logger.info(f"Will generate report from {len(recent_files)} recently uploaded file(s)")
            else:
                logger.error("Manual upload was used but reports directory does not exist!")
@@ -358,6 +383,34 @@ def create_app(config_path: Optional[str] = None):
                'sharepoint_error': True
            }), 400

+       # FINAL VERIFICATION: Before generation, ensure only expected files exist
+       final_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls'))
+       if len(final_files) > 1:
+           logger.error(f"CRITICAL: Found {len(final_files)} Excel file(s) before generation!")
+           logger.error("This will cause data mixing. Files found:")
+           for f in final_files:
+               mtime = f.stat().st_mtime
+               mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
+               logger.error(f"  - {f.name} (modified: {mtime_str})")
+
+           logger.error("Attempting to keep only the most recent file...")
+           # Keep only the newest file
+           final_files_sorted = sorted(final_files, key=lambda f: f.stat().st_mtime, reverse=True)
+           newest_file = final_files_sorted[0]
+           for old_file in final_files_sorted[1:]:
+               try:
+                   old_file.unlink()
+                   logger.info(f"Removed older file before generation: {old_file.name}")
+               except Exception as e:
+                   logger.error(f"Failed to remove {old_file.name}: {e}")
+                   return jsonify({
+                       'error': f'Multiple Excel files found and cannot remove old ones',
+                       'details': f'Found {len(final_files)} files. Please ensure only one file exists. Files may be locked.',
+                       'files_found': [f.name for f in final_files]
+                   }), 400
+
+           logger.warning(f"Proceeding with only the newest file: {newest_file.name}")

        report_data = generate_report(
            reports_dir=str(reports_dir_path),
            output_file=output_file,
@@ -400,12 +453,16 @@ def create_app(config_path: Optional[str] = None):
    @app.route('/api/upload', methods=['POST'])
    def upload_files():
        """Upload Excel files manually. Clears old files before uploading new ones."""
+       logger.info("=== MANUAL UPLOAD REQUEST RECEIVED ===")
        try:
            if 'files' not in request.files:
+               logger.error("Upload request missing 'files' field")
                return jsonify({'error': 'No files provided'}), 400

            files = request.files.getlist('files')
+           logger.info(f"Received {len(files)} file(s) in upload request")
            if not files or all(f.filename == '' for f in files):
+               logger.error("No valid files in upload request")
                return jsonify({'error': 'No files selected'}), 400

            report_config = app.config['REPORT_CONFIG']
@@ -426,33 +483,70 @@ def create_app(config_path: Optional[str] = None):
            for old_file in old_excel_files:
                try:
-                   # On Windows, files might be locked - try multiple times
-                   max_retries = 3
+                   # On Windows, files might be locked - try multiple times with increasing delays
+                   max_retries = 5
                    retry_count = 0
-                   while retry_count < max_retries:
+                   cleared_this_file = False
+                   while retry_count < max_retries and not cleared_this_file:
                        try:
                            old_file.unlink()
                            cleared_count += 1
+                           cleared_this_file = True
                            logger.info(f"Cleared old file before upload: {old_file.name}")
                            break
-                       except PermissionError:
+                       except PermissionError as pe:
                            retry_count += 1
                            if retry_count < max_retries:
+                               # Increasing delay: 0.5s, 1s, 2s, 3s
                                import time
-                               time.sleep(0.5)  # Wait 500ms before retry
+                               delay = min(0.5 * (2 ** retry_count), 3.0)
+                               logger.warning(f"File {old_file.name} is locked (attempt {retry_count}/{max_retries}), waiting {delay}s...")
+                               time.sleep(delay)
                            else:
-                               raise
+                               # Last attempt failed - try renaming instead of deleting
+                               logger.warning(f"Cannot delete {old_file.name}, trying to rename instead...")
+                               try:
+                                   import time
+                                   timestamp = int(time.time())
+                                   backup_name = f"{old_file.stem}_backup_{timestamp}{old_file.suffix}"
+                                   backup_path = old_file.parent / backup_name
+                                   old_file.rename(backup_path)
+                                   cleared_count += 1
+                                   cleared_this_file = True
+                                   logger.info(f"Renamed locked file to backup: {old_file.name} -> {backup_name}")
+                               except Exception as rename_error:
+                                   logger.error(f"Could not rename file either: {rename_error}")
+                                   raise pe  # Raise original PermissionError
                        except Exception as e:
+                           if retry_count >= max_retries - 1:
+                               raise
+                           retry_count += 1
+                           import time
+                           time.sleep(1)
+
+                   if not cleared_this_file:
+                       failed_to_clear.append(old_file.name)
+                       logger.error(f"Failed to clear old file {old_file.name} after {max_retries} attempts")
                except Exception as e:
+                   if old_file.name not in failed_to_clear:
                        failed_to_clear.append(old_file.name)
                    logger.error(f"Failed to clear old file {old_file.name}: {e}")

            # If any files failed to clear, fail the upload to prevent mixing old and new data
            if failed_to_clear:
                logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before upload: {failed_to_clear}")
+               locked_files_list = ', '.join(failed_to_clear)
                return jsonify({
-                   'error': f'Failed to clear {len(failed_to_clear)} old file(s) before upload. Please ensure files are not locked or in use.',
+                   'error': f'Cannot upload: {len(failed_to_clear)} file(s) are locked',
                    'failed_files': failed_to_clear,
-                   'details': 'Old files must be cleared before upload to ensure report generation uses only the new file(s). Files may be locked by Excel or another process.'
+                   'details': f'File(s) {locked_files_list} are being used by another program (likely Excel). Please close Excel and any other programs that might have these files open, then try again.',
+                   'instructions': [
+                       '1. Close Microsoft Excel completely',
+                       '2. Close any file explorer windows showing these files',
+                       '3. Wait a few seconds',
+                       '4. Try uploading again'
+                   ]
                }), 500

            if cleared_count > 0:
@@ -460,6 +554,30 @@ def create_app(config_path: Optional[str] = None):
            else:
                logger.info("No old Excel files found to clear (reports directory was empty)")

+           # VERIFY: Double-check that all Excel files are actually gone
+           remaining_files = list(reports_dir.glob('*.xlsx')) + list(reports_dir.glob('*.xls'))
+           if remaining_files:
+               logger.error(f"CRITICAL: After clearing, {len(remaining_files)} file(s) still exist: {[f.name for f in remaining_files]}")
+               logger.error("These files are likely locked. Attempting force removal...")
+               force_failed = []
+               for remaining_file in remaining_files:
+                   try:
+                       remaining_file.unlink()
+                       logger.info(f"Force-removed locked file: {remaining_file.name}")
+                   except Exception as e:
+                       force_failed.append(remaining_file.name)
+                       logger.error(f"CRITICAL: Cannot remove locked file {remaining_file.name}: {e}")
+
+               if force_failed:
+                   logger.error(f"CRITICAL: {len(force_failed)} file(s) still locked after force removal: {force_failed}")
+                   return jsonify({
+                       'error': f'Cannot upload: {len(force_failed)} file(s) are locked and cannot be deleted',
+                       'failed_files': force_failed,
+                       'details': 'Please close Excel or any other program using these files, then try again.'
+                   }), 500
+
+           logger.info("✓ Verified: All old Excel files cleared successfully before upload")

            uploaded_count = 0
            uploaded_files = []
@ -475,10 +593,20 @@ def create_app(config_path: Optional[str] = None):
# Save file to reports directory # Save file to reports directory
file_path = reports_dir / filename file_path = reports_dir / filename
logger.info(f"Saving uploaded file: {filename} -> {file_path}")
file.save(str(file_path)) file.save(str(file_path))
# Verify file was saved and get its modification time
if file_path.exists():
mtime = file_path.stat().st_mtime
mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
file_size = file_path.stat().st_size
logger.info(f"Successfully saved file: {filename} (size: {file_size} bytes, modified: {mtime_str})")
uploaded_count += 1 uploaded_count += 1
uploaded_files.append(filename) uploaded_files.append(filename)
logger.info(f"Uploaded file: {filename} -> {file_path}") else:
logger.error(f"CRITICAL: File was not saved! {file_path} does not exist after save()")
raise Exception(f"Failed to save file {filename}")
if uploaded_count == 0: if uploaded_count == 0:
return jsonify({'error': 'No valid Excel files uploaded'}), 400 return jsonify({'error': 'No valid Excel files uploaded'}), 400
@@ -664,10 +792,44 @@ def create_app(config_path: Optional[str] = None):
    return app


+def start_scheduler(config_path: Optional[str] = None):
+    """Start the scheduler in a background thread."""
+    global scheduler_thread
+
+    scheduler_config = config.get('scheduler', {})
+    if not scheduler_config.get('enabled'):
+        logger.info("Scheduler is disabled in configuration")
+        return
+
+    try:
+        from scheduler import ReportScheduler
+        import threading
+
+        def run_scheduler():
+            try:
+                scheduler = ReportScheduler(config_path=config_path)
+                scheduler.start()
+            except Exception as e:
+                logger.error(f"Scheduler error: {e}", exc_info=True)
+
+        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
+        scheduler_thread.start()
+        logger.info("Scheduler started in background thread")
+    except ImportError:
+        logger.warning("Scheduler module not available. Install apscheduler to enable scheduling.")
+    except Exception as e:
+        logger.error(f"Failed to start scheduler: {e}", exc_info=True)
+
+
def run_server(config_path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None):
    """Run the API server."""
+   global app, config
    app = create_app(config_path)
+
+   # Start scheduler if enabled
+   start_scheduler(config_path)

    api_config = config.get('api', {})
    server_host = host or api_config.get('host', '0.0.0.0')
    server_port = port or api_config.get('port', 8080)
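
For reference, a minimal client-side sketch of the manual-upload flow above (illustrative only, not part of this commit). The /api/upload route, its 'files' form field, and the default port 8080 come from the diff; the host and filename below are placeholders:

import requests  # third-party HTTP client, used only for this illustration

# Placeholder host/port; the API defaults to host 0.0.0.0 and port 8080 per api_config above.
url = "http://localhost:8080/api/upload"

# The endpoint reads request.files.getlist('files'), so the multipart field must be named 'files'.
with open("example_report.xlsx", "rb") as fh:
    resp = requests.post(url, files={"files": ("example_report.xlsx", fh)})

print(resp.status_code, resp.json())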

View File

@@ -34,10 +34,10 @@ DEFAULT_CONFIG = {
    },
    'scheduler': {
        'enabled': False,
-       'schedule_type': 'interval',  # 'interval', 'cron', or 'once'
+       'schedule_type': 'cron',  # 'interval', 'cron', or 'once'
        'interval_hours': 24,  # For interval type
-       'cron_expression': '0 8 * * *',  # For cron type (8 AM daily)
-       'timezone': 'America/New_York'
+       'cron_expression': '0 10 * * *',  # For cron type (10 AM EST/EDT daily)
+       'timezone': 'America/New_York'  # EST/EDT timezone
    },
    'api': {
        'enabled': False,
@@ -186,10 +186,14 @@ def _load_from_env(config: Dict) -> Dict:
    # Scheduler settings
    if os.getenv('SCHEDULER_ENABLED'):
        config['scheduler']['enabled'] = os.getenv('SCHEDULER_ENABLED').lower() == 'true'
+   if os.getenv('SCHEDULER_SCHEDULE_TYPE'):
+       config['scheduler']['schedule_type'] = os.getenv('SCHEDULER_SCHEDULE_TYPE')
    if os.getenv('SCHEDULER_INTERVAL_HOURS'):
        config['scheduler']['interval_hours'] = int(os.getenv('SCHEDULER_INTERVAL_HOURS'))
    if os.getenv('SCHEDULER_CRON'):
        config['scheduler']['cron_expression'] = os.getenv('SCHEDULER_CRON')
+   if os.getenv('SCHEDULER_TIMEZONE'):
+       config['scheduler']['timezone'] = os.getenv('SCHEDULER_TIMEZONE')

    # API settings
    if os.getenv('API_ENABLED'):
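
For reference, a minimal sketch of how the scheduler overrides above could be driven from the environment (illustrative only, not part of this commit; the import path of load_config is an assumption, the variable names come from the diff):

import os

# Assumption: load_config lives in the same config module that defines
# DEFAULT_CONFIG and _load_from_env; adjust the import if the module name differs.
from config import load_config

os.environ['SCHEDULER_ENABLED'] = 'true'
os.environ['SCHEDULER_SCHEDULE_TYPE'] = 'cron'           # new override added in this commit
os.environ['SCHEDULER_CRON'] = '0 10 * * *'              # 10:00 daily
os.environ['SCHEDULER_TIMEZONE'] = 'America/New_York'    # new override added in this commit

cfg = load_config(None)  # None = defaults plus environment, as the scheduler passes it
print(cfg['scheduler']['schedule_type'], cfg['scheduler']['cron_expression'], cfg['scheduler']['timezone'])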

View File

@@ -679,6 +679,22 @@ def preprocess_excel_files(reports_dir: str = "reports", current_date: Optional[
    if not excel_files:
        return f"No Excel files found in '{reports_dir}' directory.", {}

+   # Log which files will be processed
+   import logging
+   logger = logging.getLogger(__name__)
+   logger.info(f"Processing {len(excel_files)} Excel file(s) from {reports_dir}:")
+   for excel_file in excel_files:
+       file_size = excel_file.stat().st_size
+       mtime = excel_file.stat().st_mtime
+       mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')
+       logger.info(f"  - {excel_file.name} ({file_size} bytes, modified: {mtime_str})")
+
+   # WARNING: If multiple files found, this will combine data from all files
+   if len(excel_files) > 1:
+       logger.warning(f"WARNING: Found {len(excel_files)} Excel file(s). Report will combine data from ALL files!")
+       logger.warning("This may cause incorrect results. Only ONE file should exist in the reports directory.")
+       logger.warning(f"Files found: {[f.name for f in excel_files]}")
+
    # First pass: collect all items with raw vendor names
    all_raw_items = []
    for excel_file in excel_files:

View File

@@ -12,6 +12,7 @@ from pathlib import Path
try:
    from apscheduler.schedulers.blocking import BlockingScheduler
+   from apscheduler.schedulers.background import BackgroundScheduler
    from apscheduler.triggers.interval import IntervalTrigger
    from apscheduler.triggers.cron import CronTrigger
    from apscheduler.triggers.date import DateTrigger
@@ -26,6 +27,52 @@ from sharepoint_downloader import download_from_sharepoint
logger = logging.getLogger(__name__)

+
+# Cleanup function (duplicated from api_server to avoid circular import)
+def cleanup_old_reports(output_dir: Path, reports_dir: Path, max_reports: int = 10):
+    """
+    Cleanup old reports and Excel files, keeping only the last max_reports.
+
+    Args:
+        output_dir: Directory containing report HTML/JSON files
+        reports_dir: Directory containing Excel files
+        max_reports: Maximum number of reports to keep
+    """
+    try:
+        # Get all report HTML files sorted by modification time (newest first)
+        html_files = sorted(output_dir.glob('report-*.html'), key=lambda p: p.stat().st_mtime, reverse=True)
+
+        if len(html_files) <= max_reports:
+            return  # No cleanup needed
+
+        # Get reports to delete (oldest ones)
+        reports_to_delete = html_files[max_reports:]
+        deleted_count = 0
+
+        for html_file in reports_to_delete:
+            report_id = html_file.stem
+
+            # Delete HTML file
+            try:
+                html_file.unlink()
+                logger.info(f"Deleted old report HTML: {html_file.name}")
+                deleted_count += 1
+            except Exception as e:
+                logger.warning(f"Failed to delete {html_file.name}: {e}")
+
+            # Delete corresponding JSON file
+            json_file = output_dir / f"{report_id}.json"
+            if json_file.exists():
+                try:
+                    json_file.unlink()
+                    logger.info(f"Deleted old report JSON: {json_file.name}")
+                except Exception as e:
+                    logger.warning(f"Failed to delete {json_file.name}: {e}")
+
+        if deleted_count > 0:
+            logger.info(f"Cleanup completed: deleted {deleted_count} old report(s)")
+    except Exception as e:
+        logger.error(f"Error during cleanup: {e}", exc_info=True)
+
+
class ReportScheduler:
    """Manages scheduled report generation."""
@@ -44,7 +91,14 @@ class ReportScheduler:
        )
        self.config = load_config(config_path)
-       self.scheduler = BlockingScheduler(timezone=self.config['scheduler']['timezone'])
+       scheduler_timezone = self.config['scheduler'].get('timezone', 'America/New_York')
+       # Use BackgroundScheduler for thread compatibility (when run from API server)
+       # Use BlockingScheduler when run standalone
+       self.use_background = True  # Set to False if running standalone
+       if self.use_background:
+           self.scheduler = BackgroundScheduler(timezone=scheduler_timezone)
+       else:
+           self.scheduler = BlockingScheduler(timezone=scheduler_timezone)
        self.scheduler_config = self.config['scheduler']
        self.sharepoint_config = self.config.get('sharepoint', {})
        self.report_config = self.config.get('report', {})
@@ -78,10 +132,15 @@ class ReportScheduler:
                logger.error(f"Failed to download from SharePoint: {e}")
                # Continue with report generation even if download fails

-       # Generate report
+       # Generate report with timestamp
        logger.info("Generating report...")
        reports_dir = self.report_config.get('reports_dir', 'reports')
-       output_file = Path(self.report_config.get('output_dir', 'output')) / 'report.json'
+       output_dir = Path(self.report_config.get('output_dir', 'output'))
+
+       # Create timestamped filename (same format as API server)
+       timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
+       report_id = f"report-{timestamp}"
+       output_file = output_dir / f"{report_id}.json"

        report_data = generate_report(
            reports_dir=reports_dir,
@@ -91,6 +150,12 @@ class ReportScheduler:
        if report_data:
            logger.info("✓ Scheduled report generation completed successfully")
+
+           # Cleanup old reports (keep last 10)
+           try:
+               cleanup_old_reports(output_dir, Path(reports_dir), max_reports=10)
+           except Exception as e:
+               logger.warning(f"Failed to cleanup old reports: {e}")
        else:
            logger.error("✗ Scheduled report generation failed")
@@ -150,11 +215,18 @@ class ReportScheduler:
            replace_existing=True
        )

+       if self.use_background:
+           # BackgroundScheduler - just start it, don't block
+           self.scheduler.start()
+           logger.info("Scheduler started in background mode")
+       else:
+           # BlockingScheduler - block until interrupted
            logger.info("Scheduler started. Press Ctrl+C to stop.")
            try:
                self.scheduler.start()
            except KeyboardInterrupt:
                logger.info("Scheduler stopped by user")
+               self.scheduler.shutdown()


if __name__ == "__main__":
@@ -168,5 +240,7 @@ if __name__ == "__main__":
    config_path = sys.argv[1] if len(sys.argv) > 1 else None

    scheduler = ReportScheduler(config_path=config_path)
+   scheduler.use_background = False  # Use BlockingScheduler for standalone mode
+   scheduler.scheduler = BlockingScheduler(timezone=scheduler.config['scheduler'].get('timezone', 'America/New_York'))
    scheduler.start()
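
For reference, a sketch of the two intended ways to run ReportScheduler after this change (illustrative only, not part of this commit; the use_background flag and the BlockingScheduler swap mirror the __main__ block above, and config_path=None stands in for a real config path):

from apscheduler.schedulers.blocking import BlockingScheduler

from scheduler import ReportScheduler

# Option 1 - embedded (the default): BackgroundScheduler, start() returns immediately,
# which is what start_scheduler() in the API server relies on.
embedded = ReportScheduler(config_path=None)
embedded.start()

# Option 2 - standalone: swap in a BlockingScheduler first, so start() blocks until Ctrl+C.
standalone = ReportScheduler(config_path=None)
standalone.use_background = False
standalone.scheduler = BlockingScheduler(
    timezone=standalone.config['scheduler'].get('timezone', 'America/New_York')
)
standalone.start()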

View File

@@ -284,42 +284,92 @@ class SharePointDownloader:
        # ALWAYS clear ALL existing Excel files before downloading (to ensure only new files are used)
        # This is critical to prevent combining multiple files
+       # Wait a moment first to allow any previous file operations to complete
+       import time
+       time.sleep(1.0)  # Give file handles time to close
+
        existing_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls'))
        cleared_count = 0
        failed_to_clear = []

        for old_file in existing_files:
            try:
-               # On Windows, files might be locked - try multiple times
-               max_retries = 3
+               # On Windows, files might be locked - try multiple times with increasing delays
+               max_retries = 5
                retry_count = 0
-               while retry_count < max_retries:
+               cleared_this_file = False
+               while retry_count < max_retries and not cleared_this_file:
                    try:
                        old_file.unlink()
                        cleared_count += 1
+                       cleared_this_file = True
                        logger.info(f"Cleared existing file before download: {old_file.name}")
                        break
-                   except PermissionError:
+                   except PermissionError as pe:
                        retry_count += 1
                        if retry_count < max_retries:
+                           # Increasing delay: 0.5s, 1s, 2s, 3s
                            import time
-                           time.sleep(0.5)  # Wait 500ms before retry
+                           delay = min(0.5 * (2 ** retry_count), 3.0)
+                           logger.warning(f"File {old_file.name} is locked (attempt {retry_count}/{max_retries}), waiting {delay}s...")
+                           time.sleep(delay)
                        else:
-                           raise
+                           # Last attempt failed - try renaming instead of deleting
+                           logger.warning(f"Cannot delete {old_file.name}, trying to rename instead...")
+                           try:
+                               import time
+                               timestamp = int(time.time())
+                               backup_name = f"{old_file.stem}_backup_{timestamp}{old_file.suffix}"
+                               backup_path = old_file.parent / backup_name
+                               old_file.rename(backup_path)
+                               cleared_count += 1
+                               cleared_this_file = True
+                               logger.info(f"Renamed locked file to backup: {old_file.name} -> {backup_name}")
+                           except Exception as rename_error:
+                               logger.error(f"Could not rename file either: {rename_error}")
+                               raise pe  # Raise original PermissionError
                    except Exception as e:
+                       if retry_count >= max_retries - 1:
+                           raise
+                       retry_count += 1
+                       import time
+                       time.sleep(1)
+
+               if not cleared_this_file:
+                   failed_to_clear.append(old_file.name)
+                   logger.error(f"Failed to clear existing file {old_file.name} after {max_retries} attempts")
            except Exception as e:
+               if old_file.name not in failed_to_clear:
                    failed_to_clear.append(old_file.name)
                logger.error(f"Failed to clear existing file {old_file.name}: {e}")

        if failed_to_clear:
            logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}")
            logger.error("This will cause data mixing! Files may be locked by another process.")
-           # Don't fail here - let the download proceed, but log the warning
+           logger.error("ABORTING download to prevent combining multiple files.")
+           raise Exception(f"Cannot download from SharePoint: {len(failed_to_clear)} file(s) could not be cleared. Please close any programs that might have these files open: {failed_to_clear}")

        if cleared_count > 0:
            logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint")
        else:
            logger.info("No existing Excel files found to clear (reports directory was empty)")

+       # VERIFY: Double-check that all Excel files are actually gone
+       remaining_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls'))
+       if remaining_files:
+           logger.error(f"CRITICAL: After clearing, {len(remaining_files)} file(s) still exist: {[f.name for f in remaining_files]}")
+           logger.error("These files are likely locked. Attempting force removal...")
+           for remaining_file in remaining_files:
+               try:
+                   remaining_file.unlink()
+                   logger.info(f"Force-removed locked file: {remaining_file.name}")
+               except Exception as e:
+                   logger.error(f"CRITICAL: Cannot remove locked file {remaining_file.name}: {e}")
+                   raise Exception(f"Cannot proceed: File {remaining_file.name} is locked and cannot be deleted. Please close Excel or any other program using this file.")
+
+       logger.info("✓ Verified: All old Excel files cleared successfully")

        # List files in folder
        files = self.list_files_in_folder(folder_path, file_pattern)
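
For reference, the locked-file handling now duplicated in the upload endpoint and in this downloader reduces to the pattern below - a self-contained sketch, not the project's actual helper (the function name and signature are illustrative):

import time
from pathlib import Path


def remove_or_rename(path: Path, max_retries: int = 5) -> bool:
    """Delete a file, backing off on PermissionError and renaming it aside as a last resort."""
    for attempt in range(1, max_retries + 1):
        try:
            path.unlink()
            return True
        except PermissionError:
            if attempt < max_retries:
                # Exponential backoff capped at 3 seconds: 1.0s, 2.0s, 3.0s, 3.0s, ...
                time.sleep(min(0.5 * (2 ** attempt), 3.0))
            else:
                # Locked on the final attempt: move it out of the way instead of deleting.
                backup = path.with_name(f"{path.stem}_backup_{int(time.time())}{path.suffix}")
                try:
                    path.rename(backup)
                    return True
                except OSError:
                    return False
    return False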