#!/usr/bin/env python3 """ Web API Server for On-Demand Report Generation Provides REST API endpoints to trigger report generation on demand. """ import logging from pathlib import Path from typing import Optional, List, Dict import json from datetime import datetime import shutil import os try: from flask import Flask, jsonify, request, send_from_directory from flask_cors import CORS from werkzeug.utils import secure_filename FLASK_AVAILABLE = True except ImportError: FLASK_AVAILABLE = False logging.warning("Flask not installed. API server features disabled.") from config import load_config from report_generator import generate_report from sharepoint_downloader import download_from_sharepoint logger = logging.getLogger(__name__) app = None config = None def cleanup_old_reports(output_dir: Path, reports_dir: Path, max_reports: int = 10): """ Cleanup old reports and Excel files, keeping only the last max_reports. Args: output_dir: Directory containing report HTML/JSON files reports_dir: Directory containing Excel files max_reports: Maximum number of reports to keep """ try: # Get all report HTML files sorted by modification time (newest first) html_files = sorted(output_dir.glob('report-*.html'), key=lambda p: p.stat().st_mtime, reverse=True) if len(html_files) <= max_reports: return # No cleanup needed # Get reports to delete (oldest ones) reports_to_delete = html_files[max_reports:] deleted_count = 0 for html_file in reports_to_delete: report_id = html_file.stem # Delete HTML file try: html_file.unlink() logger.info(f"Deleted old report HTML: {html_file.name}") deleted_count += 1 except Exception as e: logger.warning(f"Failed to delete {html_file.name}: {e}") # Delete corresponding JSON file json_file = output_dir / f"{report_id}.json" if json_file.exists(): try: json_file.unlink() logger.info(f"Deleted old report JSON: {json_file.name}") except Exception as e: logger.warning(f"Failed to delete {json_file.name}: {e}") # Cleanup Excel files - keep only files associated with remaining reports if reports_dir.exists(): excel_files = list(reports_dir.glob('*.xlsx')) + list(reports_dir.glob('*.xls')) if len(excel_files) > max_reports: # Sort by modification time and delete oldest excel_files_sorted = sorted(excel_files, key=lambda p: p.stat().st_mtime, reverse=True) excel_to_delete = excel_files_sorted[max_reports:] for excel_file in excel_to_delete: try: excel_file.unlink() logger.info(f"Deleted old Excel file: {excel_file.name}") except Exception as e: logger.warning(f"Failed to delete {excel_file.name}: {e}") logger.info(f"Cleanup completed: deleted {deleted_count} old report(s)") except Exception as e: logger.error(f"Error during cleanup: {e}", exc_info=True) def create_app(config_path: Optional[str] = None): """Create and configure Flask app.""" global app, config if not FLASK_AVAILABLE: raise ImportError( "Flask is required for API server. " "Install it with: pip install flask flask-cors" ) app = Flask(__name__) CORS(app) # Enable CORS for all routes config = load_config(config_path) api_config = config.get('api', {}) sharepoint_config = config.get('sharepoint', {}) report_config = config.get('report', {}) # Resolve paths relative to script location, not current working directory script_dir = Path(__file__).parent.absolute() # Convert relative paths to absolute paths relative to script directory if 'output_dir' in report_config and report_config['output_dir']: output_dir = Path(report_config['output_dir']) if not output_dir.is_absolute(): report_config['output_dir'] = str(script_dir / output_dir) if 'reports_dir' in report_config and report_config['reports_dir']: reports_dir = Path(report_config['reports_dir']) if not reports_dir.is_absolute(): report_config['reports_dir'] = str(script_dir / reports_dir) # Store config in app context app.config['API_KEY'] = api_config.get('api_key') app.config['SHAREPOINT_CONFIG'] = sharepoint_config app.config['REPORT_CONFIG'] = report_config @app.route('/health', methods=['GET']) def health(): """Health check endpoint.""" return jsonify({ 'status': 'healthy', 'service': 'vendor-report-generator' }) @app.route('/api/generate', methods=['POST']) def generate_report_endpoint(): """ Generate report on demand. Request body (optional): { "download_from_sharepoint": true, "reports_dir": "reports", "output_file": "output/report.json" } """ # Check API key if configured api_key = app.config.get('API_KEY') if api_key: provided_key = request.headers.get('X-API-Key') or request.json.get('api_key') if request.json else None if provided_key != api_key: return jsonify({'error': 'Invalid API key'}), 401 try: request_data = request.json or {} download_from_sp = request_data.get('download_from_sharepoint', True) # Default to True for backward compatibility downloaded_files = [] # Initialize here for scope # Get report config early - needed for error handling report_config = app.config['REPORT_CONFIG'] # Download from SharePoint if requested AND no manual upload happened # If download_from_sharepoint is False, it means manual upload was used if download_from_sp: sp_config = app.config['SHAREPOINT_CONFIG'] if not sp_config.get('enabled'): return jsonify({ 'error': 'SharePoint is not enabled in configuration' }), 400 logger.info("Downloading files from SharePoint...") try: downloaded = download_from_sharepoint( site_url=sp_config['site_url'], folder_path=sp_config.get('folder_path'), file_path=sp_config.get('file_path'), local_dir=sp_config.get('local_dir', 'reports'), tenant_id=sp_config.get('tenant_id'), client_id=sp_config.get('client_id'), client_secret=sp_config.get('client_secret'), use_app_authentication=sp_config.get('use_app_authentication', True), file_pattern=sp_config.get('file_pattern'), overwrite=sp_config.get('overwrite', True) ) downloaded_files = downloaded if downloaded else [] logger.info(f"Downloaded {len(downloaded_files)} file(s) from SharePoint: {downloaded_files}") # If SharePoint download failed (no files downloaded), check if we have existing files if len(downloaded_files) == 0: logger.warning("SharePoint download returned 0 files. This could mean:") logger.warning("1. SharePoint permissions issue (401/403 error)") logger.warning("2. No files found in the specified folder") logger.warning("3. Site access not granted (Resource-Specific Consent needed)") logger.warning("Checking if existing files are available in reports directory...") # Check if there are existing files we can use reports_dir_path = Path(report_config.get('reports_dir', 'reports')) if not reports_dir_path.is_absolute(): script_dir = Path(__file__).parent.absolute() reports_dir_path = script_dir / reports_dir_path if reports_dir_path.exists(): existing_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls')) if existing_files: logger.warning(f"Found {len(existing_files)} existing file(s) in reports directory. Will use these instead.") logger.warning("NOTE: These may be old files. Consider using manual upload for fresh data.") else: logger.error("No files available - neither from SharePoint nor existing files.") return jsonify({ 'error': 'SharePoint download failed and no existing files found', 'details': 'SharePoint access may require Resource-Specific Consent (RSC). Please use manual file upload or fix SharePoint permissions.', 'sharepoint_error': True }), 500 except Exception as e: logger.error(f"Failed to download from SharePoint: {e}", exc_info=True) # Check if we have existing files as fallback reports_dir_path = Path(report_config.get('reports_dir', 'reports')) if not reports_dir_path.is_absolute(): script_dir = Path(__file__).parent.absolute() reports_dir_path = script_dir / reports_dir_path if reports_dir_path.exists(): existing_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls')) if existing_files: logger.warning(f"SharePoint download failed, but found {len(existing_files)} existing file(s). Will use these.") downloaded_files = [] # Continue with existing files else: return jsonify({ 'error': f'SharePoint download failed: {str(e)}', 'details': 'No existing files found. Please use manual file upload or fix SharePoint permissions.', 'sharepoint_error': True }), 500 else: return jsonify({ 'error': f'SharePoint download failed: {str(e)}', 'details': 'Reports directory does not exist. Please use manual file upload or fix SharePoint permissions.', 'sharepoint_error': True }), 500 # Generate report with timestamp reports_dir = request_data.get('reports_dir', report_config.get('reports_dir', 'reports')) output_dir_str = report_config.get('output_dir', 'output') output_dir = Path(output_dir_str) if not output_dir.is_absolute(): script_dir = Path(__file__).parent.absolute() output_dir = script_dir / output_dir # Create timestamped filename timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') report_id = f"report-{timestamp}" output_file = str(output_dir / f"{report_id}.json") # Log which files will be used for generation reports_dir_path = Path(reports_dir) if not reports_dir_path.is_absolute(): script_dir = Path(__file__).parent.absolute() reports_dir_path = script_dir / reports_dir_path logger.info(f"Generating report from {reports_dir_path.absolute()}...") logger.info(f"Reports directory exists: {reports_dir_path.exists()}") # Determine which files to use for generation # CRITICAL: Only use files that were just downloaded/uploaded, not old ones if downloaded_files: # Files were downloaded from SharePoint - use only those logger.info(f"Using {len(downloaded_files)} file(s) downloaded from SharePoint") # Verify that reports_dir only contains the downloaded files (should be empty of old files) all_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls')) downloaded_file_paths = [Path(f).name for f in downloaded_files] # Get just filenames if len(all_files) != len(downloaded_files): logger.warning(f"WARNING: Found {len(all_files)} file(s) in reports_dir but only {len(downloaded_files)} were downloaded!") logger.warning("This might indicate old files weren't cleared. Clearing now...") for file in all_files: if file.name not in downloaded_file_paths: try: file.unlink() logger.info(f"Cleared unexpected file: {file.name}") except Exception as e: logger.error(f"Failed to clear unexpected file {file.name}: {e}") elif not download_from_sp: # Manual upload was used (download_from_sharepoint=False) # Upload endpoint should have cleared old files, but double-check # Only use files uploaded in the last 10 minutes to avoid combining with old files if reports_dir_path.exists(): excel_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls')) current_time = datetime.now().timestamp() recent_files = [] for excel_file in excel_files: mtime = excel_file.stat().st_mtime # Only use files modified in the last 10 minutes (should be the uploaded ones) # Increased from 5 to 10 minutes to account for upload + generation delay if current_time - mtime < 600: # 10 minutes recent_files.append(excel_file) mtime_str = datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S') logger.info(f" - {excel_file.name} (modified: {mtime_str}) - will be used for manual upload generation") else: logger.warning(f" - {excel_file.name} (modified: {datetime.fromtimestamp(mtime).strftime('%Y-%m-%d %H:%M:%S')}) - skipping (too old, might be from previous run)") if len(recent_files) < len(excel_files): logger.warning(f"Found {len(excel_files)} total file(s), but only {len(recent_files)} are recent. Clearing old files to avoid combining...") # Clear old files to ensure we only use the manually uploaded ones for excel_file in excel_files: if excel_file not in recent_files: try: excel_file.unlink() logger.info(f"Cleared old file: {excel_file.name}") except Exception as e: logger.warning(f"Failed to clear old file {excel_file.name}: {e}") if len(recent_files) == 0: logger.error("Manual upload was used but no recent files found in reports directory!") logger.error("This might mean:") logger.error("1. Files were not uploaded successfully") logger.error("2. Files were uploaded but cleared before generation") logger.error("3. File modification times are incorrect") return jsonify({ 'error': 'No files found for manual upload generation', 'details': 'Files were uploaded but not found in reports directory. Please try uploading again.', 'manual_upload_error': True }), 400 # Verify we only have the recently uploaded files all_files = list(reports_dir_path.glob('*.xlsx')) + list(reports_dir_path.glob('*.xls')) if len(all_files) != len(recent_files): logger.warning(f"WARNING: Found {len(all_files)} file(s) but only {len(recent_files)} are recent!") logger.warning("Clearing old files to ensure only uploaded files are used...") for file in all_files: if file not in recent_files: try: file.unlink() logger.info(f"Cleared unexpected old file: {file.name}") except Exception as e: logger.error(f"Failed to clear unexpected file {file.name}: {e}") logger.info(f"Will generate report from {len(recent_files)} recently uploaded file(s)") else: logger.error("Manual upload was used but reports directory does not exist!") return jsonify({ 'error': 'Reports directory does not exist', 'details': 'Cannot generate report from manual upload - reports directory is missing.', 'manual_upload_error': True }), 500 else: # SharePoint download was attempted but failed - this shouldn't happen if download_from_sp=True # But if it does, we should NOT use existing files as they might be old logger.error("SharePoint download was requested but failed, and no manual upload was used!") logger.error("This should not happen - refusing to use potentially old files") return jsonify({ 'error': 'SharePoint download failed and no manual upload provided', 'details': 'Cannot generate report - no data source available. Please try again or use manual upload.', 'sharepoint_error': True }), 400 report_data = generate_report( reports_dir=str(reports_dir_path), output_file=output_file, verbose=False # Don't print to console in API mode ) if report_data: # Generate HTML with same timestamp html_file = output_dir / f"{report_id}.html" from html_generator import generate_html_report generate_html_report(output_file, str(html_file)) # Cleanup old reports (keep only last 10) # Ensure reports_dir is a Path object reports_dir_for_cleanup = Path(reports_dir) if isinstance(reports_dir, str) else reports_dir cleanup_old_reports(output_dir, reports_dir_for_cleanup, max_reports=10) return jsonify({ 'status': 'success', 'message': 'Report generated successfully', 'report_id': report_id, 'report_date': timestamp, 'output_file': output_file, 'summary': report_data.get('summary', {}), 'vendors_count': len(report_data.get('vendors', [])), 'downloaded_files': len(downloaded_files) if download_from_sp else 0, 'downloaded_file_names': [Path(f).name for f in downloaded_files] if download_from_sp else [] }) else: return jsonify({ 'error': 'Report generation failed' }), 500 except Exception as e: logger.error(f"Error generating report: {e}", exc_info=True) return jsonify({ 'error': f'Report generation failed: {str(e)}' }), 500 @app.route('/api/upload', methods=['POST']) def upload_files(): """Upload Excel files manually. Clears old files before uploading new ones.""" try: if 'files' not in request.files: return jsonify({'error': 'No files provided'}), 400 files = request.files.getlist('files') if not files or all(f.filename == '' for f in files): return jsonify({'error': 'No files selected'}), 400 report_config = app.config['REPORT_CONFIG'] reports_dir_str = report_config.get('reports_dir', 'reports') reports_dir = Path(reports_dir_str) if not reports_dir.is_absolute(): script_dir = Path(__file__).parent.absolute() reports_dir = script_dir / reports_dir # Ensure reports directory exists reports_dir.mkdir(parents=True, exist_ok=True) # ALWAYS clear ALL old Excel files from reports directory before uploading new ones # CRITICAL: This prevents combining multiple files in report generation old_excel_files = list(reports_dir.glob('*.xlsx')) + list(reports_dir.glob('*.xls')) cleared_count = 0 failed_to_clear = [] for old_file in old_excel_files: try: # On Windows, files might be locked - try multiple times max_retries = 3 retry_count = 0 while retry_count < max_retries: try: old_file.unlink() cleared_count += 1 logger.info(f"Cleared old file before upload: {old_file.name}") break except PermissionError: retry_count += 1 if retry_count < max_retries: import time time.sleep(0.5) # Wait 500ms before retry else: raise except Exception as e: failed_to_clear.append(old_file.name) logger.error(f"Failed to clear old file {old_file.name}: {e}") # If any files failed to clear, fail the upload to prevent mixing old and new data if failed_to_clear: logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before upload: {failed_to_clear}") return jsonify({ 'error': f'Failed to clear {len(failed_to_clear)} old file(s) before upload. Please ensure files are not locked or in use.', 'failed_files': failed_to_clear, 'details': 'Old files must be cleared before upload to ensure report generation uses only the new file(s). Files may be locked by Excel or another process.' }), 500 if cleared_count > 0: logger.info(f"Cleared {cleared_count} old Excel file(s) before upload") else: logger.info("No old Excel files found to clear (reports directory was empty)") uploaded_count = 0 uploaded_files = [] for file in files: if file.filename == '': continue # Check if it's an Excel file filename = secure_filename(file.filename) if not (filename.endswith('.xlsx') or filename.endswith('.xls')): logger.warning(f"Skipping non-Excel file: {filename}") continue # Save file to reports directory file_path = reports_dir / filename file.save(str(file_path)) uploaded_count += 1 uploaded_files.append(filename) logger.info(f"Uploaded file: {filename} -> {file_path}") if uploaded_count == 0: return jsonify({'error': 'No valid Excel files uploaded'}), 400 # Warn if multiple files uploaded - reports should be generated from ONE file if uploaded_count > 1: logger.warning(f"WARNING: {uploaded_count} files uploaded. Reports should be generated from a single file. Only the newest file will be used.") return jsonify({ 'status': 'success', 'message': f'Successfully uploaded {uploaded_count} file(s)', 'uploaded_count': uploaded_count, 'uploaded_files': uploaded_files, 'cleared_old_files': cleared_count, 'warning': f'{uploaded_count} file(s) uploaded - only the newest will be used for report generation' if uploaded_count > 1 else None }) except Exception as e: logger.error(f"Error uploading files: {e}", exc_info=True) return jsonify({'error': f'Failed to upload files: {str(e)}'}), 500 @app.route('/api/status', methods=['GET']) def status(): """Get service status and configuration.""" return jsonify({ 'status': 'running', 'sharepoint_enabled': app.config['SHAREPOINT_CONFIG'].get('enabled', False), 'reports_dir': app.config['REPORT_CONFIG'].get('reports_dir', 'reports'), 'output_dir': app.config['REPORT_CONFIG'].get('output_dir', 'output') }) @app.route('/api/report/json', methods=['GET']) def get_report_json(): """Get latest report JSON file.""" try: report_config = app.config['REPORT_CONFIG'] output_dir_str = report_config.get('output_dir', 'output') output_dir = Path(output_dir_str) if not output_dir.is_absolute(): script_dir = Path(__file__).parent.absolute() output_dir = script_dir / output_dir report_file = output_dir / 'report.json' if not report_file.exists(): return jsonify({'error': 'Report not found. Generate a report first.'}), 404 with open(report_file, 'r', encoding='utf-8') as f: report_data = json.load(f) return jsonify(report_data) except Exception as e: logger.error(f"Error reading report JSON: {e}", exc_info=True) return jsonify({'error': f'Failed to read report: {str(e)}'}), 500 @app.route('/api/report/html', methods=['GET']) def get_report_html(): """Get report HTML file by report_id (or latest if not specified).""" try: from flask import send_from_directory report_config = app.config['REPORT_CONFIG'] output_dir_str = report_config.get('output_dir', 'output') output_dir = Path(output_dir_str) if not output_dir.is_absolute(): script_dir = Path(__file__).parent.absolute() output_dir = script_dir / output_dir # Get report_id from query parameter, default to latest report_id = request.args.get('report_id') if report_id: # Check if it's a timestamped report or legacy report html_file = output_dir / f"{report_id}.html" # If not found and it starts with "report-", might be a legacy report with generated ID if not html_file.exists() and report_id.startswith('report-'): # Try legacy report.html legacy_file = output_dir / 'report.html' if legacy_file.exists(): html_file = legacy_file else: return jsonify({'error': f'Report {report_id} not found.'}), 404 elif not html_file.exists(): return jsonify({'error': f'Report {report_id} not found.'}), 404 else: # Get latest report (check both timestamped and legacy) timestamped_files = list(output_dir.glob('report-*.html')) legacy_file = output_dir / 'report.html' html_files = [] if legacy_file.exists(): html_files.append(legacy_file) html_files.extend(timestamped_files) if not html_files: return jsonify({'error': 'No reports found. Generate a report first.'}), 404 html_file = sorted(html_files, key=lambda p: p.stat().st_mtime, reverse=True)[0] return send_from_directory(str(output_dir), html_file.name, mimetype='text/html') except Exception as e: logger.error(f"Error reading report HTML: {e}", exc_info=True) return jsonify({'error': f'Failed to read report HTML: {str(e)}'}), 500 @app.route('/api/reports/list', methods=['GET']) def list_reports(): """List all available reports (last 10).""" try: report_config = app.config['REPORT_CONFIG'] output_dir_str = report_config.get('output_dir', 'output') output_dir = Path(output_dir_str) # Ensure absolute path if not output_dir.is_absolute(): script_dir = Path(__file__).parent.absolute() output_dir = script_dir / output_dir # Log for debugging logger.info(f"Looking for reports in: {output_dir.absolute()}") logger.info(f"Output directory exists: {output_dir.exists()}") if output_dir.exists(): logger.info(f"Files in output directory: {list(output_dir.glob('*'))}") # Find all report HTML files (both timestamped and non-timestamped) timestamped_files = list(output_dir.glob('report-*.html')) legacy_file = output_dir / 'report.html' logger.info(f"Found {len(timestamped_files)} timestamped report files") logger.info(f"Legacy report.html exists: {legacy_file.exists()}") if legacy_file.exists(): logger.info(f"Legacy report.html path: {legacy_file.absolute()}") html_files = [] # Add legacy report.html if it exists if legacy_file.exists(): html_files.append(legacy_file) logger.info("Added legacy report.html to list") # Add timestamped files html_files.extend(timestamped_files) logger.info(f"Total HTML files found: {len(html_files)}") reports = [] for html_file in sorted(html_files, key=lambda p: p.stat().st_mtime, reverse=True)[:10]: report_id = html_file.stem # e.g., "report-2025-11-08-11-25-46" or "report" # Handle legacy report.html if report_id == 'report': # Use file modification time as timestamp mtime = html_file.stat().st_mtime dt = datetime.fromtimestamp(mtime) timestamp_str = dt.strftime('%Y-%m-%d-%H-%M-%S') date_str = dt.strftime('%Y-%m-%d %H:%M:%S') report_id = f"report-{timestamp_str}" else: # Timestamped report timestamp_str = report_id.replace('report-', '') try: # Parse timestamp to create readable date dt = datetime.strptime(timestamp_str, '%Y-%m-%d-%H-%M-%S') date_str = dt.strftime('%Y-%m-%d %H:%M:%S') except: date_str = timestamp_str # Get file size file_size = html_file.stat().st_size reports.append({ 'report_id': report_id, 'date': date_str, 'timestamp': timestamp_str, 'file_size': file_size }) return jsonify({ 'reports': reports, 'count': len(reports) }) except Exception as e: logger.error(f"Error listing reports: {e}", exc_info=True) return jsonify({'error': f'Failed to list reports: {str(e)}'}), 500 return app def run_server(config_path: Optional[str] = None, host: Optional[str] = None, port: Optional[int] = None): """Run the API server.""" app = create_app(config_path) api_config = config.get('api', {}) server_host = host or api_config.get('host', '0.0.0.0') server_port = port or api_config.get('port', 8080) logger.info(f"Starting API server on {server_host}:{server_port}") app.run(host=server_host, port=server_port, debug=False) if __name__ == "__main__": import sys logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) config_path = sys.argv[1] if len(sys.argv) > 1 else None # Check if API is enabled config = load_config(config_path) if not config.get('api', {}).get('enabled', False): logger.warning("API is disabled in configuration. Set api.enabled=true to enable.") logger.info("Starting API server anyway (for testing)...") run_server(config_path=config_path)