#!/usr/bin/env python3 """ SharePoint File Downloader using Office365-REST-Python-Client Downloads Excel files from SharePoint to the local reports directory. Uses Office365-REST-Python-Client library for SharePoint REST API access. """ import os from pathlib import Path from typing import Optional, List from datetime import datetime import logging try: from office365.sharepoint.client_context import ClientContext from office365.runtime.auth.authentication_context import AuthenticationContext from office365.sharepoint.files.file import File OFFICE365_AVAILABLE = True except ImportError: OFFICE365_AVAILABLE = False logging.warning("office365-rest-python-client not installed. SharePoint features disabled.") logger = logging.getLogger(__name__) class SharePointDownloader: """Downloads files from SharePoint using Office365-REST-Python-Client.""" def __init__( self, site_url: str, tenant_id: Optional[str] = None, client_id: Optional[str] = None, client_secret: Optional[str] = None, use_app_authentication: bool = True ): """ Initialize SharePoint downloader using Office365-REST-Python-Client. Args: site_url: SharePoint site URL (e.g., "https://yourcompany.sharepoint.com/sites/YourSite") tenant_id: Azure AD tenant ID (required for app authentication) client_id: Azure AD app client ID (required for app authentication) client_secret: Azure AD app client secret (required for app authentication) use_app_authentication: Whether to use app authentication (default: True) """ if not OFFICE365_AVAILABLE: raise ImportError( "office365-rest-python-client is required for SharePoint integration. " "Install it with: pip install Office365-REST-Python-Client" ) self.site_url = site_url.rstrip('/') self.tenant_id = tenant_id self.client_id = client_id self.client_secret = client_secret self.use_app_authentication = use_app_authentication self.ctx = None if not self.client_id or not self.client_secret: logger.error("Client ID and Client Secret are required for SharePoint authentication.") raise ValueError("Missing Azure AD credentials for SharePoint.") def connect(self) -> bool: """Connect to SharePoint site.""" if self.ctx: return True try: if self.use_app_authentication: # App-only authentication using Office365-REST-Python-Client from office365.runtime.auth.client_credential import ClientCredential logger.info(f"Connecting to SharePoint site: {self.site_url}") logger.info(f"Using Client ID: {self.client_id[:8]}... (truncated for security)") credentials = ClientCredential(self.client_id, self.client_secret) self.ctx = ClientContext(self.site_url).with_credentials(credentials) # Test connection by getting web # This will fail if RSC is not granted or credentials are wrong web = self.ctx.web self.ctx.load(web) self.ctx.execute_query() logger.info(f"Successfully connected to SharePoint site: {web.properties['Title']}") return True else: logger.error("Only app-only authentication is supported") return False except Exception as e: error_msg = str(e) logger.error(f"Failed to connect to SharePoint: {error_msg}", exc_info=True) # Provide helpful error messages if "Unsupported app only token" in error_msg or "401" in error_msg: logger.error("This error usually means:") logger.error("1. Resource-Specific Consent (RSC) is not granted for this site") logger.error("2. Go to: {}/_layouts/15/appinv.aspx".format(self.site_url)) logger.error("3. Enter App ID: {}".format(self.client_id)) logger.error("4. Grant permission with XML: ") elif "403" in error_msg or "Forbidden" in error_msg: logger.error("403 Forbidden - App does not have access to this site") logger.error("RSC must be granted via appinv.aspx") elif "Invalid client secret" in error_msg or "invalid_client" in error_msg: logger.error("Invalid client credentials - check CLIENT_ID and CLIENT_SECRET") return False def list_files_in_folder( self, folder_path: str, file_pattern: Optional[str] = None ) -> List[dict]: """ List files in a SharePoint folder. Args: folder_path: Folder path relative to site root (e.g., "Shared Documents/General/Amazon Punchlist [EXTERNAL]") file_pattern: Optional file pattern filter (e.g., "*.xlsx") Returns: List of file metadata dictionaries """ if not self.ctx: if not self.connect(): return [] try: # Normalize folder path # User provides: /Shared Documents/General/Amazon Punchlist [EXTERNAL] # SharePoint needs: /sites/SiteName/Shared Documents/General/Amazon Punchlist [EXTERNAL] folder_path = folder_path.strip('/') # Extract site path from site_url from urllib.parse import urlparse site_path = urlparse(self.site_url).path.strip('/') # Construct full server-relative URL # If folder_path already starts with site path, use as-is # Otherwise, prepend site path if folder_path.startswith(site_path + '/'): server_relative_url = f"/{folder_path}" elif site_path: server_relative_url = f"/{site_path}/{folder_path}" else: server_relative_url = f"/{folder_path}" logger.info(f"Listing files in folder: {server_relative_url}") logger.info(f"Site URL: {self.site_url}, Site path: {site_path}, Folder path: {folder_path}") # Get folder folder = self.ctx.web.get_folder_by_server_relative_url(server_relative_url) files = folder.files self.ctx.load(files) self.ctx.execute_query() excel_files = [] for file in files: file_name = file.properties["Name"] # Only consider Excel files if file_name and (file_name.endswith('.xlsx') or file_name.endswith('.xls')): # Apply file pattern filter if provided if file_pattern: pattern = file_pattern.replace('*', '') if not file_name.endswith(pattern): continue excel_files.append({ "name": file_name, "server_relative_url": file.properties.get("ServerRelativeUrl", ""), "size": file.properties.get("Length", 0), "time_last_modified": file.properties.get("TimeLastModified", "") }) logger.info(f"Found {len(excel_files)} Excel file(s) in folder") for file_info in excel_files: logger.info(f" - {file_info['name']} ({file_info['size']} bytes)") return excel_files except Exception as e: logger.error(f"Error listing files: {e}", exc_info=True) return [] def download_file( self, server_relative_url: str, file_name: str, local_path: str, overwrite: bool = True ) -> bool: """ Download a single file from SharePoint. Args: server_relative_url: Server-relative URL of the file file_name: The original name of the file (for logging) local_path: Local path where file should be saved overwrite: Whether to overwrite existing file Returns: True if successful, False otherwise """ if not self.ctx: if not self.connect(): return False local_file = None try: local_file_path = Path(local_path) local_file_path.parent.mkdir(parents=True, exist_ok=True) if local_file_path.exists() and not overwrite: logger.info(f"File already exists, skipping: {local_path}") return True logger.info(f"Downloading file: {file_name} from {server_relative_url} to {local_path}") # Get file file = self.ctx.web.get_file_by_server_relative_url(server_relative_url) self.ctx.load(file) self.ctx.execute_query() # Open file and keep it open during download # The Office365 library writes to the file during execute_query() local_file = open(local_file_path, "wb") # Download file content - this sets up the download callback file.download(local_file) # Execute the query - this actually performs the download and writes to the file self.ctx.execute_query() # Close the file after download completes local_file.close() local_file = None logger.info(f"Successfully downloaded: {file_name} -> {local_path}") return True except Exception as e: logger.error(f"Error downloading file {file_name}: {e}", exc_info=True) if local_file: try: local_file.close() except: pass return False def download_files_from_folder( self, folder_path: str, local_dir: str, file_pattern: Optional[str] = None, overwrite: bool = True, clear_existing: bool = True ) -> List[str]: """ Download Excel files from a SharePoint folder. By default, downloads only the newest file and clears old files. Args: folder_path: Folder path relative to site root local_dir: Local directory to save files file_pattern: Optional file pattern filter (e.g., "*.xlsx") overwrite: Whether to overwrite existing files clear_existing: If True, clear all existing Excel files before downloading (default: True) Returns: List of downloaded file paths (typically 1 file - the newest) """ # Connect to SharePoint if not self.connect(): logger.error("Failed to connect to SharePoint") return [] # Prepare local directory local_dir_path = Path(local_dir) local_dir_path.mkdir(parents=True, exist_ok=True) # ALWAYS clear ALL existing Excel files before downloading (to ensure only new files are used) # This is critical to prevent combining multiple files # Wait a moment first to allow any previous file operations to complete import time time.sleep(1.0) # Give file handles time to close existing_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls')) cleared_count = 0 failed_to_clear = [] for old_file in existing_files: try: # On Windows, files might be locked - try multiple times with increasing delays max_retries = 5 retry_count = 0 cleared_this_file = False while retry_count < max_retries and not cleared_this_file: try: old_file.unlink() cleared_count += 1 cleared_this_file = True logger.info(f"Cleared existing file before download: {old_file.name}") break except PermissionError as pe: retry_count += 1 if retry_count < max_retries: # Increasing delay: 0.5s, 1s, 2s, 3s import time delay = min(0.5 * (2 ** retry_count), 3.0) logger.warning(f"File {old_file.name} is locked (attempt {retry_count}/{max_retries}), waiting {delay}s...") time.sleep(delay) else: # Last attempt failed - try renaming instead of deleting logger.warning(f"Cannot delete {old_file.name}, trying to rename instead...") try: import time timestamp = int(time.time()) backup_name = f"{old_file.stem}_backup_{timestamp}{old_file.suffix}" backup_path = old_file.parent / backup_name old_file.rename(backup_path) cleared_count += 1 cleared_this_file = True logger.info(f"Renamed locked file to backup: {old_file.name} -> {backup_name}") except Exception as rename_error: logger.error(f"Could not rename file either: {rename_error}") raise pe # Raise original PermissionError except Exception as e: if retry_count >= max_retries - 1: raise retry_count += 1 import time time.sleep(1) if not cleared_this_file: failed_to_clear.append(old_file.name) logger.error(f"Failed to clear existing file {old_file.name} after {max_retries} attempts") except Exception as e: if old_file.name not in failed_to_clear: failed_to_clear.append(old_file.name) logger.error(f"Failed to clear existing file {old_file.name}: {e}") if failed_to_clear: logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}") logger.error("This will cause data mixing! Files may be locked by another process.") logger.error("ABORTING download to prevent combining multiple files.") raise Exception(f"Cannot download from SharePoint: {len(failed_to_clear)} file(s) could not be cleared. Please close any programs that might have these files open: {failed_to_clear}") if cleared_count > 0: logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint") else: logger.info("No existing Excel files found to clear (reports directory was empty)") # VERIFY: Double-check that all Excel files are actually gone remaining_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls')) if remaining_files: logger.error(f"CRITICAL: After clearing, {len(remaining_files)} file(s) still exist: {[f.name for f in remaining_files]}") logger.error("These files are likely locked. Attempting force removal...") for remaining_file in remaining_files: try: remaining_file.unlink() logger.info(f"Force-removed locked file: {remaining_file.name}") except Exception as e: logger.error(f"CRITICAL: Cannot remove locked file {remaining_file.name}: {e}") raise Exception(f"Cannot proceed: File {remaining_file.name} is locked and cannot be deleted. Please close Excel or any other program using this file.") logger.info("✓ Verified: All old Excel files cleared successfully") # List files in folder files = self.list_files_in_folder(folder_path, file_pattern) if not files: logger.warning(f"No Excel files found in folder: {folder_path}") return [] # Sort files by last modified date (newest first) and download only the newest one def parse_time(time_str): try: if time_str: # Office365 library returns datetime objects or ISO strings if isinstance(time_str, datetime): return time_str # Try parsing ISO format return datetime.fromisoformat(str(time_str).replace('Z', '+00:00')) return datetime.min except: return datetime.min files_sorted = sorted(files, key=lambda f: parse_time(f.get("time_last_modified", "")), reverse=True) if len(files_sorted) > 1: logger.info(f"Found {len(files_sorted)} Excel file(s) in SharePoint folder. Using only the newest file.") logger.info(f"Newest file: {files_sorted[0]['name']} (modified: {files_sorted[0].get('time_last_modified', 'Unknown')})") if len(files_sorted) > 1: logger.info(f"Skipping {len(files_sorted) - 1} older file(s) to avoid combining data") # Download only the newest file downloaded_files = [] newest_file = files_sorted[0] file_name = newest_file["name"] server_relative_url = newest_file["server_relative_url"] local_file_path = local_dir_path / file_name if self.download_file(server_relative_url, file_name, str(local_file_path), overwrite=overwrite): downloaded_files.append(str(local_file_path)) logger.info(f"Successfully downloaded newest file: {file_name}") else: logger.error(f"Failed to download file: {file_name}") logger.info(f"Downloaded {len(downloaded_files)} file(s) from {folder_path} (using only newest file)") return downloaded_files def download_from_sharepoint( site_url: str, folder_path: Optional[str] = None, file_path: Optional[str] = None, local_dir: str = "reports", tenant_id: Optional[str] = None, client_id: Optional[str] = None, client_secret: Optional[str] = None, use_app_authentication: bool = True, file_pattern: Optional[str] = None, overwrite: bool = True, clear_existing: bool = True ) -> List[str]: """ Convenience function to download files from SharePoint using Office365-REST-Python-Client. Args: site_url: SharePoint site URL folder_path: Path to folder (if downloading all files from folder) file_path: Path to specific file (if downloading single file) - NOT YET IMPLEMENTED local_dir: Local directory to save files tenant_id: Azure AD tenant ID (not used by Office365 library, but kept for compatibility) client_id: Azure AD app client ID (required for app authentication) client_secret: Azure AD app client secret (required for app authentication) use_app_authentication: Use app authentication (default: True) file_pattern: Pattern to filter files (e.g., "*.xlsx") overwrite: Whether to overwrite existing files clear_existing: If True, clear all existing Excel files before downloading (default: True) Returns: List of downloaded file paths (typically 1 file - the newest) """ if not folder_path and not file_path: logger.error("Either folder_path or file_path must be provided") return [] if file_path: logger.warning("Single file download not yet implemented") return [] downloader = SharePointDownloader( site_url=site_url, tenant_id=tenant_id, client_id=client_id, client_secret=client_secret, use_app_authentication=use_app_authentication ) if folder_path: # Download only the newest file from folder (clears existing files first) return downloader.download_files_from_folder( folder_path=folder_path, local_dir=local_dir, file_pattern=file_pattern, overwrite=overwrite, clear_existing=clear_existing ) else: logger.error("file_path download not yet implemented") return [] if __name__ == "__main__": import sys from config import load_config logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) config = load_config() if not config.get('sharepoint'): logger.error("SharePoint configuration not found") sys.exit(1) sp_config = config['sharepoint'] downloaded = download_from_sharepoint( site_url=sp_config['site_url'], folder_path=sp_config.get('folder_path'), file_path=sp_config.get('file_path'), local_dir=sp_config.get('local_dir', 'reports'), tenant_id=sp_config.get('tenant_id'), client_id=sp_config.get('client_id'), client_secret=sp_config.get('client_secret'), use_app_authentication=sp_config.get('use_app_authentication', True), file_pattern=sp_config.get('file_pattern'), overwrite=sp_config.get('overwrite', True) ) print(f"Downloaded {len(downloaded)} file(s)") for file_path in downloaded: print(f" - {file_path}")