462 lines
19 KiB
Python
462 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
SharePoint File Downloader using Office365-REST-Python-Client
|
|
|
|
Downloads Excel files from SharePoint to the local reports directory.
|
|
Uses Office365-REST-Python-Client library for SharePoint REST API access.
|
|
"""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional, List
|
|
from datetime import datetime
|
|
import logging
|
|
|
|
try:
|
|
from office365.sharepoint.client_context import ClientContext
|
|
from office365.runtime.auth.authentication_context import AuthenticationContext
|
|
from office365.sharepoint.files.file import File
|
|
OFFICE365_AVAILABLE = True
|
|
except ImportError:
|
|
OFFICE365_AVAILABLE = False
|
|
logging.warning("office365-rest-python-client not installed. SharePoint features disabled.")
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SharePointDownloader:
    """Downloads Excel files from SharePoint using Office365-REST-Python-Client.

    The instance lazily opens one ``ClientContext`` (app-only authentication)
    and reuses it for listing folders and downloading files.
    """

    # Lower-case file extensions that are treated as Excel workbooks.
    EXCEL_SUFFIXES = ('.xlsx', '.xls')

    # Deleting a local workbook is attempted this many times, pausing between
    # attempts, because on Windows the file may be briefly locked.
    _UNLINK_ATTEMPTS = 3
    _UNLINK_RETRY_DELAY_SECONDS = 0.5

    def __init__(
        self,
        site_url: str,
        tenant_id: Optional[str] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        use_app_authentication: bool = True
    ):
        """
        Initialize SharePoint downloader using Office365-REST-Python-Client.

        Args:
            site_url: SharePoint site URL (e.g., "https://yourcompany.sharepoint.com/sites/YourSite").
                A trailing slash is removed.
            tenant_id: Azure AD tenant ID. Stored on the instance; this class does not read it.
            client_id: Azure AD app client ID (required)
            client_secret: Azure AD app client secret (required)
            use_app_authentication: Whether to use app authentication (default: True).
                ``connect()`` refuses to connect when this is False.

        Raises:
            ImportError: office365-rest-python-client is not installed.
            ValueError: ``client_id`` or ``client_secret`` is missing.
        """
        if not OFFICE365_AVAILABLE:
            raise ImportError(
                "office365-rest-python-client is required for SharePoint integration. "
                "Install it with: pip install Office365-REST-Python-Client"
            )

        self.site_url = site_url.rstrip('/')
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.use_app_authentication = use_app_authentication
        self.ctx = None

        if not self.client_id or not self.client_secret:
            logger.error("Client ID and Client Secret are required for SharePoint authentication.")
            raise ValueError("Missing Azure AD credentials for SharePoint.")

    # ------------------------------------------------------------------
    # Pure helpers (no network, no logging)
    # ------------------------------------------------------------------

    @staticmethod
    def _join_server_relative_url(site_path: str, folder_path: str) -> str:
        """Build a server-relative URL from the site path and a folder path.

        Leading/trailing slashes on both arguments are ignored. The site path
        is prepended unless ``folder_path`` already starts with it.
        """
        site = site_path.strip('/')
        folder = folder_path.strip('/')
        if site and not folder.startswith(site + '/'):
            return f"/{site}/{folder}"
        return f"/{folder}"

    @staticmethod
    def _is_excel_file(file_name: str) -> bool:
        """Return True when ``file_name`` ends in an Excel extension (any letter case)."""
        return file_name.lower().endswith(SharePointDownloader.EXCEL_SUFFIXES)

    @staticmethod
    def _matches_pattern(file_name: str, file_pattern: Optional[str]) -> bool:
        """Return True when ``file_name`` satisfies ``file_pattern``.

        Matching is case-insensitive. ``*`` matches any run of characters; every
        other character is literal (so names containing ``[`` or ``]`` work).
        A pattern without ``*`` is treated as a required suffix, which keeps
        patterns such as ".xlsx" working as they did before.
        """
        if not file_pattern:
            return True

        import re

        name = file_name.lower()
        pattern = file_pattern.lower()
        if '*' not in pattern:
            return name.endswith(pattern)

        regex = '.*'.join(re.escape(part) for part in pattern.split('*'))
        return re.fullmatch(regex, name, re.DOTALL) is not None

    @staticmethod
    def _parse_modified_time(value) -> datetime:
        """Convert a last-modified value into a timezone-aware ``datetime``.

        Accepts a ``datetime`` or an ISO-8601 string (a trailing "Z" is
        understood as UTC). Naive values are assumed to be UTC. Missing or
        unparseable values map to the earliest representable UTC time, so they
        sort as oldest. Always returning aware values avoids the ``TypeError``
        raised when naive and aware datetimes are compared during sorting.
        """
        from datetime import timezone

        oldest = datetime.min.replace(tzinfo=timezone.utc)
        if not value:
            return oldest

        if isinstance(value, datetime):
            parsed = value
        else:
            try:
                parsed = datetime.fromisoformat(str(value).replace('Z', '+00:00'))
            except ValueError:
                return oldest

        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # ------------------------------------------------------------------
    # Connection
    # ------------------------------------------------------------------

    def connect(self) -> bool:
        """Connect to SharePoint site.

        The context is stored on ``self.ctx`` only after a test query has
        succeeded, so a failed attempt can be retried by calling this again.

        Returns:
            True if a working connection exists, False otherwise.
        """
        if self.ctx:
            return True

        if not self.use_app_authentication:
            logger.error("Only app-only authentication is supported")
            return False

        try:
            # App-only authentication using Office365-REST-Python-Client
            from office365.runtime.auth.client_credential import ClientCredential

            logger.info(f"Connecting to SharePoint site: {self.site_url}")
            logger.info(f"Using Client ID: {self.client_id[:8]}... (truncated for security)")

            credentials = ClientCredential(self.client_id, self.client_secret)
            ctx = ClientContext(self.site_url).with_credentials(credentials)

            # Test connection by getting web
            # This will fail if RSC is not granted or credentials are wrong
            web = ctx.web
            ctx.load(web)
            ctx.execute_query()

            site_title = web.properties.get('Title', self.site_url)
            logger.info(f"Successfully connected to SharePoint site: {site_title}")

        except Exception as e:
            error_msg = str(e)
            logger.error(f"Failed to connect to SharePoint: {error_msg}", exc_info=True)
            self._log_connection_hints(error_msg)
            return False

        self.ctx = ctx
        return True

    def _log_connection_hints(self, error_msg: str) -> None:
        """Log troubleshooting hints chosen by substrings of the error text."""
        if "Unsupported app only token" in error_msg or "401" in error_msg:
            logger.error("This error usually means:")
            logger.error("1. Resource-Specific Consent (RSC) is not granted for this site")
            logger.error(f"2. Go to: {self.site_url}/_layouts/15/appinv.aspx")
            logger.error(f"3. Enter App ID: {self.client_id}")
            logger.error("4. Grant permission with XML: <AppPermissionRequests AllowAppOnlyPolicy=\"true\"><AppPermissionRequest Scope=\"http://sharepoint/content/sitecollection\" Right=\"Read\" /></AppPermissionRequests>")
        elif "403" in error_msg or "Forbidden" in error_msg:
            logger.error("403 Forbidden - App does not have access to this site")
            logger.error("RSC must be granted via appinv.aspx")
        elif "Invalid client secret" in error_msg or "invalid_client" in error_msg:
            logger.error("Invalid client credentials - check CLIENT_ID and CLIENT_SECRET")

    # ------------------------------------------------------------------
    # Listing
    # ------------------------------------------------------------------

    def list_files_in_folder(
        self,
        folder_path: str,
        file_pattern: Optional[str] = None
    ) -> List[dict]:
        """
        List Excel files in a SharePoint folder.

        Args:
            folder_path: Folder path relative to site root (e.g., "Shared Documents/General/Amazon Punchlist [EXTERNAL]").
                A path that already starts with the site path is used as-is.
            file_pattern: Optional file pattern filter (e.g., "*.xlsx"); see ``_matches_pattern``.

        Returns:
            List of dicts with keys "name", "server_relative_url", "size" and
            "time_last_modified". Empty when the connection or the query fails.
        """
        if not self.ctx and not self.connect():
            return []

        try:
            from urllib.parse import urlparse

            # User provides: /Shared Documents/General/Amazon Punchlist [EXTERNAL]
            # SharePoint needs: /sites/SiteName/Shared Documents/General/Amazon Punchlist [EXTERNAL]
            folder_path = folder_path.strip('/')
            site_path = urlparse(self.site_url).path.strip('/')
            server_relative_url = self._join_server_relative_url(site_path, folder_path)

            logger.info(f"Listing files in folder: {server_relative_url}")
            logger.info(f"Site URL: {self.site_url}, Site path: {site_path}, Folder path: {folder_path}")

            folder = self.ctx.web.get_folder_by_server_relative_url(server_relative_url)
            files = folder.files
            self.ctx.load(files)
            self.ctx.execute_query()

            excel_files = []
            for remote_file in files:
                properties = remote_file.properties
                file_name = properties.get("Name")
                # Only consider Excel files that also pass the optional pattern
                if not file_name or not self._is_excel_file(file_name):
                    continue
                if not self._matches_pattern(file_name, file_pattern):
                    continue

                excel_files.append({
                    "name": file_name,
                    "server_relative_url": properties.get("ServerRelativeUrl", ""),
                    "size": properties.get("Length", 0),
                    "time_last_modified": properties.get("TimeLastModified", "")
                })

            logger.info(f"Found {len(excel_files)} Excel file(s) in folder")
            for file_info in excel_files:
                logger.info(f"  - {file_info['name']} ({file_info['size']} bytes)")

            return excel_files

        except Exception as e:
            logger.error(f"Error listing files: {e}", exc_info=True)
            return []

    # ------------------------------------------------------------------
    # Downloading
    # ------------------------------------------------------------------

    def download_file(
        self,
        server_relative_url: str,
        file_name: str,
        local_path: str,
        overwrite: bool = True
    ) -> bool:
        """
        Download a single file from SharePoint.

        The content is written to "<local_path>.part" and moved onto
        ``local_path`` only after the download finished, so a failed download
        never leaves a truncated file at the destination.

        Args:
            server_relative_url: Server-relative URL of the file
            file_name: The original name of the file (for logging)
            local_path: Local path where file should be saved
            overwrite: Whether to overwrite existing file

        Returns:
            True if successful (or skipped because the file exists and
            ``overwrite`` is False), False otherwise
        """
        if not self.ctx and not self.connect():
            return False

        partial_path = None
        try:
            local_file_path = Path(local_path)
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            if local_file_path.exists() and not overwrite:
                logger.info(f"File already exists, skipping: {local_path}")
                return True

            logger.info(f"Downloading file: {file_name} from {server_relative_url} to {local_path}")

            # Load the file metadata first so a missing file fails before
            # anything is written locally.
            file = self.ctx.web.get_file_by_server_relative_url(server_relative_url)
            self.ctx.load(file)
            self.ctx.execute_query()

            partial_path = local_file_path.with_name(local_file_path.name + ".part")
            # The Office365 library writes to the file during execute_query(),
            # so the handle must stay open until that call returns.
            with open(partial_path, "wb") as local_file:
                file.download(local_file)
                self.ctx.execute_query()

            # Replace the destination only after a complete download.
            partial_path.replace(local_file_path)

            logger.info(f"Successfully downloaded: {file_name} -> {local_path}")
            return True

        except Exception as e:
            logger.error(f"Error downloading file {file_name}: {e}", exc_info=True)
            if partial_path is not None:
                try:
                    if partial_path.exists():
                        partial_path.unlink()
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove partial download {partial_path}: {cleanup_error}")
            return False

    def _unlink_with_retry(self, path: Path) -> None:
        """Delete ``path``, retrying on ``PermissionError``; re-raise after the last attempt."""
        import time

        for attempt in range(1, self._UNLINK_ATTEMPTS + 1):
            try:
                path.unlink()
                return
            except PermissionError:
                # On Windows, files might be locked - try multiple times
                if attempt == self._UNLINK_ATTEMPTS:
                    raise
                time.sleep(self._UNLINK_RETRY_DELAY_SECONDS)

    def _clear_local_excel_files(self, directory: Path) -> None:
        """Delete every Excel file directly inside ``directory`` and log the outcome.

        Failures are logged but do not abort the caller.
        """
        existing_files = [
            entry for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in self.EXCEL_SUFFIXES
        ]
        cleared_count = 0
        failed_to_clear = []

        for old_file in existing_files:
            try:
                self._unlink_with_retry(old_file)
            except Exception as e:
                failed_to_clear.append(old_file.name)
                logger.error(f"Failed to clear existing file {old_file.name}: {e}")
            else:
                cleared_count += 1
                logger.info(f"Cleared existing file before download: {old_file.name}")

        if failed_to_clear:
            logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}")
            logger.error("This will cause data mixing! Files may be locked by another process.")
            # Don't fail here - let the download proceed, but log the warning

        if cleared_count > 0:
            logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint")
        else:
            logger.info("No existing Excel files found to clear (reports directory was empty)")

    def download_files_from_folder(
        self,
        folder_path: str,
        local_dir: str,
        file_pattern: Optional[str] = None,
        overwrite: bool = True,
        clear_existing: bool = True
    ) -> List[str]:
        """
        Download the newest Excel file from a SharePoint folder.

        Only the most recently modified matching file is downloaded. With the
        default ``clear_existing=True`` all local Excel files are removed
        first, so that only the new file is present afterwards.

        Args:
            folder_path: Folder path relative to site root
            local_dir: Local directory to save files (created if missing)
            file_pattern: Optional file pattern filter (e.g., "*.xlsx")
            overwrite: Whether to overwrite existing files
            clear_existing: If True, clear all existing Excel files before downloading (default: True)

        Returns:
            List of downloaded file paths (at most 1 file - the newest)
        """
        if not self.connect():
            logger.error("Failed to connect to SharePoint")
            return []

        local_dir_path = Path(local_dir)
        local_dir_path.mkdir(parents=True, exist_ok=True)

        # Clearing first prevents older workbooks from being combined with
        # the freshly downloaded one.
        if clear_existing:
            self._clear_local_excel_files(local_dir_path)
        else:
            logger.info(f"clear_existing is False - keeping existing Excel files in {local_dir_path}")

        files = self.list_files_in_folder(folder_path, file_pattern)
        if not files:
            logger.warning(f"No Excel files found in folder: {folder_path}")
            return []

        # Sort files by last modified date (newest first) and download only the newest one
        files_sorted = sorted(
            files,
            key=lambda f: self._parse_modified_time(f.get("time_last_modified", "")),
            reverse=True
        )
        newest_file = files_sorted[0]

        if len(files_sorted) > 1:
            logger.info(f"Found {len(files_sorted)} Excel file(s) in SharePoint folder. Using only the newest file.")
            logger.info(f"Newest file: {newest_file['name']} (modified: {newest_file.get('time_last_modified', 'Unknown')})")
            logger.info(f"Skipping {len(files_sorted) - 1} older file(s) to avoid combining data")

        downloaded_files = []
        file_name = newest_file["name"]
        server_relative_url = newest_file["server_relative_url"]
        local_file_path = local_dir_path / file_name

        if self.download_file(server_relative_url, file_name, str(local_file_path), overwrite=overwrite):
            downloaded_files.append(str(local_file_path))
            logger.info(f"Successfully downloaded newest file: {file_name}")
        else:
            logger.error(f"Failed to download file: {file_name}")

        logger.info(f"Downloaded {len(downloaded_files)} file(s) from {folder_path} (using only newest file)")
        return downloaded_files
|
|
|
|
|
|
def download_from_sharepoint(
    site_url: str,
    folder_path: Optional[str] = None,
    file_path: Optional[str] = None,
    local_dir: str = "reports",
    tenant_id: Optional[str] = None,
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_app_authentication: bool = True,
    file_pattern: Optional[str] = None,
    overwrite: bool = True,
    clear_existing: bool = True
) -> List[str]:
    """
    Convenience function to download files from SharePoint using Office365-REST-Python-Client.

    Builds a ``SharePointDownloader`` and delegates to its
    ``download_files_from_folder`` method. Supplying ``file_path`` always
    yields an empty result (even if ``folder_path`` is also given), because
    single-file download is not implemented.

    Args:
        site_url: SharePoint site URL
        folder_path: Path to folder to download from
        file_path: Path to specific file (if downloading single file) - NOT YET IMPLEMENTED
        local_dir: Local directory to save files
        tenant_id: Azure AD tenant ID (passed to the downloader, kept for compatibility)
        client_id: Azure AD app client ID (required for app authentication)
        client_secret: Azure AD app client secret (required for app authentication)
        use_app_authentication: Use app authentication (default: True)
        file_pattern: Pattern to filter files (e.g., "*.xlsx")
        overwrite: Whether to overwrite existing files
        clear_existing: Passed through to ``download_files_from_folder`` (default: True)

    Returns:
        List of downloaded file paths (typically 1 file - the newest); empty
        when neither path is given or when ``file_path`` is given.

    Raises:
        ImportError: Raised by ``SharePointDownloader`` when the Office365 library is missing.
        ValueError: Raised by ``SharePointDownloader`` when ``client_id`` or ``client_secret`` is missing.
    """
    if not folder_path and not file_path:
        logger.error("Either folder_path or file_path must be provided")
        return []

    if file_path:
        logger.warning("Single file download not yet implemented")
        return []

    # From here on folder_path is guaranteed to be set: both guards above
    # return, so no further branching on it is needed.
    downloader = SharePointDownloader(
        site_url=site_url,
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
        use_app_authentication=use_app_authentication
    )

    return downloader.download_files_from_folder(
        folder_path=folder_path,
        local_dir=local_dir,
        file_pattern=file_pattern,
        overwrite=overwrite,
        clear_existing=clear_existing
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the SharePoint section of the project config,
    # run one download, and print the resulting local paths.
    import sys

    from config import load_config

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    config = load_config()

    # Without a SharePoint section there is nothing to download.
    if not config.get('sharepoint'):
        logger.error("SharePoint configuration not found")
        sys.exit(1)

    sharepoint_settings = config['sharepoint']

    # Only site_url is mandatory; every other setting falls back to a default.
    download_arguments = {
        "site_url": sharepoint_settings['site_url'],
        "folder_path": sharepoint_settings.get('folder_path'),
        "file_path": sharepoint_settings.get('file_path'),
        "local_dir": sharepoint_settings.get('local_dir', 'reports'),
        "tenant_id": sharepoint_settings.get('tenant_id'),
        "client_id": sharepoint_settings.get('client_id'),
        "client_secret": sharepoint_settings.get('client_secret'),
        "use_app_authentication": sharepoint_settings.get('use_app_authentication', True),
        "file_pattern": sharepoint_settings.get('file_pattern'),
        "overwrite": sharepoint_settings.get('overwrite', True),
    }
    saved_paths = download_from_sharepoint(**download_arguments)

    print(f"Downloaded {len(saved_paths)} file(s)")
    for saved_path in saved_paths:
        print(f"  - {saved_path}")
|