vendor_report/sharepoint_downloader.py
2025-11-08 15:44:43 +04:00

462 lines
19 KiB
Python

#!/usr/bin/env python3
"""
SharePoint File Downloader using Office365-REST-Python-Client
Downloads Excel files from SharePoint to the local reports directory.
Uses Office365-REST-Python-Client library for SharePoint REST API access.
"""
import os
from pathlib import Path
from typing import Optional, List
from datetime import datetime
import logging
# Module-level logger. It is created before the optional import below so the
# fallback warning is emitted through this module's logger instead of the
# root-level ``logging.warning`` helper, which implicitly configures the root
# logger (via ``basicConfig``) as an import side effect.
logger = logging.getLogger(__name__)

# The Office365 client is an optional dependency: importing this module must
# not fail when it is missing. ``OFFICE365_AVAILABLE`` records the outcome and
# is checked when a ``SharePointDownloader`` is constructed.
try:
    from office365.sharepoint.client_context import ClientContext
    from office365.runtime.auth.authentication_context import AuthenticationContext
    from office365.sharepoint.files.file import File
    OFFICE365_AVAILABLE = True
except ImportError:
    OFFICE365_AVAILABLE = False
    logger.warning("office365-rest-python-client not installed. SharePoint features disabled.")
class SharePointDownloader:
    """Downloads Excel files from SharePoint using Office365-REST-Python-Client.

    Only app-only authentication (client ID + client secret) is supported.
    The SharePoint context is created lazily by ``connect()`` and stored in
    ``self.ctx`` only once a test query against the site has succeeded.
    """

    # File name suffixes that are treated as Excel workbooks.
    EXCEL_SUFFIXES = ('.xlsx', '.xls')
    # Number of attempts and pause (seconds) used when deleting a local file
    # that may be temporarily locked by another process.
    _UNLINK_RETRIES = 3
    _UNLINK_RETRY_DELAY = 0.5

    def __init__(
        self,
        site_url: str,
        tenant_id: Optional[str] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        use_app_authentication: bool = True
    ):
        """
        Initialize SharePoint downloader using Office365-REST-Python-Client.

        Args:
            site_url: SharePoint site URL (e.g., "https://yourcompany.sharepoint.com/sites/YourSite")
            tenant_id: Azure AD tenant ID (stored, but not used by this class)
            client_id: Azure AD app client ID (required)
            client_secret: Azure AD app client secret (required)
            use_app_authentication: Whether to use app authentication (default: True)

        Raises:
            ImportError: If Office365-REST-Python-Client is not installed.
            ValueError: If client_id or client_secret is missing.
        """
        if not OFFICE365_AVAILABLE:
            raise ImportError(
                "office365-rest-python-client is required for SharePoint integration. "
                "Install it with: pip install Office365-REST-Python-Client"
            )
        self.site_url = site_url.rstrip('/')
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.use_app_authentication = use_app_authentication
        # Populated by connect() after a successful test query.
        self.ctx = None
        if not self.client_id or not self.client_secret:
            logger.error("Client ID and Client Secret are required for SharePoint authentication.")
            raise ValueError("Missing Azure AD credentials for SharePoint.")

    def connect(self) -> bool:
        """
        Connect to the SharePoint site.

        The context is kept in a local variable until the test query has
        succeeded, so a failed attempt never leaves a half-initialised
        ``self.ctx`` behind that later calls would mistake for a live
        connection.

        Returns:
            True if a connection is (already) established, False otherwise
        """
        if self.ctx:
            return True
        if not self.use_app_authentication:
            logger.error("Only app-only authentication is supported")
            return False
        try:
            # App-only authentication using Office365-REST-Python-Client
            from office365.runtime.auth.client_credential import ClientCredential
            logger.info(f"Connecting to SharePoint site: {self.site_url}")
            logger.info(f"Using Client ID: {self.client_id[:8]}... (truncated for security)")
            credentials = ClientCredential(self.client_id, self.client_secret)
            ctx = ClientContext(self.site_url).with_credentials(credentials)
            # Test connection by getting web.
            # This will fail if RSC is not granted or credentials are wrong.
            web = ctx.web
            ctx.load(web)
            ctx.execute_query()
            site_title = web.properties.get('Title', self.site_url)
            logger.info(f"Successfully connected to SharePoint site: {site_title}")
        except Exception as e:
            error_msg = str(e)
            logger.error(f"Failed to connect to SharePoint: {error_msg}", exc_info=True)
            # Provide helpful error messages
            if "Unsupported app only token" in error_msg or "401" in error_msg:
                logger.error("This error usually means:")
                logger.error("1. Resource-Specific Consent (RSC) is not granted for this site")
                logger.error("2. Go to: {}/_layouts/15/appinv.aspx".format(self.site_url))
                logger.error("3. Enter App ID: {}".format(self.client_id))
                logger.error("4. Grant permission with XML: <AppPermissionRequests AllowAppOnlyPolicy=\"true\"><AppPermissionRequest Scope=\"http://sharepoint/content/sitecollection\" Right=\"Read\" /></AppPermissionRequests>")
            elif "403" in error_msg or "Forbidden" in error_msg:
                logger.error("403 Forbidden - App does not have access to this site")
                logger.error("RSC must be granted via appinv.aspx")
            elif "Invalid client secret" in error_msg or "invalid_client" in error_msg:
                logger.error("Invalid client credentials - check CLIENT_ID and CLIENT_SECRET")
            return False
        # Only cache the context once it has been proven to work.
        self.ctx = ctx
        return True

    @staticmethod
    def _matches_pattern(file_name: str, file_pattern: Optional[str]) -> bool:
        """
        Check a file name against an optional pattern.

        ``*`` is the only wildcard and matches any run of characters; every
        other character (including ``[`` and ``?``) is literal. A pattern
        without ``*`` is treated as a required suffix.

        Args:
            file_name: Name of the file to test
            file_pattern: Pattern such as "*.xlsx" or "Report*.xlsx", or None

        Returns:
            True if there is no pattern or the name matches it
        """
        if not file_pattern:
            return True
        if '*' not in file_pattern:
            return file_name.endswith(file_pattern)
        import re
        regex = '.*'.join(re.escape(part) for part in file_pattern.split('*'))
        return re.fullmatch(regex, file_name, re.DOTALL) is not None

    @staticmethod
    def _parse_modified_time(value) -> datetime:
        """
        Convert a last-modified value into a timezone-aware datetime.

        Values without timezone information are assumed to be UTC, and
        missing or unparseable values map to the earliest representable UTC
        time. Returning aware datetimes in every case keeps the results
        mutually comparable (naive and aware datetimes cannot be ordered).

        Args:
            value: A datetime, an ISO-8601 string (a trailing "Z" is accepted), or an empty value

        Returns:
            A timezone-aware datetime usable as a sort key
        """
        from datetime import timezone
        oldest = datetime.min.replace(tzinfo=timezone.utc)
        if not value:
            return oldest
        if isinstance(value, datetime):
            parsed = value
        else:
            try:
                parsed = datetime.fromisoformat(str(value).replace('Z', '+00:00'))
            except ValueError:
                return oldest
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed

    def list_files_in_folder(
        self,
        folder_path: str,
        file_pattern: Optional[str] = None
    ) -> List[dict]:
        """
        List Excel files in a SharePoint folder.

        Args:
            folder_path: Folder path relative to site root (e.g., "Shared Documents/General/Amazon Punchlist [EXTERNAL]")
            file_pattern: Optional file pattern filter (e.g., "*.xlsx")

        Returns:
            List of file metadata dictionaries with the keys "name",
            "server_relative_url", "size" and "time_last_modified";
            an empty list if the connection or the listing fails
        """
        if not self.ctx and not self.connect():
            return []
        try:
            # Normalize folder path
            # User provides: /Shared Documents/General/Amazon Punchlist [EXTERNAL]
            # SharePoint needs: /sites/SiteName/Shared Documents/General/Amazon Punchlist [EXTERNAL]
            from urllib.parse import urlparse
            folder_path = folder_path.strip('/')
            site_path = urlparse(self.site_url).path.strip('/')
            # Prepend the site path unless the caller already included it.
            if site_path and not folder_path.startswith(site_path + '/'):
                server_relative_url = f"/{site_path}/{folder_path}"
            else:
                server_relative_url = f"/{folder_path}"
            logger.info(f"Listing files in folder: {server_relative_url}")
            logger.info(f"Site URL: {self.site_url}, Site path: {site_path}, Folder path: {folder_path}")
            # Get folder
            folder = self.ctx.web.get_folder_by_server_relative_url(server_relative_url)
            files = folder.files
            self.ctx.load(files)
            self.ctx.execute_query()
            excel_files = []
            for file in files:
                file_name = file.properties.get("Name")
                # Only consider Excel files
                if not file_name or not file_name.endswith(self.EXCEL_SUFFIXES):
                    continue
                # Apply file pattern filter if provided
                if not self._matches_pattern(file_name, file_pattern):
                    continue
                excel_files.append({
                    "name": file_name,
                    "server_relative_url": file.properties.get("ServerRelativeUrl", ""),
                    "size": file.properties.get("Length", 0),
                    "time_last_modified": file.properties.get("TimeLastModified", "")
                })
            logger.info(f"Found {len(excel_files)} Excel file(s) in folder")
            for file_info in excel_files:
                logger.info(f" - {file_info['name']} ({file_info['size']} bytes)")
            return excel_files
        except Exception as e:
            logger.error(f"Error listing files: {e}", exc_info=True)
            return []

    def download_file(
        self,
        server_relative_url: str,
        file_name: str,
        local_path: str,
        overwrite: bool = True
    ) -> bool:
        """
        Download a single file from SharePoint.

        The content is written to a temporary "<name>.part" file next to the
        destination and moved into place only after the download finished,
        so a failed download neither leaves a truncated workbook behind nor
        destroys a previously downloaded copy.

        Args:
            server_relative_url: Server-relative URL of the file
            file_name: The original name of the file (for logging)
            local_path: Local path where file should be saved
            overwrite: Whether to overwrite existing file

        Returns:
            True if successful (or skipped because the file exists and
            overwrite is False), False otherwise
        """
        if not self.ctx and not self.connect():
            return False
        partial_path = None
        try:
            local_file_path = Path(local_path)
            local_file_path.parent.mkdir(parents=True, exist_ok=True)
            if local_file_path.exists() and not overwrite:
                logger.info(f"File already exists, skipping: {local_path}")
                return True
            logger.info(f"Downloading file: {file_name} from {server_relative_url} to {local_path}")
            # Get file
            file = self.ctx.web.get_file_by_server_relative_url(server_relative_url)
            self.ctx.load(file)
            self.ctx.execute_query()
            partial_path = local_file_path.with_name(local_file_path.name + ".part")
            # The target must stay open across execute_query(): download()
            # only sets up the transfer, the query performs the actual write.
            with open(partial_path, "wb") as local_file:
                file.download(local_file)
                self.ctx.execute_query()
            # Publish the finished download in one step.
            partial_path.replace(local_file_path)
            logger.info(f"Successfully downloaded: {file_name} -> {local_path}")
            return True
        except Exception as e:
            logger.error(f"Error downloading file {file_name}: {e}", exc_info=True)
            if partial_path is not None:
                # Best-effort cleanup of the incomplete temporary file.
                try:
                    partial_path.unlink()
                except FileNotFoundError:
                    pass
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove partial download {partial_path}: {cleanup_error}")
            return False

    def _remove_with_retry(self, path: Path) -> None:
        """Delete ``path``, retrying on PermissionError (e.g. a file locked on Windows)."""
        import time
        for attempt in range(1, self._UNLINK_RETRIES + 1):
            try:
                path.unlink()
                return
            except PermissionError:
                if attempt == self._UNLINK_RETRIES:
                    raise
                time.sleep(self._UNLINK_RETRY_DELAY)

    def _clear_local_excel_files(self, local_dir_path: Path) -> None:
        """Remove all Excel files from ``local_dir_path`` and log the outcome."""
        existing_files = list(local_dir_path.glob('*.xlsx')) + list(local_dir_path.glob('*.xls'))
        cleared_count = 0
        failed_to_clear = []
        for old_file in existing_files:
            try:
                self._remove_with_retry(old_file)
            except Exception as e:
                failed_to_clear.append(old_file.name)
                logger.error(f"Failed to clear existing file {old_file.name}: {e}")
            else:
                cleared_count += 1
                logger.info(f"Cleared existing file before download: {old_file.name}")
        if failed_to_clear:
            logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}")
            logger.error("This will cause data mixing! Files may be locked by another process.")
            # Don't fail here - let the download proceed, but log the warning
        if cleared_count > 0:
            logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint")
        else:
            logger.info("No existing Excel files found to clear (reports directory was empty)")

    def download_files_from_folder(
        self,
        folder_path: str,
        local_dir: str,
        file_pattern: Optional[str] = None,
        overwrite: bool = True,
        clear_existing: bool = True
    ) -> List[str]:
        """
        Download the newest Excel file from a SharePoint folder.

        Only the most recently modified file is downloaded. With
        ``clear_existing`` (the default) all local Excel files are removed
        first, before the remote folder is listed, so stale workbooks are
        never combined with the new download.

        Args:
            folder_path: Folder path relative to site root
            local_dir: Local directory to save files
            file_pattern: Optional file pattern filter (e.g., "*.xlsx")
            overwrite: Whether to overwrite existing files
            clear_existing: If True, clear all existing Excel files before downloading (default: True)

        Returns:
            List of downloaded file paths (at most 1 file - the newest)
        """
        # Connect to SharePoint
        if not self.connect():
            logger.error("Failed to connect to SharePoint")
            return []
        # Prepare local directory
        local_dir_path = Path(local_dir)
        local_dir_path.mkdir(parents=True, exist_ok=True)
        if clear_existing:
            self._clear_local_excel_files(local_dir_path)
        # List files in folder
        files = self.list_files_in_folder(folder_path, file_pattern)
        if not files:
            logger.warning(f"No Excel files found in folder: {folder_path}")
            return []
        # Pick the most recently modified file; on ties the first listed wins.
        newest_file = max(
            files,
            key=lambda f: self._parse_modified_time(f.get("time_last_modified", ""))
        )
        skipped = len(files) - 1
        if skipped:
            logger.info(f"Found {len(files)} Excel file(s) in SharePoint folder. Using only the newest file.")
        logger.info(f"Newest file: {newest_file['name']} (modified: {newest_file.get('time_last_modified', 'Unknown')})")
        if skipped:
            logger.info(f"Skipping {skipped} older file(s) to avoid combining data")
        # Download only the newest file
        downloaded_files = []
        file_name = newest_file["name"]
        local_file_path = local_dir_path / file_name
        if self.download_file(newest_file["server_relative_url"], file_name, str(local_file_path), overwrite=overwrite):
            downloaded_files.append(str(local_file_path))
            logger.info(f"Successfully downloaded newest file: {file_name}")
        else:
            logger.error(f"Failed to download file: {file_name}")
        logger.info(f"Downloaded {len(downloaded_files)} file(s) from {folder_path} (using only newest file)")
        return downloaded_files
def download_from_sharepoint(
    site_url: str,
    folder_path: Optional[str] = None,
    file_path: Optional[str] = None,
    local_dir: str = "reports",
    tenant_id: Optional[str] = None,
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_app_authentication: bool = True,
    file_pattern: Optional[str] = None,
    overwrite: bool = True,
    clear_existing: bool = True
) -> List[str]:
    """
    Convenience function to download files from SharePoint using Office365-REST-Python-Client.

    Builds a SharePointDownloader and downloads the newest Excel file from
    ``folder_path``. Supplying ``file_path`` is not supported yet: it takes
    precedence over ``folder_path`` and makes the call return an empty list.

    Args:
        site_url: SharePoint site URL
        folder_path: Path to the folder to download from
        file_path: Path to specific file (if downloading single file) - NOT YET IMPLEMENTED
        local_dir: Local directory to save files
        tenant_id: Azure AD tenant ID (passed through to the downloader, kept for compatibility)
        client_id: Azure AD app client ID (required for app authentication)
        client_secret: Azure AD app client secret (required for app authentication)
        use_app_authentication: Use app authentication (default: True)
        file_pattern: Pattern to filter files (e.g., "*.xlsx")
        overwrite: Whether to overwrite existing files
        clear_existing: If True, clear all existing Excel files before downloading (default: True)

    Returns:
        List of downloaded file paths (typically 1 file - the newest);
        empty if neither path is given, if file_path is given, or if the
        download fails

    Raises:
        ImportError / ValueError: Propagated from SharePointDownloader when
            the client library or the credentials are missing.
    """
    if not folder_path and not file_path:
        logger.error("Either folder_path or file_path must be provided")
        return []
    if file_path:
        logger.warning("Single file download not yet implemented")
        return []
    # From here on folder_path is guaranteed to be set, so no further
    # branching is needed.
    downloader = SharePointDownloader(
        site_url=site_url,
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
        use_app_authentication=use_app_authentication
    )
    # Download only the newest file from folder (clears existing files first)
    return downloader.download_files_from_folder(
        folder_path=folder_path,
        local_dir=local_dir,
        file_pattern=file_pattern,
        overwrite=overwrite,
        clear_existing=clear_existing
    )
if __name__ == "__main__":
    # Script entry point: read the SharePoint settings from the project
    # configuration, run the download and print a short summary.
    import sys
    from config import load_config

    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO
    )

    config = load_config()
    sp_config = config.get('sharepoint')
    if not sp_config:
        logger.error("SharePoint configuration not found")
        sys.exit(1)

    # 'site_url' is the only mandatory key; everything else falls back to
    # the defaults of download_from_sharepoint().
    downloaded = download_from_sharepoint(
        site_url=sp_config['site_url'],
        folder_path=sp_config.get('folder_path'),
        file_path=sp_config.get('file_path'),
        local_dir=sp_config.get('local_dir', 'reports'),
        tenant_id=sp_config.get('tenant_id'),
        client_id=sp_config.get('client_id'),
        client_secret=sp_config.get('client_secret'),
        use_app_authentication=sp_config.get('use_app_authentication', True),
        file_pattern=sp_config.get('file_pattern'),
        overwrite=sp_config.get('overwrite', True)
    )

    summary_lines = [f"Downloaded {len(downloaded)} file(s)"]
    summary_lines.extend(f" - {saved_path}" for saved_path in downloaded)
    print("\n".join(summary_lines))