# vendor_report/sharepoint_downloader.py

#!/usr/bin/env python3
"""
SharePoint File Downloader using Office365-REST-Python-Client

Downloads Excel files from SharePoint to the local reports directory.
Uses Office365-REST-Python-Client library for SharePoint REST API access.
"""

import fnmatch
import logging
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

# The module logger is created before the optional import so that the fallback
# warning below is emitted through it rather than through the root logger.
logger = logging.getLogger(__name__)

try:
    from office365.sharepoint.client_context import ClientContext
    from office365.runtime.auth.authentication_context import AuthenticationContext
    from office365.sharepoint.files.file import File
    OFFICE365_AVAILABLE = True
except ImportError:
    # The library is optional at import time; SharePointDownloader.__init__
    # checks this flag and raises ImportError when the library is missing.
    OFFICE365_AVAILABLE = False
    # Use the module logger: the module-level logging.warning() helper calls
    # logging.basicConfig() when the root logger has no handlers, which would
    # turn any later basicConfig(...) call by the application into a no-op.
    logger.warning("office365-rest-python-client not installed. SharePoint features disabled.")


class SharePointDownloader:
    """Downloads Excel files from SharePoint using Office365-REST-Python-Client.

    Only app-only (client ID / client secret) authentication is supported.
    All public methods report failure through their return value (``False`` or
    an empty list) and log the details; only ``__init__`` raises.
    """

    # Lower-case file extensions that are treated as Excel workbooks.
    EXCEL_SUFFIXES = ('.xlsx', '.xls')

    # How often, and how long apart (seconds), deleting a locked local file is
    # attempted. On Windows a file may be briefly locked by another process.
    _UNLINK_RETRIES = 3
    _UNLINK_RETRY_DELAY = 0.5

    def __init__(
        self,
        site_url: str,
        tenant_id: Optional[str] = None,
        client_id: Optional[str] = None,
        client_secret: Optional[str] = None,
        use_app_authentication: bool = True
    ):
        """
        Initialize SharePoint downloader using Office365-REST-Python-Client.

        No network access happens here; the connection is made by connect().

        Args:
            site_url: SharePoint site URL (e.g., "https://yourcompany.sharepoint.com/sites/YourSite").
                A trailing slash is removed.
            tenant_id: Azure AD tenant ID. Stored on the instance but not read by this class.
            client_id: Azure AD app client ID (required)
            client_secret: Azure AD app client secret (required)
            use_app_authentication: Whether to use app authentication (default: True).
                connect() fails when this is False.

        Raises:
            ImportError: If office365-rest-python-client is not installed.
            ValueError: If client_id or client_secret is missing.
        """
        if not OFFICE365_AVAILABLE:
            raise ImportError(
                "office365-rest-python-client is required for SharePoint integration. "
                "Install it with: pip install Office365-REST-Python-Client"
            )

        self.site_url = site_url.rstrip('/')
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.use_app_authentication = use_app_authentication
        # Set by connect() once a connection has been verified.
        self.ctx = None

        if not self.client_id or not self.client_secret:
            logger.error("Client ID and Client Secret are required for SharePoint authentication.")
            raise ValueError("Missing Azure AD credentials for SharePoint.")

    def connect(self) -> bool:
        """
        Connect to the SharePoint site and verify the connection.

        The connection is verified by loading the site's web object. The
        client context is stored on ``self.ctx`` only after that succeeds, so
        a failed attempt is retried by the next call instead of being
        mistaken for an established connection.

        Returns:
            True if already connected or the connection succeeded, False otherwise.
        """
        if self.ctx:
            return True

        if not self.use_app_authentication:
            logger.error("Only app-only authentication is supported")
            return False

        try:
            # App-only authentication using Office365-REST-Python-Client
            from office365.runtime.auth.client_credential import ClientCredential

            logger.info(f"Connecting to SharePoint site: {self.site_url}")
            logger.info(f"Using Client ID: {self.client_id[:8]}... (truncated for security)")

            credentials = ClientCredential(self.client_id, self.client_secret)
            ctx = ClientContext(self.site_url).with_credentials(credentials)

            # Test connection by getting web
            # This will fail if RSC is not granted or credentials are wrong
            web = ctx.web
            ctx.load(web)
            ctx.execute_query()

            logger.info(f"Successfully connected to SharePoint site: {web.properties['Title']}")

        except Exception as e:
            error_msg = str(e)
            logger.error(f"Failed to connect to SharePoint: {error_msg}", exc_info=True)
            self._log_connection_hints(error_msg)
            return False

        # Publish the context only now that the probe query has succeeded.
        self.ctx = ctx
        return True

    def _log_connection_hints(self, error_msg: str) -> None:
        """Log troubleshooting hints chosen by substrings of a connection error message."""
        if "Unsupported app only token" in error_msg or "401" in error_msg:
            logger.error("This error usually means:")
            logger.error("1. Resource-Specific Consent (RSC) is not granted for this site")
            logger.error(f"2. Go to: {self.site_url}/_layouts/15/appinv.aspx")
            logger.error(f"3. Enter App ID: {self.client_id}")
            logger.error("4. Grant permission with XML: <AppPermissionRequests AllowAppOnlyPolicy=\"true\"><AppPermissionRequest Scope=\"http://sharepoint/content/sitecollection\" Right=\"Read\" /></AppPermissionRequests>")
        elif "403" in error_msg or "Forbidden" in error_msg:
            logger.error("403 Forbidden - App does not have access to this site")
            logger.error("RSC must be granted via appinv.aspx")
        elif "Invalid client secret" in error_msg or "invalid_client" in error_msg:
            logger.error("Invalid client credentials - check CLIENT_ID and CLIENT_SECRET")

    def _server_relative_url(self, folder_path: str) -> str:
        """
        Build the server-relative URL for a folder path.

        The path component of ``self.site_url`` (e.g. "sites/YourSite") is
        prepended unless ``folder_path`` already starts with it. Leading and
        trailing slashes of ``folder_path`` are ignored.
        """
        relative = folder_path.strip('/')
        site_path = urlparse(self.site_url).path.strip('/')
        if site_path and not relative.startswith(site_path + '/'):
            return f"/{site_path}/{relative}"
        return f"/{relative}"

    @staticmethod
    def _matches_pattern(file_name: str, file_pattern: Optional[str]) -> bool:
        """
        Return True if ``file_name`` matches ``file_pattern`` (case-insensitive).

        ``*`` and ``?`` are wildcards; ``[`` is matched literally so names such
        as "Punchlist [EXTERNAL].xlsx" can be written as-is. A pattern without
        wildcards is treated as a required suffix (e.g. ".xlsx"). An empty or
        missing pattern matches everything.
        """
        if not file_pattern:
            return True

        name = file_name.lower()
        pattern = file_pattern.lower()
        if '*' not in pattern and '?' not in pattern:
            return name.endswith(pattern)

        # "[[]" is fnmatch's spelling of a literal "[".
        return fnmatch.fnmatchcase(name, pattern.replace('[', '[[]'))

    @staticmethod
    def _parse_modified_time(value) -> datetime:
        """
        Convert a "last modified" value into a timezone-aware datetime.

        Accepts a datetime or an ISO-8601 string (a trailing "Z" is accepted).
        Naive values are assumed to be UTC. Missing or unparseable values map
        to the earliest representable UTC datetime, so they sort as oldest.
        Always returning aware datetimes keeps the results mutually comparable.
        """
        oldest = datetime.min.replace(tzinfo=timezone.utc)
        if not value:
            return oldest

        if isinstance(value, datetime):
            parsed = value
        else:
            try:
                parsed = datetime.fromisoformat(str(value).replace('Z', '+00:00'))
            except ValueError:
                return oldest

        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def list_files_in_folder(
        self,
        folder_path: str,
        file_pattern: Optional[str] = None
    ) -> List[dict]:
        """
        List Excel files in a SharePoint folder.

        Only files whose name ends in .xlsx or .xls (case-insensitive) are
        considered.

        Args:
            folder_path: Folder path relative to site root (e.g., "Shared Documents/General/Amazon Punchlist [EXTERNAL]").
                A path that already starts with the site path is used as-is.
            file_pattern: Optional file pattern filter (e.g., "*.xlsx", "Report*.xlsx")

        Returns:
            List of file metadata dictionaries with the keys "name",
            "server_relative_url", "size" and "time_last_modified".
            Empty if the connection or the listing fails.
        """
        if not self.ctx and not self.connect():
            return []

        try:
            server_relative_url = self._server_relative_url(folder_path)

            logger.info(f"Listing files in folder: {server_relative_url}")
            logger.info(f"Site URL: {self.site_url}, Folder path: {folder_path.strip('/')}")

            # Get folder
            folder = self.ctx.web.get_folder_by_server_relative_url(server_relative_url)
            files = folder.files
            self.ctx.load(files)
            self.ctx.execute_query()

            excel_files = []
            for remote_file in files:
                file_name = remote_file.properties.get("Name")
                # Only consider Excel files
                if not file_name or not file_name.lower().endswith(self.EXCEL_SUFFIXES):
                    continue
                # Apply file pattern filter if provided
                if not self._matches_pattern(file_name, file_pattern):
                    continue

                excel_files.append({
                    "name": file_name,
                    "server_relative_url": remote_file.properties.get("ServerRelativeUrl", ""),
                    "size": remote_file.properties.get("Length", 0),
                    "time_last_modified": remote_file.properties.get("TimeLastModified", "")
                })

            logger.info(f"Found {len(excel_files)} Excel file(s) in folder")
            for file_info in excel_files:
                logger.info(f"  - {file_info['name']} ({file_info['size']} bytes)")

            return excel_files

        except Exception as e:
            logger.error(f"Error listing files: {e}", exc_info=True)
            return []

    def download_file(
        self,
        server_relative_url: str,
        file_name: str,
        local_path: str,
        overwrite: bool = True
    ) -> bool:
        """
        Download a single file from SharePoint.

        The content is written to a temporary "<name>.part" file next to the
        destination and renamed into place only after the download completes,
        so a failed download neither truncates an existing file nor leaves a
        partial file under the final name.

        Args:
            server_relative_url: Server-relative URL of the file
            file_name: The original name of the file (for logging)
            local_path: Local path where file should be saved
            overwrite: Whether to overwrite existing file

        Returns:
            True if the file was downloaded, or already exists and
            ``overwrite`` is False; False otherwise.
        """
        if not self.ctx and not self.connect():
            return False

        partial_path = None
        try:
            destination = Path(local_path)
            destination.parent.mkdir(parents=True, exist_ok=True)

            if destination.exists() and not overwrite:
                logger.info(f"File already exists, skipping: {local_path}")
                return True

            logger.info(f"Downloading file: {file_name} from {server_relative_url} to {local_path}")

            # Get file
            remote_file = self.ctx.web.get_file_by_server_relative_url(server_relative_url)
            self.ctx.load(remote_file)
            self.ctx.execute_query()

            partial_path = destination.with_name(destination.name + ".part")
            # The file must stay open until execute_query() returns: download()
            # only registers the request, the content is written when the
            # query is executed.
            with open(partial_path, "wb") as handle:
                remote_file.download(handle)
                self.ctx.execute_query()

            partial_path.replace(destination)

            logger.info(f"Successfully downloaded: {file_name} -> {local_path}")
            return True

        except Exception as e:
            logger.error(f"Error downloading file {file_name}: {e}", exc_info=True)
            if partial_path is not None:
                try:
                    partial_path.unlink()
                except FileNotFoundError:
                    pass  # Nothing had been written yet.
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove partial download {partial_path}: {cleanup_error}")
            return False

    def _unlink_with_retry(self, path: Path) -> None:
        """Delete ``path``, retrying on PermissionError; the last error is re-raised."""
        for attempt in range(1, self._UNLINK_RETRIES + 1):
            try:
                path.unlink()
                return
            except PermissionError:
                if attempt == self._UNLINK_RETRIES:
                    raise
                time.sleep(self._UNLINK_RETRY_DELAY)

    def _clear_local_excel_files(self, directory: Path) -> None:
        """Delete every Excel file directly inside ``directory`` and log the outcome."""
        existing_files = [
            entry for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in self.EXCEL_SUFFIXES
        ]
        if not existing_files:
            logger.info("No existing Excel files found to clear (reports directory was empty)")
            return

        cleared_count = 0
        failed_to_clear = []
        for old_file in existing_files:
            try:
                self._unlink_with_retry(old_file)
            except OSError as e:
                failed_to_clear.append(old_file.name)
                logger.error(f"Failed to clear existing file {old_file.name}: {e}")
            else:
                cleared_count += 1
                logger.info(f"Cleared existing file before download: {old_file.name}")

        if failed_to_clear:
            logger.error(f"CRITICAL: Failed to clear {len(failed_to_clear)} file(s) before download: {failed_to_clear}")
            logger.error("This will cause data mixing! Files may be locked by another process.")
            # Don't fail here - let the download proceed, but log the warning

        if cleared_count > 0:
            logger.info(f"Cleared {cleared_count} existing Excel file(s) before downloading from SharePoint")

    def download_files_from_folder(
        self,
        folder_path: str,
        local_dir: str,
        file_pattern: Optional[str] = None,
        overwrite: bool = True,
        clear_existing: bool = True
    ) -> List[str]:
        """
        Download the newest Excel file from a SharePoint folder.

        Only the most recently modified matching file is downloaded, so that
        data from several reports is never combined.

        Args:
            folder_path: Folder path relative to site root
            local_dir: Local directory to save files (created if missing)
            file_pattern: Optional file pattern filter (e.g., "*.xlsx")
            overwrite: Whether to overwrite existing files
            clear_existing: If True, delete all Excel files in ``local_dir``
                before downloading (default: True). Files are cleared before
                the remote folder is listed.

        Returns:
            List of downloaded file paths: one path on success, empty on failure.
        """
        # Connect to SharePoint
        if not self.connect():
            logger.error("Failed to connect to SharePoint")
            return []

        # Prepare local directory
        local_dir_path = Path(local_dir)
        local_dir_path.mkdir(parents=True, exist_ok=True)

        if clear_existing:
            self._clear_local_excel_files(local_dir_path)
        else:
            logger.info(f"clear_existing is False; keeping existing Excel files in {local_dir_path}")

        # List files in folder
        files = self.list_files_in_folder(folder_path, file_pattern)

        if not files:
            logger.warning(f"No Excel files found in folder: {folder_path}")
            return []

        # max() returns the first of several equally new files, i.e. the one
        # listed first by SharePoint.
        newest_file = max(
            files,
            key=lambda info: self._parse_modified_time(info.get("time_last_modified", ""))
        )

        if len(files) > 1:
            logger.info(f"Found {len(files)} Excel file(s) in SharePoint folder. Using only the newest file.")
            logger.info(f"Newest file: {newest_file['name']} (modified: {newest_file.get('time_last_modified', 'Unknown')})")
            logger.info(f"Skipping {len(files) - 1} older file(s) to avoid combining data")

        # Download only the newest file
        downloaded_files = []
        file_name = newest_file["name"]
        local_file_path = local_dir_path / file_name

        if self.download_file(newest_file["server_relative_url"], file_name, str(local_file_path), overwrite=overwrite):
            downloaded_files.append(str(local_file_path))
            logger.info(f"Successfully downloaded newest file: {file_name}")
        else:
            logger.error(f"Failed to download file: {file_name}")

        logger.info(f"Downloaded {len(downloaded_files)} file(s) from {folder_path} (using only newest file)")
        return downloaded_files


def download_from_sharepoint(
    site_url: str,
    folder_path: Optional[str] = None,
    file_path: Optional[str] = None,
    local_dir: str = "reports",
    tenant_id: Optional[str] = None,
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    use_app_authentication: bool = True,
    file_pattern: Optional[str] = None,
    overwrite: bool = True,
    clear_existing: bool = True
) -> List[str]:
    """
    Convenience function to download files from SharePoint using Office365-REST-Python-Client.

    Builds a SharePointDownloader and delegates to its
    download_files_from_folder() method.

    Args:
        site_url: SharePoint site URL
        folder_path: Path to the folder to download from
        file_path: Path to specific file - NOT YET IMPLEMENTED. When given, nothing is
            downloaded and an empty list is returned, even if folder_path is also given.
        local_dir: Local directory to save files
        tenant_id: Azure AD tenant ID (passed through to SharePointDownloader)
        client_id: Azure AD app client ID (required for app authentication)
        client_secret: Azure AD app client secret (required for app authentication)
        use_app_authentication: Use app authentication (default: True)
        file_pattern: Pattern to filter files (e.g., "*.xlsx")
        overwrite: Whether to overwrite existing files
        clear_existing: Passed through to download_files_from_folder() (default: True)

    Returns:
        List of downloaded file paths (typically 1 file - the newest). Empty when
        neither path is given, when file_path is given, or when the download fails.

    Raises:
        ImportError: From SharePointDownloader if office365-rest-python-client is not installed.
        ValueError: From SharePointDownloader if client_id or client_secret is missing.
    """
    # file_path is checked first, so it takes precedence over folder_path.
    if file_path:
        logger.warning("Single file download not yet implemented")
        return []

    if not folder_path:
        logger.error("Either folder_path or file_path must be provided")
        return []

    downloader = SharePointDownloader(
        site_url=site_url,
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
        use_app_authentication=use_app_authentication
    )

    # Download only the newest file from folder
    return downloader.download_files_from_folder(
        folder_path=folder_path,
        local_dir=local_dir,
        file_pattern=file_pattern,
        overwrite=overwrite,
        clear_existing=clear_existing
    )


if __name__ == "__main__":
    # Script entry point: read the 'sharepoint' section of the project
    # configuration, run the download and print the resulting paths.
    import sys
    from config import load_config

    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO
    )

    settings = load_config()
    if not settings.get('sharepoint'):
        logger.error("SharePoint configuration not found")
        sys.exit(1)

    sharepoint_settings = settings['sharepoint']

    # 'site_url' is the only mandatory key; every other key falls back to a
    # default. clear_existing is not read from the configuration, so the
    # function's own default applies.
    download_arguments = {
        "site_url": sharepoint_settings['site_url'],
        "folder_path": sharepoint_settings.get('folder_path'),
        "file_path": sharepoint_settings.get('file_path'),
        "local_dir": sharepoint_settings.get('local_dir', 'reports'),
        "tenant_id": sharepoint_settings.get('tenant_id'),
        "client_id": sharepoint_settings.get('client_id'),
        "client_secret": sharepoint_settings.get('client_secret'),
        "use_app_authentication": sharepoint_settings.get('use_app_authentication', True),
        "file_pattern": sharepoint_settings.get('file_pattern'),
        "overwrite": sharepoint_settings.get('overwrite', True),
    }
    saved_paths = download_from_sharepoint(**download_arguments)

    print(f"Downloaded {len(saved_paths)} file(s)")
    for saved_path in saved_paths:
        print(f"  - {saved_path}")