#!/usr/bin/env python3 """ Excel to Tabulated Text Converter Converts Excel files in the reports directory to tabulated text format for processing by LLM. """ import os import pandas as pd from pathlib import Path from typing import List, Optional from tabulate import tabulate def excel_to_tabulated_text(excel_path: str, output_file: Optional[str] = None) -> str: """ Convert Excel file to tabulated text format. Args: excel_path: Path to Excel file output_file: Optional path to save output. If None, returns string. Returns: Tabulated text representation of Excel data """ try: xl_file = pd.ExcelFile(excel_path) filename = Path(excel_path).name output_lines = [] for sheet_name in xl_file.sheet_names: # Try reading with header first try: df = pd.read_excel(xl_file, sheet_name=sheet_name) if df.empty: # Try without header df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None) except Exception: # Fallback: read without header df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None) if df.empty: continue # Add file and sheet headers output_lines.append(f"FILENAME: {filename}") output_lines.append(f"SHEET: {sheet_name}") output_lines.append("") # Replace NaN with empty string df = df.fillna("") # Use tabulate to format the table nicely # Using 'simple' format for clean, readable tables # First row as headers if column names exist, otherwise use indices if df.columns.tolist() and not all(str(col).startswith('Unnamed') for col in df.columns): # Use column names as headers table_str = tabulate(df, headers='keys', tablefmt='simple', showindex=False) else: # No meaningful headers, use first row or no headers table_str = tabulate(df, headers='firstrow' if len(df) > 0 else 'keys', tablefmt='simple', showindex=False) output_lines.append(table_str) output_lines.append("") output_lines.append("=" * 80) output_lines.append("") result = "\n".join(output_lines) if output_file: with open(output_file, 'w', encoding='utf-8') as f: f.write(result) return result except Exception as e: error_msg = f"Error processing {excel_path}: {str(e)}" return error_msg def process_reports_directory(reports_dir: str = "reports") -> str: """ Process all Excel files in the reports directory. Args: reports_dir: Directory containing Excel files Returns: Combined tabulated text from all Excel files """ reports_path = Path(reports_dir) if not reports_path.exists(): return f"Reports directory '{reports_dir}' not found." all_outputs = [] excel_files = list(reports_path.glob("*.xlsx")) + list(reports_path.glob("*.xls")) if not excel_files: return f"No Excel files found in '{reports_dir}' directory." for excel_file in excel_files: text_output = excel_to_tabulated_text(str(excel_file)) all_outputs.append(text_output) return "\n\n".join(all_outputs) if __name__ == "__main__": import sys # Process all files in reports directory if len(sys.argv) > 1: # Single file mode output = excel_to_tabulated_text(sys.argv[1]) print(output) else: # Directory mode output = process_reports_directory() print(output) # Optionally save to file output_path = Path("output") output_path.mkdir(exist_ok=True) with open(output_path / "excel_data.txt", 'w', encoding='utf-8') as f: f.write(output) print(f"\nOutput saved to: {output_path / 'excel_data.txt'}")