129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Excel to Tabulated Text Converter
|
|
|
|
Converts Excel files in the reports directory to tabulated text format
|
|
for processing by LLM.
|
|
"""
|
|
|
|
import os
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
from tabulate import tabulate
|
|
|
|
|
|
def excel_to_tabulated_text(excel_path: str, output_file: Optional[str] = None) -> str:
|
|
"""
|
|
Convert Excel file to tabulated text format.
|
|
|
|
Args:
|
|
excel_path: Path to Excel file
|
|
output_file: Optional path to save output. If None, returns string.
|
|
|
|
Returns:
|
|
Tabulated text representation of Excel data
|
|
"""
|
|
try:
|
|
xl_file = pd.ExcelFile(excel_path)
|
|
filename = Path(excel_path).name
|
|
output_lines = []
|
|
|
|
for sheet_name in xl_file.sheet_names:
|
|
# Try reading with header first
|
|
try:
|
|
df = pd.read_excel(xl_file, sheet_name=sheet_name)
|
|
if df.empty:
|
|
# Try without header
|
|
df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None)
|
|
except Exception:
|
|
# Fallback: read without header
|
|
df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None)
|
|
|
|
if df.empty:
|
|
continue
|
|
|
|
# Add file and sheet headers
|
|
output_lines.append(f"FILENAME: {filename}")
|
|
output_lines.append(f"SHEET: {sheet_name}")
|
|
output_lines.append("")
|
|
|
|
# Replace NaN with empty string
|
|
df = df.fillna("")
|
|
|
|
# Use tabulate to format the table nicely
|
|
# Using 'simple' format for clean, readable tables
|
|
# First row as headers if column names exist, otherwise use indices
|
|
if df.columns.tolist() and not all(str(col).startswith('Unnamed') for col in df.columns):
|
|
# Use column names as headers
|
|
table_str = tabulate(df, headers='keys', tablefmt='simple', showindex=False)
|
|
else:
|
|
# No meaningful headers, use first row or no headers
|
|
table_str = tabulate(df, headers='firstrow' if len(df) > 0 else 'keys', tablefmt='simple', showindex=False)
|
|
|
|
output_lines.append(table_str)
|
|
output_lines.append("")
|
|
output_lines.append("=" * 80)
|
|
output_lines.append("")
|
|
|
|
result = "\n".join(output_lines)
|
|
|
|
if output_file:
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
f.write(result)
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
error_msg = f"Error processing {excel_path}: {str(e)}"
|
|
return error_msg
|
|
|
|
|
|
def process_reports_directory(reports_dir: str = "reports") -> str:
|
|
"""
|
|
Process all Excel files in the reports directory.
|
|
|
|
Args:
|
|
reports_dir: Directory containing Excel files
|
|
|
|
Returns:
|
|
Combined tabulated text from all Excel files
|
|
"""
|
|
reports_path = Path(reports_dir)
|
|
if not reports_path.exists():
|
|
return f"Reports directory '{reports_dir}' not found."
|
|
|
|
all_outputs = []
|
|
excel_files = list(reports_path.glob("*.xlsx")) + list(reports_path.glob("*.xls"))
|
|
|
|
if not excel_files:
|
|
return f"No Excel files found in '{reports_dir}' directory."
|
|
|
|
for excel_file in excel_files:
|
|
text_output = excel_to_tabulated_text(str(excel_file))
|
|
all_outputs.append(text_output)
|
|
|
|
return "\n\n".join(all_outputs)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
|
|
# Process all files in reports directory
|
|
if len(sys.argv) > 1:
|
|
# Single file mode
|
|
output = excel_to_tabulated_text(sys.argv[1])
|
|
print(output)
|
|
else:
|
|
# Directory mode
|
|
output = process_reports_directory()
|
|
print(output)
|
|
|
|
# Optionally save to file
|
|
output_path = Path("output")
|
|
output_path.mkdir(exist_ok=True)
|
|
with open(output_path / "excel_data.txt", 'w', encoding='utf-8') as f:
|
|
f.write(output)
|
|
print(f"\nOutput saved to: {output_path / 'excel_data.txt'}")
|
|
|