vendor_report/excel_to_text.py
2025-11-05 22:40:20 +04:00

129 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Excel to Tabulated Text Converter
Converts Excel files in the reports directory to tabulated text format
for processing by LLM.
"""
import os
import pandas as pd
from pathlib import Path
from typing import List, Optional
from tabulate import tabulate
def excel_to_tabulated_text(excel_path: str, output_file: Optional[str] = None) -> str:
"""
Convert Excel file to tabulated text format.
Args:
excel_path: Path to Excel file
output_file: Optional path to save output. If None, returns string.
Returns:
Tabulated text representation of Excel data
"""
try:
xl_file = pd.ExcelFile(excel_path)
filename = Path(excel_path).name
output_lines = []
for sheet_name in xl_file.sheet_names:
# Try reading with header first
try:
df = pd.read_excel(xl_file, sheet_name=sheet_name)
if df.empty:
# Try without header
df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None)
except Exception:
# Fallback: read without header
df = pd.read_excel(xl_file, sheet_name=sheet_name, header=None)
if df.empty:
continue
# Add file and sheet headers
output_lines.append(f"FILENAME: {filename}")
output_lines.append(f"SHEET: {sheet_name}")
output_lines.append("")
# Replace NaN with empty string
df = df.fillna("")
# Use tabulate to format the table nicely
# Using 'simple' format for clean, readable tables
# First row as headers if column names exist, otherwise use indices
if df.columns.tolist() and not all(str(col).startswith('Unnamed') for col in df.columns):
# Use column names as headers
table_str = tabulate(df, headers='keys', tablefmt='simple', showindex=False)
else:
# No meaningful headers, use first row or no headers
table_str = tabulate(df, headers='firstrow' if len(df) > 0 else 'keys', tablefmt='simple', showindex=False)
output_lines.append(table_str)
output_lines.append("")
output_lines.append("=" * 80)
output_lines.append("")
result = "\n".join(output_lines)
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
f.write(result)
return result
except Exception as e:
error_msg = f"Error processing {excel_path}: {str(e)}"
return error_msg
def process_reports_directory(reports_dir: str = "reports") -> str:
"""
Process all Excel files in the reports directory.
Args:
reports_dir: Directory containing Excel files
Returns:
Combined tabulated text from all Excel files
"""
reports_path = Path(reports_dir)
if not reports_path.exists():
return f"Reports directory '{reports_dir}' not found."
all_outputs = []
excel_files = list(reports_path.glob("*.xlsx")) + list(reports_path.glob("*.xls"))
if not excel_files:
return f"No Excel files found in '{reports_dir}' directory."
for excel_file in excel_files:
text_output = excel_to_tabulated_text(str(excel_file))
all_outputs.append(text_output)
return "\n\n".join(all_outputs)
if __name__ == "__main__":
import sys
# Process all files in reports directory
if len(sys.argv) > 1:
# Single file mode
output = excel_to_tabulated_text(sys.argv[1])
print(output)
else:
# Directory mode
output = process_reports_directory()
print(output)
# Optionally save to file
output_path = Path("output")
output_path.mkdir(exist_ok=True)
with open(output_path / "excel_data.txt", 'w', encoding='utf-8') as f:
f.write(output)
print(f"\nOutput saved to: {output_path / 'excel_data.txt'}")