zulip_bot/compare_all_messages.py
2025-05-16 18:00:22 +04:00

141 lines
5.8 KiB
Python
Executable File

#!/usr/bin/env python
"""
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
"""
import os
import sys
import logging
from collections import defaultdict
from datetime import datetime
# Root logging setup: emit INFO and above, each record prefixed with a
# timestamp, the logger name and the severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Dedicated logger for this script's own diagnostics.
logger = logging.getLogger("compare_all_messages")
# Add the directory containing this script (not the process's working
# directory) to sys.path so the `app` package can be imported.
# Assumes the `app` package lives next to this file — TODO confirm.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB.
# NOTE(review): the patch is called between import statements, presumably so
# it takes effect before the imports below load ChromaDB — confirm before
# reordering or grouping these imports.
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
# Project imports used by main(): app factory, DB/ChromaDB accessors,
# the Zulip service helper and the Message ORM model.
from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message
# Single label for "channel could not be determined" on BOTH sides, so that
# unknown-channel rows line up in the per-channel table.
_UNKNOWN_CHANNEL = "Unknown Channel"


def _count_zulip_channels(zulip_messages):
    """Return a mapping of channel name -> number of Zulip messages in it."""
    counts = defaultdict(int)
    for message in zulip_messages:
        channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
        if channel_name is None:
            channel_name = _UNKNOWN_CHANNEL
        # str() keeps the keys sortable and sliceable for the report table.
        counts[str(channel_name)] += 1
    return counts


def _count_chroma_channels(chroma_result):
    """Return a mapping of channel name -> number of ChromaDB entries.

    The channel is read from each entry's ``channel`` metadata key. Entries
    whose metadata is missing/``None`` or lacks the key are counted under
    the unknown-channel label. Entries beyond the end of the metadata list
    are not counted.
    """
    counts = defaultdict(int)
    metadatas = chroma_result.get("metadatas") or []
    # zip() stops at the shorter sequence, so ids without a metadata slot
    # are skipped rather than raising IndexError.
    for _, metadata in zip(chroma_result["ids"], metadatas):
        # Defensive: a metadata slot may be None; don't call .get on it.
        channel = metadata.get("channel") if metadata else None
        if channel is None:
            channel = _UNKNOWN_CHANNEL
        counts[str(channel)] += 1
    return counts


def _print_channel_table(zulip_counts, chroma_counts):
    """Print the per-channel Zulip vs. ChromaDB count table."""
    print("\nCHANNEL COMPARISON:")
    print("-" * 70)
    print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
    print("-" * 70)
    for channel in sorted(set(zulip_counts) | set(chroma_counts)):
        zulip_count = zulip_counts.get(channel, 0)
        chroma_count = chroma_counts.get(channel, 0)
        diff = zulip_count - chroma_count
        percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0
        print(
            f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} "
            f"{diff:<10} {percentage:.2f}%"
        )


def main():
    """Compare all Zulip messages with ChromaDB entries and print a report.

    Loads every ``Message`` row, fetches all ChromaDB ids/metadatas, and
    prints totals, the sync percentage, missing/extra counts, a per-channel
    table and a recommendation.

    The sync percentage is the share of Zulip message ids that are present
    in ChromaDB (set intersection), so extra ChromaDB entries cannot inflate
    it. The recommendation is driven by whether any Zulip id is missing.

    Returns:
        None. Any exception raised during the comparison is caught, printed
        and logged with its traceback; it is not re-raised.
    """
    logger.info("Starting simple comparison of ALL messages")

    # Create the Flask app (needed for context)
    app = create_app()

    with app.app_context():
        print("\n====================================================")
        print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
        print(f"Started at: {datetime.now()}")
        print("====================================================\n")

        session = None
        try:
            # Get Zulip DB session
            session = get_db_session()

            # Get ALL messages from Zulip
            print("Fetching all messages from Zulip...")
            zulip_messages = session.query(Message).all()
            zulip_ids = {str(msg.id) for msg in zulip_messages}
            channel_counts = _count_zulip_channels(zulip_messages)

            # Print Zulip stats
            print(
                f"\nZulip has {len(zulip_messages)} total messages "
                f"across {len(channel_counts)} channels"
            )

            # Get ChromaDB collection
            collection = get_chroma_collection()
            if not collection:
                print("ERROR: Failed to get ChromaDB collection")
                return

            # Get all entries from ChromaDB
            print("Fetching all entries from ChromaDB...")
            chroma_result = collection.get(include=['metadatas'])
            if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
                print("No entries found in ChromaDB")
                return

            chroma_total = len(chroma_result['ids'])
            chroma_ids = set(chroma_result['ids'])
            chroma_channel_counts = _count_chroma_channels(chroma_result)

            # Print ChromaDB stats
            print(f"ChromaDB has {chroma_total} total entries")
            print(f"ChromaDB has {len(chroma_ids)} unique entries")

            # Calculate missing and extra
            missing_from_chromadb = zulip_ids - chroma_ids
            extra_in_chromadb = chroma_ids - zulip_ids

            # Only ids present on BOTH sides count as synced; dividing the
            # raw ChromaDB id count by the Zulip count would let stale/extra
            # entries mask missing messages. With no Zulip messages nothing
            # can be missing, so report 100%.
            synced_ids = zulip_ids & chroma_ids
            sync_percentage = (
                len(synced_ids) / len(zulip_ids) * 100 if zulip_ids else 100.0
            )

            # Print comparison results
            print("\n====================================================")
            print("COMPARISON RESULTS")
            print("====================================================")
            print(f"Zulip total messages: {len(zulip_messages)}")
            print(f"ChromaDB total entries: {chroma_total}")
            print(f"ChromaDB unique entries: {len(chroma_ids)}")
            print(f"Sync percentage: {sync_percentage:.2f}%")
            print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
            print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")

            _print_channel_table(channel_counts, chroma_channel_counts)

            # Print recommendations
            print("\n====================================================")
            print("RECOMMENDATIONS")
            print("====================================================")
            # Decide on the missing set itself rather than on a rounded or
            # derived percentage.
            if missing_from_chromadb:
                print("- Run ./sync_all_messages.py to sync missing messages")
            else:
                print("- All messages are synced!")

            print(f"\nComparison completed at: {datetime.now()}")
        except Exception as e:
            # Top-level boundary of the script: report and keep the traceback.
            print(f"Error during comparison: {e}")
            logger.exception("Error during comparison: %s", e)
        finally:
            # Release the DB session on every exit path (including the early
            # returns above). Best-effort: a failing close must not hide the
            # report or the original error.
            if session is not None:
                try:
                    session.close()
                except Exception as close_error:
                    logger.warning("Failed to close DB session: %s", close_error)
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()