#!/usr/bin/env python
"""
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
"""

import os
import sys
import logging
from collections import defaultdict
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("compare_all_messages")

# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Apply NumPy compatibility patch for ChromaDB.
# NOTE(review): this runs before the remaining app imports, presumably so the
# patch is in place before ChromaDB gets loaded - keep this ordering.
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()

from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message


def main():
    """Compare every Zulip message with the entries stored in ChromaDB.

    Loads all ``Message`` rows through the Zulip DB session, fetches all
    ChromaDB entries (IDs plus metadata) and prints to stdout:

    * overall totals, the sync percentage and the missing/extra counts,
    * a per-channel table of Zulip vs. ChromaDB counts,
    * a recommendation on whether a sync run is needed.

    The sync percentage is the share of Zulip message IDs that are also
    present in ChromaDB; entries that exist only in ChromaDB are reported as
    "extra" and never count towards it.

    Returns:
        None. Any exception raised during the comparison is printed and
        logged (with traceback) instead of being propagated.
    """
    logger.info("Starting simple comparison of ALL messages")

    # Create the Flask app (needed for context)
    app = create_app()

    with app.app_context():
        print("\n====================================================")
        print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
        print(f"Started at: {datetime.now()}")
        print("====================================================\n")

        try:
            # Get Zulip DB session
            session = get_db_session()

            # Get ALL messages from Zulip
            print("Fetching all messages from Zulip...")
            zulip_messages = session.query(Message).all()
            # IDs are compared as strings because ChromaDB IDs are strings.
            zulip_ids = {str(msg.id) for msg in zulip_messages}

            # Get channel counts
            channel_counts = defaultdict(int)
            for message in zulip_messages:
                channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
                if channel_name is None:
                    channel_name = "Unknown Channel"
                channel_counts[channel_name] += 1

            # Print Zulip stats
            print(f"\nZulip has {len(zulip_messages)} total messages across {len(channel_counts)} channels")

            # Get ChromaDB collection
            collection = get_chroma_collection()
            if not collection:
                print("ERROR: Failed to get ChromaDB collection")
                return

            # Get all entries from ChromaDB
            print("Fetching all entries from ChromaDB...")
            chroma_result = collection.get(include=['metadatas'])

            if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
                print("No entries found in ChromaDB")
                return

            # Get unique ChromaDB IDs
            chroma_entry_ids = chroma_result['ids']
            chroma_ids = set(chroma_entry_ids)

            # Get channel counts for ChromaDB. Only entries that have a
            # metadata slot are counted; a slot that is None (entry stored
            # without metadata) is counted under 'Unknown' instead of
            # crashing the whole report.
            chroma_channel_counts = defaultdict(int)
            metadatas = chroma_result.get('metadatas') or []
            for metadata in metadatas[:len(chroma_entry_ids)]:
                channel = (metadata or {}).get('channel', 'Unknown')
                chroma_channel_counts[channel] += 1

            # Print ChromaDB stats
            print(f"ChromaDB has {len(chroma_entry_ids)} total entries")
            print(f"ChromaDB has {len(chroma_ids)} unique entries")

            # Calculate missing and extra
            missing_from_chromadb = zulip_ids - chroma_ids
            extra_in_chromadb = chroma_ids - zulip_ids

            # Calculate overall sync percentage from the IDs present in BOTH
            # stores. Using len(chroma_ids) here would let extra ChromaDB
            # entries mask missing messages (and could exceed 100%).
            synced_ids = zulip_ids & chroma_ids
            sync_percentage = (len(synced_ids) / len(zulip_ids) * 100) if zulip_ids else 0

            # Print comparison results
            print("\n====================================================")
            print("COMPARISON RESULTS")
            print("====================================================")
            print(f"Zulip total messages: {len(zulip_messages)}")
            print(f"ChromaDB total entries: {len(chroma_entry_ids)}")
            print(f"ChromaDB unique entries: {len(chroma_ids)}")
            print(f"Sync percentage: {sync_percentage:.2f}%")
            print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
            print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")

            # Print channel comparison
            print("\nCHANNEL COMPARISON:")
            print("-" * 70)
            print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
            print("-" * 70)

            all_channels = sorted(set(channel_counts) | set(chroma_channel_counts))

            for channel in all_channels:
                zulip_count = channel_counts.get(channel, 0)
                chroma_count = chroma_channel_counts.get(channel, 0)
                diff = zulip_count - chroma_count
                # Per-channel figure is a plain count ratio (no ID matching).
                percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0

                print(f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} {diff:<10} {percentage:.2f}%")

            # Print recommendations
            print("\n====================================================")
            print("RECOMMENDATIONS")
            print("====================================================")

            if sync_percentage < 100:
                print("- Run ./sync_all_messages.py to sync missing messages")
            else:
                print("- All messages are synced!")

            print(f"\nComparison completed at: {datetime.now()}")

        except Exception as e:
            # Top-level boundary of the script: report the failure to the
            # user and keep the traceback in the log for diagnosis.
            print(f"Error during comparison: {e}")
            logger.exception(f"Error during comparison: {e}")


if __name__ == "__main__":
    main()