141 lines
5.8 KiB
Python
Executable File
141 lines
5.8 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
|
|
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
|
|
"""
|
|
import os
|
|
import sys
|
|
import logging
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
|
|
# Configure logging: INFO level on the root logger, with timestamp, logger
# name and level prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Script-specific logger used by main() for start-up and error reporting.
logger = logging.getLogger("compare_all_messages")
|
|
|
|
# Add the directory containing this script to the import path so the `app`
# package can be imported.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Apply NumPy compatibility patch for ChromaDB.
# NOTE(review): the call is placed ahead of the remaining `app` imports,
# presumably so the patch is active before ChromaDB gets imported — keep
# this ordering unless confirmed otherwise.
from app.utils import patch_chromadb_numpy

patch_chromadb_numpy()
|
|
|
|
from app import create_app
|
|
from app.db import get_chroma_collection, get_db_session
|
|
from app.db.zulip_service import ZulipDatabaseService
|
|
from app.models.zulip import Message
|
|
|
|
def _count_zulip_channels(messages):
    """Tally Zulip messages per channel name ("Unknown Channel" when unresolved)."""
    counts = defaultdict(int)
    for message in messages:
        channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
        if channel_name is None:
            channel_name = "Unknown Channel"
        counts[channel_name] += 1
    return counts


def _count_chroma_channels(chroma_result):
    """Tally ChromaDB entries per channel using each entry's 'channel' metadata."""
    counts = defaultdict(int)
    metadatas = chroma_result.get('metadatas') or []
    # Only entries that have a metadata slot are counted; a slot holding None
    # is treated as empty metadata instead of aborting the whole comparison.
    for metadata in metadatas[:len(chroma_result['ids'])]:
        channel = (metadata or {}).get('channel', 'Unknown')
        # Coerce to str so that sorting and slicing of channel names in the
        # report cannot fail on non-string metadata values.
        counts[str(channel)] += 1
    return counts


def _print_channel_comparison(channel_counts, chroma_channel_counts):
    """Print the per-channel table of Zulip vs. ChromaDB counts."""
    print("\nCHANNEL COMPARISON:")
    print("-" * 70)
    print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
    print("-" * 70)

    # NOTE(review): Zulip rows fall back to "Unknown Channel" while ChromaDB
    # rows fall back to "Unknown", so the two fallbacks show up as separate
    # rows — confirm whether that is intended.
    for channel in sorted(set(channel_counts) | set(chroma_channel_counts)):
        zulip_count = channel_counts.get(channel, 0)
        chroma_count = chroma_channel_counts.get(channel, 0)
        diff = zulip_count - chroma_count
        percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0
        print(f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} {diff:<10} {percentage:.2f}%")


def main():
    """Compare every Zulip message with the entries stored in ChromaDB.

    Loads all messages through the Zulip DB session and all entries of the
    ChromaDB collection, then prints to stdout:

    * total / unique counts for both stores,
    * the share of Zulip message IDs that are present in ChromaDB,
    * the number of IDs missing from ChromaDB and extra in ChromaDB,
    * a per-channel count comparison,
    * a recommendation on whether a sync run is needed.

    IDs are compared as strings (``str(message.id)`` against ChromaDB IDs).

    Returns:
        None. Any exception raised during the comparison is printed and
        logged with its traceback rather than propagated.
    """
    logger.info("Starting simple comparison of ALL messages")

    # Create the Flask app (needed for context)
    app = create_app()

    with app.app_context():
        print("\n====================================================")
        print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
        print(f"Started at: {datetime.now()}")
        print("====================================================\n")

        session = None
        try:
            # Get Zulip DB session
            session = get_db_session()

            # Get ALL messages from Zulip
            print("Fetching all messages from Zulip...")
            zulip_messages = session.query(Message).all()
            zulip_ids = {str(msg.id) for msg in zulip_messages}

            # Get channel counts
            channel_counts = _count_zulip_channels(zulip_messages)

            # Print Zulip stats
            print(f"\nZulip has {len(zulip_messages)} total messages across {len(channel_counts)} channels")

            # Get ChromaDB collection
            collection = get_chroma_collection()
            if not collection:
                print("ERROR: Failed to get ChromaDB collection")
                return

            # Get all entries from ChromaDB
            print("Fetching all entries from ChromaDB...")
            chroma_result = collection.get(include=['metadatas'])
            if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
                print("No entries found in ChromaDB")
                return

            # Get unique ChromaDB IDs
            chroma_ids = set(chroma_result['ids'])

            # Get channel counts for ChromaDB
            chroma_channel_counts = _count_chroma_channels(chroma_result)

            # Print ChromaDB stats
            print(f"ChromaDB has {len(chroma_result['ids'])} total entries")
            print(f"ChromaDB has {len(chroma_ids)} unique entries")

            # Calculate missing and extra
            missing_from_chromadb = zulip_ids - chroma_ids
            extra_in_chromadb = chroma_ids - zulip_ids

            # Calculate overall sync percentage from the IDs present in BOTH
            # stores. Dividing the raw ChromaDB count by the Zulip count would
            # let stale/extra ChromaDB entries mask missing messages (and
            # could exceed 100%). With no Zulip messages there is nothing to
            # sync, so that case is reported as fully synced.
            synced_ids = zulip_ids & chroma_ids
            sync_percentage = (len(synced_ids) / len(zulip_ids) * 100) if zulip_ids else 100.0

            # Print comparison results
            print("\n====================================================")
            print("COMPARISON RESULTS")
            print("====================================================")
            print(f"Zulip total messages: {len(zulip_messages)}")
            print(f"ChromaDB total entries: {len(chroma_result['ids'])}")
            print(f"ChromaDB unique entries: {len(chroma_ids)}")
            print(f"Sync percentage: {sync_percentage:.2f}%")
            print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
            print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")

            # Print channel comparison
            _print_channel_comparison(channel_counts, chroma_channel_counts)

            # Print recommendations
            print("\n====================================================")
            print("RECOMMENDATIONS")
            print("====================================================")
            # Decide on the actual set of missing IDs rather than on a
            # rounded floating-point percentage.
            if missing_from_chromadb:
                print("- Run ./sync_all_messages.py to sync missing messages")
            else:
                print("- All messages are synced!")

            print(f"\nComparison completed at: {datetime.now()}")

        except Exception as e:
            print(f"Error during comparison: {e}")
            # logger.exception records the traceback along with the message.
            logger.exception("Error during comparison: %s", e)
        finally:
            # Release the DB session on every exit path (early return, error,
            # success). Assumes a SQLAlchemy-style session exposing close();
            # the getattr guard keeps this a no-op if it does not.
            close_session = getattr(session, "close", None)
            if callable(close_session):
                close_session()
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()