zulip_bot/app/utils/sync_service.py
2025-05-16 18:00:22 +04:00

217 lines
8.5 KiB
Python

"""
Message synchronization service.
Handles periodic fetching of new messages from Zulip and adds them to ChromaDB.
"""
import os
import time
import logging
import threading
import pickle
from datetime import datetime, timedelta
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
# Process-wide logging setup: emit INFO and above, each record formatted as
# "timestamp - logger name - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger used by everything in this module.
logger = logging.getLogger("sync_service")
class MessageSyncService:
    """Service for synchronizing messages from Zulip to ChromaDB.

    A background thread (see ``start``/``stop``) repeatedly calls
    ``_sync_messages`` and then waits ``sync_interval`` seconds. The sync
    cursor (``last_sync_time`` and ``last_message_id``) is persisted to a
    pickle file in ``state_dir`` so a restart resumes where it left off.
    """

    # File to store the last synced message ID
    _SYNC_STATE_FILE = "sync_state.pickle"

    def __init__(self, sync_interval=60, state_dir=None):
        """
        Initialize the message sync service.

        Args:
            sync_interval (int | float): Seconds to wait between sync runs
                (default: 60).
            state_dir (str): Directory to store the sync state file. When
                omitted, the directory two levels above the directory
                containing this module is used.
        """
        self.sync_interval = sync_interval
        self.is_running = False
        self.sync_thread = None
        self.state_dir = state_dir or os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        self.last_sync_time = None
        self.last_message_id = None
        self.batch_size = 50  # Default batch size
        # Serializes the background loop and manual sync_now() calls so two
        # syncs never process the same batch or write the state file at once.
        self._sync_lock = threading.Lock()

        # Load the last synced state if available
        self._load_sync_state()

    def _set_batch_size(self, batch_size):
        """Set the batch size for syncing messages.

        Non-positive values are rejected with a warning and the current
        ``batch_size`` is left unchanged.
        """
        if batch_size > 0:
            self.batch_size = batch_size
            logger.info(f"Set batch size to {batch_size}")
        else:
            logger.warning(f"Invalid batch size: {batch_size}, using default")

    def _get_state_file_path(self):
        """Get the full path to the sync state file."""
        return os.path.join(self.state_dir, self._SYNC_STATE_FILE)

    def _load_sync_state(self):
        """Load the last sync state from disk.

        A missing file means a fresh start. Any error while reading is logged
        and otherwise ignored, leaving the in-memory cursor as it was.
        """
        try:
            state_file = self._get_state_file_path()
            if not os.path.exists(state_file):
                logger.info("No previous sync state found, starting fresh")
                return

            # SECURITY NOTE: pickle.load executes arbitrary code embedded in
            # the file. This is only safe while state_dir is writable solely
            # by trusted users; the format is kept for compatibility with
            # existing state files.
            with open(state_file, 'rb') as f:
                state = pickle.load(f)

            self.last_sync_time = state.get('last_sync_time')
            self.last_message_id = state.get('last_message_id')
            logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
        except Exception as e:
            logger.exception(f"Error loading sync state: {e}")

    def _save_sync_state(self):
        """Save the current sync state to disk.

        The state is written to a temporary sibling file and then moved over
        the real file with ``os.replace``, so an interrupted write cannot
        leave a truncated state file behind. Errors are logged, not raised.
        """
        state = {
            'last_sync_time': self.last_sync_time,
            'last_message_id': self.last_message_id
        }
        state_file = self._get_state_file_path()
        tmp_file = f"{state_file}.tmp"
        try:
            with open(tmp_file, 'wb') as f:
                pickle.dump(state, f)
            os.replace(tmp_file, state_file)
            logger.info(f"Saved sync state: {state}")
        except Exception as e:
            logger.exception(f"Error saving sync state: {e}")
            # Best-effort cleanup of the partial temp file; the previous
            # state file (if any) is still intact.
            try:
                os.remove(tmp_file)
            except OSError:
                pass

    def _sync_messages(self):
        """
        Sync new messages from Zulip to ChromaDB.

        This method fetches new messages from the Zulip database that haven't
        been synchronized yet and adds them to ChromaDB. It never raises:
        any failure is logged and the next run retries from the last saved
        cursor.
        """
        with self._sync_lock:
            try:
                self._sync_batch()
            except Exception as e:
                logger.exception(f"Error syncing messages: {e}")

    def _fetch_pending_messages(self):
        """Fetch the next batch of messages, by ID cursor or by time window."""
        if self.last_message_id:
            # Get messages with ID greater than the last processed message ID
            return ZulipDatabaseService.get_messages_newer_than_id(
                self.last_message_id, limit=self.batch_size
            )
        # Get messages from IT channels since the last sync time
        return ZulipDatabaseService.get_messages_from_it_channels(
            since=self.last_sync_time,
            limit=self.batch_size
        )

    def _sync_batch(self):
        """Fetch one batch, add unseen messages to ChromaDB, persist the cursor."""
        # Set default sync time if not set yet
        if not self.last_sync_time:
            # Start with messages from the last 7 days if no previous sync
            self.last_sync_time = datetime.now() - timedelta(days=7)

        logger.info(f"Fetching messages since {self.last_sync_time} or ID > {self.last_message_id}")

        messages = self._fetch_pending_messages()
        if not messages:
            logger.info("No new messages found to sync")
            return

        logger.info(f"Found {len(messages)} new messages to sync")

        unique_message_ids = {message.id for message in messages}
        logger.info(f"Found {len(unique_message_ids)} unique message IDs out of {len(messages)} messages")

        synced_count = 0
        already_exists_count = 0
        previous_id = self.last_message_id or 0
        highest_message_id = max(previous_id, max(unique_message_ids))

        for message in messages:
            message_id = message.id

            # Check if this message already exists in ChromaDB to avoid duplicates
            if ChromaDBService.message_exists(message_id):
                already_exists_count += 1
                logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
                continue

            # Only resolve channel/sender for messages that will be added.
            channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
            sender_name = ZulipDatabaseService.get_sender_name_for_message(message)

            success = ChromaDBService.add_message(
                message_id=message_id,
                content=message.content,
                channel_name=channel_name,
                subject=message.subject,
                sender_name=sender_name,
                date_sent=message.date_sent
            )
            if success:
                synced_count += 1
            else:
                # NOTE(review): the cursor below still advances past this
                # message, so it is not retried by later ID-based fetches.
                # Holding the cursor back would instead stall the sync on a
                # message that always fails; confirm which is preferred.
                logger.warning(f"Failed to add message {message_id} to ChromaDB")

        # Update the last sync time and message ID
        self.last_sync_time = datetime.now()
        if highest_message_id > previous_id:
            self.last_message_id = highest_message_id

        # Save the sync state
        self._save_sync_state()

        logger.info(f"Sync completed. Added {synced_count} new messages to ChromaDB. Skipped {already_exists_count} existing messages. Last message ID: {self.last_message_id}")

    def _sleep_until_next_sync(self):
        """Wait ``sync_interval`` seconds, waking at least once per second.

        The short sleeps let ``stop()`` take effect within about a second.
        A monotonic deadline is used so fractional intervals work and the
        total wait does not drift.
        """
        deadline = time.monotonic() + self.sync_interval
        while self.is_running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(1.0, remaining))

    def _sync_loop(self):
        """Main sync loop: sync, wait, repeat until ``is_running`` is cleared."""
        while self.is_running:
            try:
                self._sync_messages()
                self._sleep_until_next_sync()
            except Exception as e:
                logger.exception(f"Error in sync loop: {e}")
                # Sleep a bit before retrying to avoid tight error loops
                time.sleep(5)

    def start(self):
        """Start the message sync service in a daemon thread.

        Calling this while the service is already running only logs a
        warning. If a worker from an earlier ``stop()`` is still finishing,
        this waits for it to exit so two sync loops never run side by side.
        """
        if self.is_running:
            logger.warning("Sync service is already running")
            return

        previous = self.sync_thread
        if (
            previous is not None
            and previous is not threading.current_thread()
            and previous.is_alive()
        ):
            # is_running is False here, so the old worker exits after its
            # current sync; setting the flag before it exits would revive it.
            logger.warning("Previous sync thread has not exited yet; waiting for it to finish")
            previous.join()

        logger.info(f"Starting message sync service with interval {self.sync_interval} seconds")
        self.is_running = True
        self.sync_thread = threading.Thread(target=self._sync_loop)
        self.sync_thread.daemon = True
        self.sync_thread.start()

    def stop(self):
        """Stop the message sync service.

        Clears ``is_running`` and waits up to 10 seconds for the worker
        thread to exit. If it is still busy after that, a warning is logged;
        the thread exits on its own once its current sync finishes.
        """
        if not self.is_running:
            logger.warning("Sync service is not running")
            return

        logger.info("Stopping message sync service")
        self.is_running = False

        worker = self.sync_thread
        # A thread cannot join itself; skip the wait if stop() is called
        # from the sync thread.
        if worker and worker is not threading.current_thread():
            worker.join(timeout=10)
            if worker.is_alive():
                logger.warning("Sync thread did not exit within 10 seconds; it will stop after the current sync finishes")
                return

        logger.info("Sync service stopped")

    def sync_now(self):
        """Manually trigger a sync operation.

        Runs in the calling thread; if the background loop is mid-sync, this
        waits for it to finish first.
        """
        logger.info("Manual sync triggered")
        self._sync_messages()