217 lines
8.5 KiB
Python
217 lines
8.5 KiB
Python
"""
|
|
Message synchronization service.
|
|
Handles periodic fetching of new messages from Zulip and adds them to ChromaDB.
|
|
"""
|
|
import os
|
|
import time
|
|
import logging
|
|
import threading
|
|
import pickle
|
|
from datetime import datetime, timedelta
|
|
|
|
from app.db.zulip_service import ZulipDatabaseService
|
|
from app.db.chroma_service import ChromaDBService
|
|
|
|
# Module-wide logging setup: INFO threshold with timestamped,
# logger-name-prefixed records. The module logger is named "sync_service".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("sync_service")
|
|
|
|
class MessageSyncService:
    """Service for synchronizing messages from Zulip to ChromaDB.

    ``start()`` launches a daemon thread that runs a sync pass every
    ``sync_interval`` seconds until ``stop()`` is called; ``sync_now()`` runs
    a single pass on the calling thread. After each pass that found messages,
    the last sync time and the highest message ID seen are persisted to
    ``sync_state.pickle`` inside ``state_dir`` and reloaded on construction.
    """

    # File to store the last synced message ID
    _SYNC_STATE_FILE = "sync_state.pickle"

    def __init__(self, sync_interval=60, state_dir=None):
        """
        Initialize the message sync service.

        Args:
            sync_interval (int | float): Seconds to wait between sync passes
                (default: 60).
            state_dir (str): Directory to store the sync state file
                (default: three directory levels above this module).
        """
        self.sync_interval = sync_interval
        self.is_running = False
        self.sync_thread = None
        self.state_dir = state_dir or os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        self.last_sync_time = None
        self.last_message_id = None
        self.batch_size = 50  # Default batch size

        # Serializes sync passes: sync_now() can be called while the
        # background thread is in the middle of its own pass.
        self._sync_lock = threading.Lock()

        # Load the last synced state if available
        self._load_sync_state()

    def _set_batch_size(self, batch_size):
        """Set the number of messages fetched per sync pass.

        Non-positive values are rejected and the current batch size is kept.
        """
        if batch_size > 0:
            self.batch_size = batch_size
            logger.info(f"Set batch size to {batch_size}")
        else:
            logger.warning(
                f"Invalid batch size: {batch_size}, keeping {self.batch_size}"
            )

    def _get_state_file_path(self):
        """Get the full path to the sync state file."""
        return os.path.join(self.state_dir, self._SYNC_STATE_FILE)

    def _load_sync_state(self):
        """Load the last sync state from disk.

        Never raises: a missing file means "start fresh"; an unreadable or
        malformed file is logged and the in-memory state is left untouched.
        """
        try:
            state_file = self._get_state_file_path()
            if not os.path.exists(state_file):
                logger.info("No previous sync state found, starting fresh")
                return

            with open(state_file, 'rb') as f:
                # NOTE(security): pickle.load can execute arbitrary code if
                # the state file is attacker-writable; keep state_dir trusted.
                state = pickle.load(f)

            if not isinstance(state, dict):
                logger.error(
                    f"Error loading sync state: unexpected payload type {type(state).__name__}"
                )
                return

            self.last_sync_time = state.get('last_sync_time')
            self.last_message_id = state.get('last_message_id')
            logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
        except Exception as e:
            logger.exception(f"Error loading sync state: {e}")

    def _save_sync_state(self):
        """Save the current sync state to disk.

        The state is written to a temporary sibling file and then moved over
        the real one, so an interrupted write cannot leave a truncated state
        file behind. Never raises; failures are logged.
        """
        state = {
            'last_sync_time': self.last_sync_time,
            'last_message_id': self.last_message_id,
        }
        try:
            state_file = self._get_state_file_path()
            # A caller-supplied state_dir may not exist yet.
            os.makedirs(self.state_dir, exist_ok=True)

            tmp_file = f"{state_file}.tmp"
            with open(tmp_file, 'wb') as f:
                pickle.dump(state, f)
            os.replace(tmp_file, state_file)

            logger.info(f"Saved sync state: {state}")
        except Exception as e:
            logger.exception(f"Error saving sync state: {e}")

    def _sync_messages(self):
        """
        Sync new messages from Zulip to ChromaDB.

        Fetches one batch of messages that have not been synchronized yet and
        adds them to ChromaDB. Passes are serialized with a lock, and any
        exception raised during a pass is logged rather than propagated.
        """
        with self._sync_lock:
            try:
                self._run_sync_pass()
            except Exception as e:
                logger.exception(f"Error syncing messages: {e}")

    def _run_sync_pass(self):
        """Fetch one batch of new messages, add them to ChromaDB, save state."""
        # Start with messages from the last 7 days if no previous sync
        if not self.last_sync_time:
            self.last_sync_time = datetime.now() - timedelta(days=7)

        logger.info(f"Fetching messages since {self.last_sync_time} or ID > {self.last_message_id}")

        if self.last_message_id:
            # Resume from the ID cursor once one has been recorded.
            messages = ZulipDatabaseService.get_messages_newer_than_id(
                self.last_message_id, limit=self.batch_size
            )
        else:
            # No ID cursor yet: fall back to IT-channel messages by time.
            messages = ZulipDatabaseService.get_messages_from_it_channels(
                since=self.last_sync_time,
                limit=self.batch_size,
            )

        if not messages:
            logger.info("No new messages found to sync")
            return

        logger.info(f"Found {len(messages)} new messages to sync")

        message_ids = [message.id for message in messages]
        logger.info(f"Found {len(set(message_ids))} unique message IDs out of {len(messages)} messages")

        synced_count = 0
        already_exists_count = 0

        for message in messages:
            message_id = message.id

            # Check for duplicates first so the channel/sender lookups are
            # only performed for messages that will actually be added.
            if ChromaDBService.message_exists(message_id):
                already_exists_count += 1
                logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
                continue

            success = ChromaDBService.add_message(
                message_id=message_id,
                content=message.content,
                channel_name=ZulipDatabaseService.get_channel_name_for_message(message),
                subject=message.subject,
                sender_name=ZulipDatabaseService.get_sender_name_for_message(message),
                date_sent=message.date_sent,
            )

            if success:
                synced_count += 1
            else:
                logger.warning(f"Failed to add message {message_id} to ChromaDB")

        # NOTE(review): the cursor advances to the highest ID in the batch
        # even when some adds failed, so failed messages are not retried.
        self.last_sync_time = datetime.now()
        highest_message_id = max(message_ids)
        if highest_message_id > (self.last_message_id or 0):
            self.last_message_id = highest_message_id

        self._save_sync_state()

        logger.info(f"Sync completed. Added {synced_count} new messages to ChromaDB. Skipped {already_exists_count} existing messages. Last message ID: {self.last_message_id}")

    def _sync_loop(self):
        """Main sync loop: run a pass, then wait ``sync_interval`` seconds.

        The wait polls ``is_running`` at most one second apart so ``stop()``
        takes effect promptly, and works for fractional intervals.
        """
        while self.is_running:
            try:
                self._sync_messages()

                deadline = time.monotonic() + self.sync_interval
                while self.is_running:
                    remaining = deadline - time.monotonic()
                    if remaining <= 0:
                        break
                    time.sleep(min(1.0, remaining))
            except Exception as e:
                logger.exception(f"Error in sync loop: {e}")
                # Sleep a bit before retrying to avoid tight error loops
                time.sleep(5)

    def start(self):
        """Start the message sync service in a background daemon thread."""
        if self.is_running:
            logger.warning("Sync service is already running")
            return

        logger.info(f"Starting message sync service with interval {self.sync_interval} seconds")
        self.is_running = True
        self.sync_thread = threading.Thread(target=self._sync_loop)
        self.sync_thread.daemon = True
        self.sync_thread.start()

    def stop(self):
        """Stop the message sync service.

        Clears ``is_running`` and waits up to 10 seconds for the sync thread
        to exit. If the thread is still busy after that, a warning is logged
        instead of reporting a clean stop.
        """
        if not self.is_running:
            logger.warning("Sync service is not running")
            return

        logger.info("Stopping message sync service")
        self.is_running = False

        worker = self.sync_thread
        # A thread cannot join itself; skip the wait if stop() is called
        # from inside the sync thread.
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=10)
            if worker.is_alive():
                logger.warning("Sync thread did not stop within 10 seconds")
                return

        logger.info("Sync service stopped")

    def sync_now(self):
        """Manually trigger a sync operation on the calling thread.

        Blocks until any pass already in progress has finished, then runs one
        more pass.
        """
        logger.info("Manual sync triggered")
        self._sync_messages()