# zulip_bot/app/db/integration_service.py

"""
Database integration service.
Combines functionality from both Zulip and ChromaDB services.
"""
from datetime import datetime
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService

class DatabaseIntegrationService:
    """
    Service for integrating between Zulip DB and ChromaDB.

    Copies messages fetched through ZulipDatabaseService into ChromaDB via
    ChromaDBService, and exposes a single search entry point that dispatches
    to either HybridSearchService or plain ChromaDB similarity search.
    """

    @staticmethod
    def sync_messages_to_chromadb(days_ago=30, limit=1000):
        """
        Sync recent messages from Zulip to ChromaDB.

        Messages for which ChromaDBService.message_exists() is truthy are
        skipped; every other message is passed to ChromaDBService.add_message().

        Args:
            days_ago (int): Number of days to look back
            limit (int): Maximum number of messages to sync

        Returns:
            dict: Counters with the keys "total_messages", "new_messages",
                "already_existing" and "failed"
        """
        messages = ZulipDatabaseService.get_messages_from_it_channels(
            days_ago=days_ago, limit=limit
        )

        stats = {
            "total_messages": len(messages),
            "new_messages": 0,
            "already_existing": 0,
            "failed": 0
        }

        for message in messages:
            # Skip anything already indexed so re-running the sync does not
            # insert the same message twice.
            if ChromaDBService.message_exists(message.id):
                stats["already_existing"] += 1
                continue

            channel_name = ZulipDatabaseService.get_channel_name_for_message(message)

            # Only the sender's id is used here, so a placeholder label is
            # stored instead of a display name.
            # TODO: resolve the real name (e.g. by joining with the
            # UserProfile table).
            sender_name = f"User ID: {message.sender_id}"

            success = ChromaDBService.add_message(
                message_id=message.id,
                content=message.content,
                channel_name=channel_name,
                subject=message.subject,
                sender_name=sender_name,
                date_sent=message.date_sent
            )

            # add_message signals the outcome through a truthy/falsy return.
            if success:
                stats["new_messages"] += 1
            else:
                stats["failed"] += 1

        return stats

    @staticmethod
    def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
        """
        Search for messages in the knowledge base.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_channel (str): Optional channel name to filter results
            use_hybrid (bool): Whether to use hybrid search or just vector search
            use_reranking (bool): Whether to apply reranking to the results
                (only forwarded on the hybrid path)

        Returns:
            list: Search results. On the hybrid path this is whatever
                HybridSearchService.hybrid_search() returns; on the vector-only
                path it is a list of dicts with the keys 'id', 'content',
                'metadata' and 'score'.
        """
        # An empty or missing channel means "no filter".
        filter_criteria = {"channel": filter_channel} if filter_channel else None

        if use_hybrid:
            return HybridSearchService.hybrid_search(
                query=query_text,
                n_results=n_results,
                filter_criteria=filter_criteria,
                rerank=use_reranking
            )

        results = ChromaDBService.search_similar(
            query_text=query_text,
            n_results=n_results,
            filter_criteria=filter_criteria,
            use_hybrid=False
        )
        return DatabaseIntegrationService._format_vector_results(results)

    @staticmethod
    def _format_vector_results(results):
        """
        Flatten a single-query vector search payload into a list of dicts.

        The payload is indexed as results[key][0][i], i.e. one inner list per
        query with only the first query read. This presumably mirrors the
        ChromaDB query result layout; confirm against
        ChromaDBService.search_similar.

        Args:
            results (dict): Payload with an 'ids' entry and optional
                'documents', 'metadatas' and 'distances' entries. An optional
                entry may be absent or None.

        Returns:
            list: One dict per id with the keys 'id', 'content', 'metadata'
                and 'score'; a value is None when its column was not supplied.
                Empty list when there are no ids.
        """
        if not results:
            return []

        id_groups = results.get('ids')
        if not id_groups or not id_groups[0]:
            return []
        ids = id_groups[0]

        def column(key):
            # A column can be missing entirely or present with a None value;
            # a key-presence test alone would crash on the latter when
            # subscripted.
            groups = results.get(key)
            values = groups[0] if groups else None
            return [None] * len(ids) if values is None else values

        documents = column('documents')
        metadatas = column('metadatas')
        distances = column('distances')

        # Index-based access is kept on purpose: a supplied column that is
        # shorter than the id list still raises IndexError instead of being
        # silently truncated.
        return [
            {
                'id': message_id,
                'content': documents[i],
                'metadata': metadatas[i],
                'score': distances[i]
            }
            for i, message_id in enumerate(ids)
        ]