# zulip_bot/app/db/integration_service.py
# 2025-05-16 18:00:22 +04:00
#
# 120 lines
# 4.4 KiB
# Python

"""
Database integration service.
Combines functionality from both Zulip and ChromaDB services.
"""
from datetime import datetime
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
class DatabaseIntegrationService:
    """
    Service for integrating between Zulip DB and ChromaDB.

    Handles the synchronization of messages from Zulip to ChromaDB and
    provides a single search entry point over the synced messages.
    """

    @staticmethod
    def sync_messages_to_chromadb(days_ago=30, limit=1000):
        """
        Sync recent messages from Zulip to ChromaDB.

        Messages for which ``ChromaDBService.message_exists`` is truthy are
        skipped; every other message is passed to ``ChromaDBService.add_message``.

        Args:
            days_ago (int): Number of days to look back
            limit (int): Maximum number of messages to sync

        Returns:
            dict: Statistics about the sync operation with the keys
                "total_messages", "new_messages", "already_existing"
                and "failed".
        """
        messages = ZulipDatabaseService.get_messages_from_it_channels(
            days_ago=days_ago, limit=limit
        )
        # Treat "no result" the same as an empty batch instead of failing on len().
        if messages is None:
            messages = []

        stats = {
            "total_messages": len(messages),
            "new_messages": 0,
            "already_existing": 0,
            "failed": 0
        }

        for message in messages:
            # Skip anything that has already been indexed.
            if ChromaDBService.message_exists(message.id):
                stats["already_existing"] += 1
                continue

            channel_name = ZulipDatabaseService.get_channel_name_for_message(message)

            # The sender's display name is not available on the message row;
            # resolving it would require a join with the UserProfile table,
            # so a placeholder built from the sender id is stored instead.
            sender_name = f"User ID: {message.sender_id}"

            success = ChromaDBService.add_message(
                message_id=message.id,
                content=message.content,
                channel_name=channel_name,
                subject=message.subject,
                sender_name=sender_name,
                date_sent=message.date_sent
            )
            stats["new_messages" if success else "failed"] += 1

        return stats

    @staticmethod
    def _format_vector_results(results):
        """
        Flatten the first query batch of a ChromaDB-style result mapping.

        Args:
            results (dict | None): Mapping with an 'ids' entry and optional
                'documents', 'metadatas' and 'distances' entries, each a
                list holding one list per query.

        Returns:
            list: One dict per hit with the keys 'id', 'content', 'metadata'
                and 'score'. A column that is missing, ``None`` or empty
                yields ``None`` for that field on every hit.
        """
        if not results:
            return []

        id_batches = results.get('ids')
        if not id_batches or not id_batches[0]:
            return []
        ids = id_batches[0]

        def first_batch(key):
            # A column may be absent or present with a None value (e.g. when
            # it was not requested from the store); pad with None so every
            # hit still gets all four keys.
            batches = results.get(key)
            column = batches[0] if batches else None
            return column if column is not None else [None] * len(ids)

        return [
            {
                'id': doc_id,
                'content': document,
                'metadata': metadata,
                'score': distance,
            }
            for doc_id, document, metadata, distance in zip(
                ids,
                first_batch('documents'),
                first_batch('metadatas'),
                first_batch('distances'),
            )
        ]

    @staticmethod
    def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
        """
        Search for messages in the knowledge base.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_channel (str): Optional channel name to filter results
            use_hybrid (bool): Whether to use hybrid search or just vector search
            use_reranking (bool): Whether to apply reranking to the results
                (only forwarded on the hybrid path)

        Returns:
            list: List of search results. On the hybrid path this is whatever
                ``HybridSearchService.hybrid_search`` returns; on the vector
                path it is a list of dicts with the keys 'id', 'content',
                'metadata' and 'score' (the raw 'distances' value, or None).
        """
        # Only build a filter when a channel was actually requested.
        filter_criteria = {"channel": filter_channel} if filter_channel else None

        if use_hybrid:
            return HybridSearchService.hybrid_search(
                query=query_text,
                n_results=n_results,
                filter_criteria=filter_criteria,
                rerank=use_reranking
            )

        # Plain vector search; the raw result is reshaped into a flat list.
        raw_results = ChromaDBService.search_similar(
            query_text=query_text,
            n_results=n_results,
            filter_criteria=filter_criteria,
            use_hybrid=False
        )
        return DatabaseIntegrationService._format_vector_results(raw_results)