120 lines
4.4 KiB
Python
120 lines
4.4 KiB
Python
"""
|
|
Database integration service.
|
|
Combines functionality from both Zulip and ChromaDB services.
|
|
"""
|
|
from datetime import datetime
|
|
from app.db.zulip_service import ZulipDatabaseService
|
|
from app.db.chroma_service import ChromaDBService
|
|
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
|
|
|
|
class DatabaseIntegrationService:
    """
    Service for integrating between Zulip DB and ChromaDB.

    Handles the synchronization of messages from Zulip to ChromaDB and
    provides a single search entry point over the synced messages.
    """

    @staticmethod
    def sync_messages_to_chromadb(days_ago=30, limit=1000):
        """
        Sync recent messages from Zulip to ChromaDB.

        Messages for which ``ChromaDBService.message_exists`` returns a truthy
        value are skipped; every other message is passed to
        ``ChromaDBService.add_message``.

        Args:
            days_ago (int): Number of days to look back
            limit (int): Maximum number of messages to sync

        Returns:
            dict: Statistics about the sync operation, with the keys
                "total_messages", "new_messages", "already_existing"
                and "failed".
        """
        # Get messages from Zulip
        messages = ZulipDatabaseService.get_messages_from_it_channels(
            days_ago=days_ago, limit=limit
        )

        stats = {
            "total_messages": len(messages),
            "new_messages": 0,
            "already_existing": 0,
            "failed": 0,
        }

        for message in messages:
            # Skip messages that are already stored in ChromaDB
            if ChromaDBService.message_exists(message.id):
                stats["already_existing"] += 1
                continue

            channel_name = ZulipDatabaseService.get_channel_name_for_message(message)

            # The sender's display name is not available from the query, so a
            # placeholder built from the sender id is stored instead.
            # In a real implementation, we would join with the UserProfile table
            sender_name = f"User ID: {message.sender_id}"

            success = ChromaDBService.add_message(
                message_id=message.id,
                content=message.content,
                channel_name=channel_name,
                subject=message.subject,
                sender_name=sender_name,
                date_sent=message.date_sent,
            )

            # A falsy return value from add_message is counted as a failure
            if success:
                stats["new_messages"] += 1
            else:
                stats["failed"] += 1

        return stats

    @staticmethod
    def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
        """
        Search for messages in the knowledge base.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_channel (str): Optional channel name to filter results
            use_hybrid (bool): Whether to use hybrid search or just vector search
            use_reranking (bool): Whether to apply reranking to the results.
                Only forwarded to the hybrid search; it has no effect when
                ``use_hybrid`` is False.

        Returns:
            list: List of search results. With ``use_hybrid=True`` this is
                whatever ``HybridSearchService.hybrid_search`` returns,
                unmodified. With ``use_hybrid=False`` it is a list of dicts
                with the keys 'id', 'content', 'metadata' and 'score', where
                'score' is None when the search result carries no distances.
        """
        # Prepare filter criteria
        filter_criteria = None
        if filter_channel:
            filter_criteria = {"channel": filter_channel}

        if use_hybrid:
            # The hybrid search service result is handed back as-is
            return HybridSearchService.hybrid_search(
                query=query_text,
                n_results=n_results,
                filter_criteria=filter_criteria,
                rerank=use_reranking,
            )

        # Use the standard ChromaDB search
        results = ChromaDBService.search_similar(
            query_text=query_text,
            n_results=n_results,
            filter_criteria=filter_criteria,
            use_hybrid=False,
        )

        # Nothing to format when the search produced no hits
        if not results or not results['ids'] or len(results['ids'][0]) == 0:
            return []

        # Only the first row of each field is read (a single query was issued).
        # NOTE(review): presumably one row per query text - confirm against
        # ChromaDBService.search_similar.
        ids = results['ids'][0]
        documents = results['documents'][0]
        metadatas = results['metadatas'][0]

        # The 'distances' key may be absent, or present with a None value;
        # in both cases every score is reported as None instead of failing.
        distances = results['distances'] if 'distances' in results else None
        scores = distances[0] if distances is not None else None

        return [
            {
                'id': message_id,
                'content': documents[i],
                'metadata': metadatas[i],
                'score': scores[i] if scores is not None else None,
            }
            for i, message_id in enumerate(ids)
        ]