zulip_bot/app/db/chroma_service.py

"""
Service for storing and retrieving embedded messages in ChromaDB.
"""
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
import chromadb
from chromadb.utils import embedding_functions
from app.db import get_chroma_collection
from app.utils.embeddings import EmbeddingService
from app.utils.contextual_retrieval.context_service import ContextService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.config import Config
import logging

# Set up logging
logger = logging.getLogger("chroma_service")

class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """Custom embedding function using our EmbeddingService."""

    def __init__(self, use_nomic: bool = True):
        """
        Initialize the custom embedding function.

        Args:
            use_nomic: Whether to use Nomic (True) or Ollama (False) for embeddings
        """
        self.use_nomic = use_nomic

    def __call__(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of texts to generate embeddings for

        Returns:
            List of embeddings as float arrays
        """
        return EmbeddingService.get_embeddings(texts, use_nomic=self.use_nomic)

class ChromaDBService:
    """Service for storing and retrieving embedded messages in ChromaDB."""

    # Use Ollama embeddings by default for reliability
    _embedding_function = CustomEmbeddingFunction(use_nomic=False)

    @staticmethod
    def format_message_content(content, channel_name, subject, sender_name, date_sent):
        """
        Format message content with metadata but without contextual enrichment.

        Args:
            content (str): Original message content
            channel_name (str): Name of the channel
            subject (str): Subject of the message
            sender_name (str): Name of the sender
            date_sent (datetime): Date the message was sent

        Returns:
            str: Formatted message content with basic metadata
        """
        # Format date in a readable format
        date_str = date_sent.strftime("%Y-%m-%d %H:%M:%S")

        # Replace None values with empty strings
        content = content or ""
        channel_name = channel_name or "Unknown Channel"
        subject = subject or "No Subject"
        sender_name = sender_name or "Unknown Sender"

        # Return plain content with minimal metadata prefix
        return f"Channel: {channel_name} | Subject: {subject} | Sent by: {sender_name} | Date: {date_str}\n\n{content}"

    @staticmethod
    def sanitize_metadata(metadata):
        """
        Sanitize metadata to ensure no None values.

        Args:
            metadata (dict): Metadata dictionary

        Returns:
            dict: Sanitized metadata with no None values
        """
        sanitized = {}
        for key, value in metadata.items():
            if value is None:
                if key == "channel":
                    sanitized[key] = "Unknown Channel"
                elif key == "subject":
                    sanitized[key] = "No Subject"
                elif key == "sender":
                    sanitized[key] = "Unknown Sender"
                elif key == "timestamp":
                    sanitized[key] = datetime.now().isoformat()
                else:
                    sanitized[key] = ""
            else:
                sanitized[key] = value
        return sanitized

    @staticmethod
    def add_message(message_id, content, channel_name, subject, sender_name, date_sent):
        """
        Add a message to the ChromaDB collection with contextual information.

        Args:
            message_id (str): ID of the message
            content (str): Content of the message
            channel_name (str): Name of the channel
            subject (str): Subject of the message
            sender_name (str): Name of the sender
            date_sent (datetime): Date the message was sent

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Check if message already exists to avoid duplicates
            if ChromaDBService.message_exists(message_id):
                logger.info(f"Message ID {message_id} already exists in ChromaDB, skipping")
                return True

            collection = get_chroma_collection()

            # Create metadata and sanitize to prevent None values
            metadata = {
                "channel": channel_name,
                "subject": subject,
                "sender": sender_name,
                "timestamp": date_sent.isoformat() if date_sent else datetime.now().isoformat(),
                "source": "zulip"
            }

            # Sanitize metadata to replace None values
            metadata = ChromaDBService.sanitize_metadata(metadata)

            # Format the content to include structured context information
            formatted_content = ChromaDBService.format_message_content(
                content, channel_name, subject, sender_name, date_sent
            )

            # Generate embeddings using our custom embedding function
            embeddings = ChromaDBService._embedding_function([formatted_content])

            # Add to ChromaDB
            collection.add(
                ids=[str(message_id)],
                documents=[formatted_content],
                metadatas=[metadata],
                embeddings=embeddings if embeddings else None
            )

            # Also add to BM25 index for hybrid search
            BM25Service.add_document(formatted_content, str(message_id))

            logger.info(f"Successfully added message ID {message_id} to ChromaDB")
            return True
        except Exception as e:
            logger.error(f"Error adding message to ChromaDB: {e}")
            return False

    @staticmethod
    def search_similar(query_text, n_results=5, filter_criteria=None, use_hybrid=True, _internal_call=False):
        """
        Search for similar messages in ChromaDB with improved contextual relevance.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_criteria (dict): Metadata filter criteria
            use_hybrid (bool): Whether to use hybrid search or just vector search
            _internal_call (bool): Internal parameter to prevent circular calls

        Returns:
            dict: Search results from ChromaDB
        """
        try:
            logger.info("Using temporary ChromaDB client to prevent duplicate embeddings")
            collection = get_chroma_collection()

            # If hybrid search is disabled or this is an internal call from HybridSearchService,
            # fall back to vector-only search to prevent circular references
            if not use_hybrid or _internal_call:
                try:
                    # Generate query embedding locally instead of using the collection's embedding function
                    query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]

                    # Perform search with embeddings using API directly to prevent collection modifications
                    # Create a temporary read-only client just for search to avoid modifying the main collection
                    temp_client = chromadb.PersistentClient(
                        path=Config.CHROMADB_PATH,
                        settings=chromadb.Settings(
                            anonymized_telemetry=False,
                            is_persistent=True,
                            allow_reset=False
                        )
                    )

                    # Get the existing collection without an embedding function
                    temp_collection = temp_client.get_collection(
                        name=Config.CHROMADB_COLLECTION or "zulip_messages"
                    )

                    # Perform search with embeddings
                    results = temp_collection.query(
                        query_embeddings=[query_embedding],
                        n_results=n_results,
                        where=filter_criteria,
                        include=["metadatas", "documents", "distances"]
                    )

                    # Close temporary client
                    del temp_client

                    return results

                except Exception as e:
                    logger.error(f"Error with vector search: {e}")
                    logger.info("Falling back to direct text query")

                    # Fallback to direct text query if embeddings fail
                    # But use a similar approach with a temporary client
                    try:
                        # Create temporary client just for search
                        temp_client = chromadb.PersistentClient(
                            path=Config.CHROMADB_PATH,
                            settings=chromadb.Settings(
                                anonymized_telemetry=False,
                                is_persistent=True,
                                allow_reset=False
                            )
                        )

                        # Get the existing collection without an embedding function
                        temp_collection = temp_client.get_collection(
                            name=Config.CHROMADB_COLLECTION or "zulip_messages"
                        )

                        # Use CustomEmbeddingFunction for just this query
                        from app.db.chroma_service import CustomEmbeddingFunction
                        embedding_func = CustomEmbeddingFunction(use_nomic=False)

                        # Get embedding for query
                        query_embedding = embedding_func([query_text])[0]

                        # Search using the embedding
                        results = temp_collection.query(
                            query_embeddings=[query_embedding],
                            n_results=n_results,
                            where=filter_criteria,
                            include=["metadatas", "documents", "distances"]
                        )

                        # Close temporary client
                        del temp_client

                        return results

                    except Exception as text_query_error:
                        logger.error(f"Error with text query: {text_query_error}")
                        # Last resort, just get all documents and do a simple text search
                        all_docs = collection.get(where=filter_criteria, include=["metadatas", "documents", "embeddings"])
                        # Return an empty result structure if no docs found
                        if not all_docs or not all_docs.get('ids'):
                            return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
                        return {"ids": [all_docs['ids'][:n_results]],
                                "documents": [all_docs['documents'][:n_results]],
                                "metadatas": [all_docs['metadatas'][:n_results]],
                                "distances": [[1.0] * min(n_results, len(all_docs['ids']))]}

            # Use BM25 + vector search from hybrid search module
            # We're not calling it directly here to avoid circular imports
            try:
                from app.utils.contextual_retrieval.hybrid_search import HybridSearchService

                # Use hybrid search
                results = HybridSearchService.hybrid_search(
                    query=query_text,
                    n_results=n_results,
                    filter_criteria=filter_criteria,
                    rerank=True  # Enable reranking
                )

                # Convert to ChromaDB query result format
                formatted_results = {
                    'ids': [[doc['id'] for doc in results]],
                    'documents': [[doc['content'] for doc in results]],
                    'metadatas': [[doc.get('metadata', {}) for doc in results]],
                    'distances': [[1.0 - doc.get('combined_score', 0) for doc in results]]
                }

                return formatted_results
            except ImportError:
                logger.warning("Hybrid search module not available, falling back to vector search")
                # Fall back to vector search if hybrid search module not available

                # Create temporary client for search
                temp_client = chromadb.PersistentClient(
                    path=Config.CHROMADB_PATH,
                    settings=chromadb.Settings(
                        anonymized_telemetry=False,
                        is_persistent=True,
                        allow_reset=False
                    )
                )

                # Get the existing collection without an embedding function
                temp_collection = temp_client.get_collection(
                    name=Config.CHROMADB_COLLECTION or "zulip_messages"
                )

                # Generate embedding
                query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]

                # Perform search
                results = temp_collection.query(
                    query_embeddings=[query_embedding],
                    n_results=n_results,
                    where=filter_criteria,
                    include=["metadatas", "documents", "distances"]
                )

                # Close temporary client
                del temp_client

                return results
        except Exception as e:
            logger.error(f"Error searching ChromaDB: {e}")
            # Return an empty result set rather than None
            return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}

    @staticmethod
    def delete_message(message_id):
        """
        Delete a message from ChromaDB.

        Args:
            message_id (str): ID of the message to delete

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            collection = get_chroma_collection()
            collection.delete(ids=[str(message_id)])

            # Also update BM25 index - for simplicity, we'll rebuild it from ChromaDB
            # In a production scenario, you might want a more efficient approach
            all_results = collection.get()
            if all_results and all_results['ids']:
                BM25Service.index_documents(all_results['documents'], all_results['ids'])

            return True
        except Exception as e:
            logger.error(f"Error deleting message from ChromaDB: {e}")
            return False

    @staticmethod
    def get_message_by_id(message_id):
        """
        Get a message from ChromaDB by ID.

        Args:
            message_id (str): ID of the message to retrieve

        Returns:
            dict: Message data or None if not found
        """
        try:
            collection = get_chroma_collection()
            result = collection.get(ids=[str(message_id)])

            if result['ids'] and len(result['ids']) > 0:
                return {
                    'id': result['ids'][0],
                    'content': result['documents'][0],
                    'metadata': result['metadatas'][0]
                }
            return None
        except RecursionError:
            logger.error(f"Recursion error when getting message ID {message_id} from ChromaDB")
            return None
        except Exception as e:
            logger.error(f"Error getting message from ChromaDB: {e}")
            return None

    @staticmethod
    def message_exists(message_id):
        """
        Check if a message exists in ChromaDB.

        Args:
            message_id (str): ID of the message to check

        Returns:
            bool: True if exists, False otherwise
        """
        try:
            collection = get_chroma_collection()
            result = collection.get(ids=[str(message_id)], include=[])

            return len(result['ids']) > 0
        except Exception as e:
            logger.error(f"Error checking if message exists in ChromaDB: {e}")
            return False

    @staticmethod
    def switch_embedding_method(use_nomic: bool):
        """
        Switch between Nomic and Ollama embedding methods.

        Args:
            use_nomic: Whether to use Nomic (True) or Ollama (False)
        """
        ChromaDBService._embedding_function = CustomEmbeddingFunction(use_nomic=use_nomic)