first commit

ilia.gurielidze 2025-05-16 18:00:22 +04:00
commit 8484b0b882
20833 changed files with 3827911 additions and 0 deletions

.cursorignore Normal file (1 line)

@@ -0,0 +1 @@
venv/

.env Normal file (26 lines)

@@ -0,0 +1,26 @@
# Flask configuration
FLASK_ENV=development
SECRET_KEY=dev-secret-key
# Zulip DB configuration
ZULIP_DB_URI=postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip
# ChromaDB configuration
CHROMA_DB_PATH=./chromadb
# Embedding model configuration
USE_NOMIC_EMBEDDINGS=true
OLLAMA_MODEL=nomic-embed-text
OLLAMA_HOST=http://localhost:11434
# AI model configuration
# OpenAI GPT-4o (new)
OPENAI_API_KEY=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
# Gemini API (legacy)
GEMINI_API_KEY=AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s
# Zulip Bot configuration
ZULIP_BOT_EMAIL=IT_bot-bot@zulip.lci.ge
ZULIP_BOT_API_KEY=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
ZULIP_SITE=https://zulip.lci.ge

README.md Normal file (79 lines)

@@ -0,0 +1,79 @@
# Zulip Bot Service
This is a Zulip bot service that provides AI-powered responses to user queries.
## Running as a Service with PM2
The application can be set up to run as a background service using PM2, which ensures it starts automatically on system boot and restarts if it crashes.
### Prerequisites
1. Node.js and npm installed on your system
2. PM2 installed globally (`npm install -g pm2`)
3. Python 3.11+ and required dependencies
### Installation
1. Make sure all required environment variables are set in your `.env` file:
```
ZULIP_BOT_EMAIL=your-bot@example.com
ZULIP_BOT_API_KEY=your-api-key
ZULIP_SITE=https://your-zulip-instance.com
OPENAI_API_KEY=your-openai-api-key
```
2. Make the setup script executable:
```bash
chmod +x pm2_start.sh
```
3. Run the setup script:
```bash
./pm2_start.sh
```
4. The script will:
- Install PM2 if not already installed
- Start the bot as a background service
- Configure PM2 to run at system startup
- Provide instructions for any required sudo commands
### Managing the Service
- **Check status**: `pm2 status`
- **View logs**: `pm2 logs zulip-bot`
- **Restart**: `pm2 restart zulip-bot`
- **Stop**: `pm2 stop zulip-bot`
- **Start (if stopped)**: `pm2 start zulip-bot`
### Troubleshooting
If the service fails to start:
1. Check logs for errors: `pm2 logs zulip-bot`
2. Ensure all environment variables are properly set
3. Verify that the Flask app works by running it directly: `./run_app.sh`
## Manual Setup
If you prefer to run the bot without PM2:
1. Activate the virtual environment:
```bash
source venv/bin/activate
```
2. Run the Flask app:
```bash
./run_app.sh
```
## Development
For development purposes, you can run the Flask app in debug mode:
```bash
export FLASK_APP=app
export FLASK_DEBUG=1
flask run --port=5100
```

app/__init__.py Normal file (146 lines)

@@ -0,0 +1,146 @@
"""
Main application entry point for the Zulip Bot application.
"""
import os
from flask import Flask, request, jsonify
from app.config import load_config
def create_app(config_name=None):
"""Create and configure the Flask application."""
app = Flask(__name__)
# Load configuration
config = load_config(config_name)
app.config.from_object(config)
# Set DEBUG mode for the app
app.config['DEBUG'] = True
# Override any environment flags to disable safety filters
os.environ['GEMINI_NO_SAFETY'] = 'true'
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
# Initialize database connections
from app.db import init_db
init_db(app)
# Check if we're in the main process or a Flask reloader worker
# When Flask reloads in debug mode, it sets an environment variable
# We only want to start services in the main process to avoid duplication
is_flask_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true'
is_main_process = not os.environ.get('WERKZEUG_RUN_MAIN')
# Only start services in the main process or if --no-reload is used
# This prevents duplicate services when using Flask's debug mode
should_start_services = is_flask_reloader_process or is_main_process
# Initialize message sync service and bot service regardless of process
# but only start them in the appropriate process
from app.utils.sync_service import MessageSyncService
sync_service = MessageSyncService(sync_interval=60) # Sync every 60 seconds
# Store sync_service in app context so it can be accessed elsewhere
app.sync_service = sync_service
# Initialize Zulip bot service
from app.utils.bot_service import ZulipBotService
bot_service = ZulipBotService()
# Store bot_service in app context so it can be accessed elsewhere
app.bot_service = bot_service
# Start the services in a better way (avoiding deprecated before_first_request)
# But only if this is the main process or Flask reloader's main thread
with app.app_context():
# Add logging to help diagnose any issues
app.logger.info(f"App initialization, should_start_services={should_start_services}, "
f"is_main_process={is_main_process}, is_flask_reloader_process={is_flask_reloader_process}")
if should_start_services:
# Start the sync service
app.logger.info("Starting sync service...")
sync_service.start()
# Start the bot service and log the result
app.logger.info("Starting Zulip bot service...")
if bot_service.thread and bot_service.thread.is_alive():
app.logger.info("Bot service is already running, not starting again")
else:
bot_service.start()
app.logger.info("Bot service started successfully")
else:
app.logger.info("Skipping service startup in Flask reloader process")
# Register a shutdown function to stop the services
@app.teardown_appcontext
def stop_services(exception=None):
if hasattr(app, 'sync_service'):
app.sync_service.stop()
if hasattr(app, 'bot_service'):
app.bot_service.stop()
# Register blueprints
# This will be implemented later
@app.route('/health')
def health_check():
"""Simple health check endpoint."""
return jsonify({'status': 'ok'})
@app.route('/sync/now')
def trigger_sync():
"""Trigger an immediate sync."""
if hasattr(app, 'sync_service'):
app.sync_service.sync_now()
return jsonify({'status': 'sync_triggered'})
return jsonify({'status': 'error', 'message': 'Sync service not available'}), 500
@app.route('/bot/status')
def bot_status():
"""Get the status of the bot service."""
if hasattr(app, 'bot_service') and app.bot_service.thread and app.bot_service.thread.is_alive():
return jsonify({'status': 'running'})
return jsonify({'status': 'stopped'})
@app.route('/bot/start', methods=['POST'])
def start_bot():
"""Start the bot service."""
if hasattr(app, 'bot_service'):
app.bot_service.start()
return jsonify({'status': 'started'})
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
@app.route('/bot/stop', methods=['POST'])
def stop_bot():
"""Stop the bot service."""
if hasattr(app, 'bot_service'):
app.bot_service.stop()
return jsonify({'status': 'stopped'})
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
@app.route('/bot/test', methods=['POST'])
def test_bot():
"""Send a test message to verify the bot is working."""
if not hasattr(app, 'bot_service'):
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
data = request.get_json()
if not data or 'recipient' not in data or 'content' not in data:
return jsonify({'status': 'error', 'message': 'Missing required fields: recipient, content'}), 400
result = app.bot_service.send_test_message(data['recipient'], data['content'])
return jsonify({'status': 'sent', 'result': result})
@app.route('/bot/reset-cache', methods=['POST'])
def reset_bot_cache():
"""Reset the bot's message cache to fix issues with message processing."""
if not hasattr(app, 'bot_service'):
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
result = app.bot_service.reset_cache()
return jsonify({'status': 'success', 'message': result})
return app
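For orientation, a minimal sketch of an entry point that could drive this factory; `run.py` is a hypothetical name (the actual `run_app.sh` is not shown in this commit), and port 5100 simply mirrors the development command in README.md:

```python
# run.py -- hypothetical entry point; the real run_app.sh is not part of this diff.
from app import create_app

# create_app() loads config, connects the databases and starts the sync/bot threads.
app = create_app()

if __name__ == "__main__":
    # Port 5100 matches the `flask run --port=5100` example in README.md.
    app.run(host="0.0.0.0", port=5100, debug=app.config.get("DEBUG", False))
```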

app/config/__init__.py Normal file (106 lines)

@@ -0,0 +1,106 @@
"""
Configuration module for the application.
Loads environment variables and provides configuration values.
"""
import os
from dotenv import load_dotenv
# Load environment variables from .env file (if it exists)
load_dotenv()
class Config:
"""Configuration class for the application."""
# Zulip API settings
ZULIP_EMAIL = os.getenv("ZULIP_EMAIL", "IT_bot-bot@zulip.lci.ge")
ZULIP_API_KEY = os.getenv("ZULIP_API_KEY", "ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z")
ZULIP_SITE = os.getenv("ZULIP_SITE", "https://zulip.lci.ge")
# Zulip database settings
ZULIP_DB_HOST = os.getenv("ZULIP_DB_HOST", "zulip.lci.ge")
ZULIP_DB_PORT = os.getenv("ZULIP_DB_PORT", "5432")
ZULIP_DB_NAME = os.getenv("ZULIP_DB_NAME", "zulip")
ZULIP_DB_USER = os.getenv("ZULIP_DB_USER", "zulip")
ZULIP_DB_PASSWORD = os.getenv("ZULIP_DB_PASSWORD", "BlackMoonSky89")
# Database URL
SQLALCHEMY_DATABASE_URI = f"postgresql://{ZULIP_DB_USER}:{ZULIP_DB_PASSWORD}@{ZULIP_DB_HOST}:{ZULIP_DB_PORT}/{ZULIP_DB_NAME}"
# ChromaDB settings
CHROMADB_PATH = os.getenv("CHROMADB_PATH", "./chromadb")
CHROMADB_COLLECTION = os.getenv("CHROMADB_COLLECTION", "zulip_messages")
# Channels to monitor (IT Discussions, IT Knowledge, IT Support)
CHANNELS_TO_MONITOR = [
"IT Discussions",
"IT Knowledge",
"IT Support"
]
# AI model settings
# OpenAI settings (primary)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")
# Gemini API settings (legacy)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s")
# Embedding settings
USE_NOMIC_EMBEDDINGS = os.getenv("USE_NOMIC_EMBEDDINGS", "False").lower() == "true"
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "4sCOTMgEg5rXeXU0XMmPeucSBMl5xd4FMhyV2UDW")
# Ollama settings
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "nomic-embed-text")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
# Flask settings
SECRET_KEY = os.getenv("SECRET_KEY", "your_secret_key_here")
DEBUG = os.getenv("DEBUG", "True").lower() == "true"
# Bot settings
BOT_NAME = "IT_Bot"
BOT_TRIGGER = f"@**{BOT_NAME}**"
# Rate limiting settings
RATE_LIMIT_PERIOD = int(os.getenv("RATE_LIMIT_PERIOD", "60")) # 60 seconds
RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "10")) # 10 requests per period
class DevelopmentConfig(Config):
"""Development configuration."""
DEBUG = True
class ProductionConfig(Config):
"""Production configuration."""
DEBUG = False
class TestingConfig(Config):
"""Testing configuration."""
TESTING = True
# Use a test database
SQLALCHEMY_DATABASE_URI = os.getenv('TEST_SQLALCHEMY_DATABASE_URI', 'postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip_test')
# Use a test ChromaDB path
CHROMADB_PATH = os.getenv('TEST_CHROMADB_PATH', './chromadb_test')
# Configuration dictionary
config_dict = {
'development': DevelopmentConfig,
'production': ProductionConfig,
'testing': TestingConfig,
'default': DevelopmentConfig
}
def load_config(config_name=None):
"""
Load the appropriate configuration based on environment variables or the provided config_name.
Args:
config_name (str, optional): Name of the configuration to load. Defaults to None.
Returns:
Config: Configuration object
"""
if not config_name:
config_name = os.getenv('FLASK_ENV', 'default')
return config_dict.get(config_name, config_dict['default'])
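As a quick illustration of the selection logic above (using only the classes defined in this module), an explicit name overrides the environment and unknown names fall back to the default entry:

```python
import os

from app.config import load_config, DevelopmentConfig, ProductionConfig

# An explicit argument wins outright.
assert load_config("production") is ProductionConfig

# Without an argument, FLASK_ENV decides...
os.environ["FLASK_ENV"] = "development"
assert load_config() is DevelopmentConfig

# ...and anything not listed in config_dict falls back to the 'default' entry.
os.environ["FLASK_ENV"] = "staging"
assert load_config() is DevelopmentConfig
```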

app/db/__init__.py Normal file (105 lines)

@@ -0,0 +1,105 @@
"""
Database module for the application.
Handles connections to PostgreSQL (Zulip DB) and ChromaDB.
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
import chromadb
# SQLAlchemy base class for models
Base = declarative_base()
# Global variables for SQLAlchemy
db_engine = None
db_session = None
# Global variable for ChromaDB
chroma_client = None
chroma_collection = None
def init_db(app):
"""
Initialize database connections.
Args:
app: Flask application object
"""
global db_engine, db_session, chroma_client, chroma_collection
# Initialize SQLAlchemy engine and session
db_engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
db_session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=db_engine))
# Set query property for models
Base.query = db_session.query_property()
# Initialize ChromaDB
try:
# Set allow_reset to True to prevent "Add of existing embedding ID" warnings
chroma_client = chromadb.PersistentClient(
path=app.config['CHROMADB_PATH'],
settings=chromadb.Settings(
allow_reset=True,
anonymized_telemetry=False,
is_persistent=True
)
)
# Import here to avoid circular imports
from app.db.chroma_service import CustomEmbeddingFunction
# Create embedding function with setting from config
try:
# Always use Ollama since it's more reliable
embedding_function = CustomEmbeddingFunction(use_nomic=False)
# Get or create ChromaDB collection for Zulip messages with custom embedding function
chroma_collection = chroma_client.get_or_create_collection(
name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
metadata={
"hnsw:space": "cosine",
"hnsw:allow_replace_deleted": True # Allow replacing deleted vectors
},
embedding_function=embedding_function
)
except Exception as e:
print(f"Error with embedding function: {e}")
print("Creating collection without embedding function")
# Create collection without embedding function
chroma_collection = chroma_client.get_or_create_collection(
name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
metadata={
"hnsw:space": "cosine",
"hnsw:allow_replace_deleted": True # Allow replacing deleted vectors
}
)
except Exception as e:
print(f"Critical error initializing ChromaDB: {e}")
print("ChromaDB functionality will not be available")
chroma_client = None
chroma_collection = None
# Register teardown function to remove database sessions
@app.teardown_appcontext
def shutdown_session(exception=None):
"""Remove the database session at the end of the request."""
db_session.remove()
def get_db_session():
"""
Get the current database session.
Returns:
SQLAlchemy session object
"""
return db_session
def get_chroma_collection():
"""
Get the ChromaDB collection for Zulip messages.
Returns:
ChromaDB collection object
"""
return chroma_collection
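A minimal sketch of how a standalone script could bring these connections up and use the two accessors; it assumes only what this module and app/config define (init_db reads SQLALCHEMY_DATABASE_URI, CHROMADB_PATH and CHROMADB_COLLECTION from the Flask config):

```python
from flask import Flask

from app.config import load_config
from app.db import init_db, get_db_session, get_chroma_collection

# A bare Flask app is enough here: init_db only needs the config values.
app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Both accessors return module-level globals populated by init_db().
session = get_db_session()
collection = get_chroma_collection()  # None if ChromaDB failed to initialise

print("SQLAlchemy session ready:", session is not None)
if collection is not None:
    print("ChromaDB documents:", collection.count())
```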

app/db/chroma_service.py Normal file (418 lines)

@@ -0,0 +1,418 @@
"""
Service for storing and retrieving embedded messages in ChromaDB.
"""
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
import chromadb
from chromadb.utils import embedding_functions
from app.db import get_chroma_collection
from app.utils.embeddings import EmbeddingService
from app.utils.contextual_retrieval.context_service import ContextService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.config import Config
import logging
# Set up logging
logger = logging.getLogger("chroma_service")
class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
"""Custom embedding function using our EmbeddingService."""
def __init__(self, use_nomic: bool = True):
"""
Initialize the custom embedding function.
Args:
use_nomic: Whether to use Nomic (True) or Ollama (False) for embeddings
"""
self.use_nomic = use_nomic
def __call__(self, texts: List[str]) -> List[List[float]]:
"""
Generate embeddings for a list of texts.
Args:
texts: List of texts to generate embeddings for
Returns:
List of embeddings as float arrays
"""
return EmbeddingService.get_embeddings(texts, use_nomic=self.use_nomic)
class ChromaDBService:
"""Service for storing and retrieving embedded messages in ChromaDB."""
# Use Ollama embeddings by default for reliability
_embedding_function = CustomEmbeddingFunction(use_nomic=False)
@staticmethod
def format_message_content(content, channel_name, subject, sender_name, date_sent):
"""
Format message content with metadata but without contextual enrichment.
Args:
content (str): Original message content
channel_name (str): Name of the channel
subject (str): Subject of the message
sender_name (str): Name of the sender
date_sent (datetime): Date the message was sent
Returns:
str: Formatted message content with basic metadata
"""
# Format date in a readable format
date_str = date_sent.strftime("%Y-%m-%d %H:%M:%S")
# Replace None values with empty strings
content = content or ""
channel_name = channel_name or "Unknown Channel"
subject = subject or "No Subject"
sender_name = sender_name or "Unknown Sender"
# Return plain content with minimal metadata prefix
return f"Channel: {channel_name} | Subject: {subject} | Sent by: {sender_name} | Date: {date_str}\n\n{content}"
@staticmethod
def sanitize_metadata(metadata):
"""
Sanitize metadata to ensure no None values.
Args:
metadata (dict): Metadata dictionary
Returns:
dict: Sanitized metadata with no None values
"""
sanitized = {}
for key, value in metadata.items():
if value is None:
if key == "channel":
sanitized[key] = "Unknown Channel"
elif key == "subject":
sanitized[key] = "No Subject"
elif key == "sender":
sanitized[key] = "Unknown Sender"
elif key == "timestamp":
sanitized[key] = datetime.now().isoformat()
else:
sanitized[key] = ""
else:
sanitized[key] = value
return sanitized
@staticmethod
def add_message(message_id, content, channel_name, subject, sender_name, date_sent):
"""
Add a message to the ChromaDB collection with contextual information.
Args:
message_id (str): ID of the message
content (str): Content of the message
channel_name (str): Name of the channel
subject (str): Subject of the message
sender_name (str): Name of the sender
date_sent (datetime): Date the message was sent
Returns:
bool: True if successful, False otherwise
"""
try:
# Check if message already exists to avoid duplicates
if ChromaDBService.message_exists(message_id):
logger.info(f"Message ID {message_id} already exists in ChromaDB, skipping")
return True
collection = get_chroma_collection()
# Create metadata and sanitize to prevent None values
metadata = {
"channel": channel_name,
"subject": subject,
"sender": sender_name,
"timestamp": date_sent.isoformat() if date_sent else datetime.now().isoformat(),
"source": "zulip"
}
# Sanitize metadata to replace None values
metadata = ChromaDBService.sanitize_metadata(metadata)
# Format the content to include structured context information
formatted_content = ChromaDBService.format_message_content(
content, channel_name, subject, sender_name, date_sent
)
# Generate embeddings using our custom embedding function
embeddings = ChromaDBService._embedding_function([formatted_content])
# Add to ChromaDB
collection.add(
ids=[str(message_id)],
documents=[formatted_content],
metadatas=[metadata],
embeddings=embeddings if embeddings else None
)
# Also add to BM25 index for hybrid search
BM25Service.add_document(formatted_content, str(message_id))
logger.info(f"Successfully added message ID {message_id} to ChromaDB")
return True
except Exception as e:
logger.error(f"Error adding message to ChromaDB: {e}")
return False
@staticmethod
def search_similar(query_text, n_results=5, filter_criteria=None, use_hybrid=True, _internal_call=False):
"""
Search for similar messages in ChromaDB with improved contextual relevance.
Args:
query_text (str): Text to search for
n_results (int): Number of results to return
filter_criteria (dict): Metadata filter criteria
use_hybrid (bool): Whether to use hybrid search or just vector search
_internal_call (bool): Internal parameter to prevent circular calls
Returns:
dict: Search results from ChromaDB
"""
try:
logger.info("Using temporary ChromaDB client to prevent duplicate embeddings")
collection = get_chroma_collection()
# If hybrid search is disabled or this is an internal call from HybridSearchService,
# fall back to vector-only search to prevent circular references
if not use_hybrid or _internal_call:
try:
# Generate query embedding locally instead of using the collection's embedding function
query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]
# Perform search with embeddings using API directly to prevent collection modifications
# Create a temporary read-only client just for search to avoid modifying the main collection
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Perform search with embeddings
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as e:
logger.error(f"Error with vector search: {e}")
logger.info("Falling back to direct text query")
# Fallback to direct text query if embeddings fail
# But use a similar approach with a temporary client
try:
# Create temporary client just for search
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Use CustomEmbeddingFunction for just this query
from app.db.chroma_service import CustomEmbeddingFunction
embedding_func = CustomEmbeddingFunction(use_nomic=False)
# Get embedding for query
query_embedding = embedding_func([query_text])[0]
# Search using the embedding
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as text_query_error:
logger.error(f"Error with text query: {text_query_error}")
# Last resort: fetch documents matching the filter and return the first n without ranking
all_docs = collection.get(where=filter_criteria, include=["metadatas", "documents", "embeddings"])
# Return an empty result structure if no docs found
if not all_docs or not all_docs.get('ids'):
return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
return {"ids": [all_docs['ids'][:n_results]],
"documents": [all_docs['documents'][:n_results]],
"metadatas": [all_docs['metadatas'][:n_results]],
"distances": [[1.0] * min(n_results, len(all_docs['ids']))]}
# Use BM25 + vector search from hybrid search module
# We're not calling it directly here to avoid circular imports
try:
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
# Use hybrid search
results = HybridSearchService.hybrid_search(
query=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
rerank=True # Enable reranking
)
# Convert to ChromaDB query result format
formatted_results = {
'ids': [[doc['id'] for doc in results]],
'documents': [[doc['content'] for doc in results]],
'metadatas': [[doc.get('metadata', {}) for doc in results]],
'distances': [[1.0 - doc.get('combined_score', 0) for doc in results]]
}
return formatted_results
except ImportError:
logger.warning("Hybrid search module not available, falling back to vector search")
# Fall back to vector search if hybrid search module not available
# Create temporary client for search
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Generate embedding
query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]
# Perform search
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as e:
logger.error(f"Error searching ChromaDB: {e}")
# Return an empty result set rather than None
return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
@staticmethod
def delete_message(message_id):
"""
Delete a message from ChromaDB.
Args:
message_id (str): ID of the message to delete
Returns:
bool: True if successful, False otherwise
"""
try:
collection = get_chroma_collection()
collection.delete(ids=[str(message_id)])
# Also update BM25 index - for simplicity, we'll rebuild it from ChromaDB
# In a production scenario, you might want a more efficient approach
all_results = collection.get()
if all_results and all_results['ids']:
BM25Service.index_documents(all_results['documents'], all_results['ids'])
return True
except Exception as e:
logger.error(f"Error deleting message from ChromaDB: {e}")
return False
@staticmethod
def get_message_by_id(message_id):
"""
Get a message from ChromaDB by ID.
Args:
message_id (str): ID of the message to retrieve
Returns:
dict: Message data or None if not found
"""
try:
collection = get_chroma_collection()
result = collection.get(ids=[str(message_id)])
if result['ids'] and len(result['ids']) > 0:
return {
'id': result['ids'][0],
'content': result['documents'][0],
'metadata': result['metadatas'][0]
}
return None
except RecursionError:
logger.error(f"Recursion error when getting message ID {message_id} from ChromaDB")
return None
except Exception as e:
logger.error(f"Error getting message from ChromaDB: {e}")
return None
@staticmethod
def message_exists(message_id):
"""
Check if a message exists in ChromaDB.
Args:
message_id (str): ID of the message to check
Returns:
bool: True if exists, False otherwise
"""
try:
collection = get_chroma_collection()
result = collection.get(ids=[str(message_id)], include=[])
return len(result['ids']) > 0
except Exception as e:
logger.error(f"Error checking if message exists in ChromaDB: {e}")
return False
@staticmethod
def switch_embedding_method(use_nomic: bool):
"""
Switch between Nomic and Ollama embedding methods.
Args:
use_nomic: Whether to use Nomic (True) or Ollama (False)
"""
ChromaDBService._embedding_function = CustomEmbeddingFunction(use_nomic=use_nomic)
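An illustrative add/search round trip through this service; the setup mirrors the sketch after app/db/__init__.py so the example stands alone, and the message values are invented:

```python
from datetime import datetime

from flask import Flask

from app.config import load_config
from app.db import init_db
from app.db.chroma_service import ChromaDBService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Index one message; add_message() skips IDs that already exist.
ChromaDBService.add_message(
    message_id=12345,
    content="VPN drops every hour; resetting the adapter helps temporarily.",
    channel_name="IT Support",
    subject="VPN instability",
    sender_name="Example User",
    date_sent=datetime.now(),
)

# Hybrid (BM25 + vector) search is the default; results use ChromaDB's
# nested-list query format.
results = ChromaDBService.search_similar(
    "VPN keeps disconnecting",
    n_results=5,
    filter_criteria={"channel": "IT Support"},
)
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, doc[:80])
```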

@@ -0,0 +1,120 @@
"""
Database integration service.
Combines functionality from both Zulip and ChromaDB services.
"""
from datetime import datetime
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
class DatabaseIntegrationService:
"""
Service for integrating between Zulip DB and ChromaDB.
Handles the synchronization of messages from Zulip to ChromaDB.
"""
@staticmethod
def sync_messages_to_chromadb(days_ago=30, limit=1000):
"""
Sync recent messages from Zulip to ChromaDB.
Args:
days_ago (int): Number of days to look back
limit (int): Maximum number of messages to sync
Returns:
dict: Statistics about the sync operation
"""
# Get messages from Zulip
messages = ZulipDatabaseService.get_messages_from_it_channels(days_ago=days_ago, limit=limit)
stats = {
"total_messages": len(messages),
"new_messages": 0,
"already_existing": 0,
"failed": 0
}
# Process each message
for message in messages:
# Check if message already exists in ChromaDB
if ChromaDBService.message_exists(message.id):
stats["already_existing"] += 1
continue
# Get channel name for the message
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
# Get sender name (we don't have that information readily available from the query)
# In a real implementation, we would join with the UserProfile table
sender_name = f"User ID: {message.sender_id}"
# Add message to ChromaDB
success = ChromaDBService.add_message(
message_id=message.id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
stats["new_messages"] += 1
else:
stats["failed"] += 1
return stats
@staticmethod
def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
"""
Search for messages in the knowledge base using hybrid search.
Args:
query_text (str): Text to search for
n_results (int): Number of results to return
filter_channel (str): Optional channel name to filter results
use_hybrid (bool): Whether to use hybrid search or just vector search
use_reranking (bool): Whether to apply reranking to the results
Returns:
list: List of search results
"""
# Prepare filter criteria
filter_criteria = None
if filter_channel:
filter_criteria = {"channel": filter_channel}
# Decide which search method to use
if use_hybrid:
# Use the hybrid search service
results = HybridSearchService.hybrid_search(
query=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
rerank=use_reranking
)
return results
else:
# Use the standard ChromaDB search
results = ChromaDBService.search_similar(
query_text=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
use_hybrid=False
)
# Format results
formatted_results = []
if results and results['ids'] and len(results['ids'][0]) > 0:
for i in range(len(results['ids'][0])):
formatted_results.append({
'id': results['ids'][0][i],
'content': results['documents'][0][i],
'metadata': results['metadatas'][0][i],
'score': results['distances'][0][i] if 'distances' in results else None
})
return formatted_results
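A usage sketch for this integration layer. Note that the file name for this module is not visible in the diff header above, so the import path app.db.integration_service below is only a placeholder:

```python
from flask import Flask

from app.config import load_config
from app.db import init_db
# Placeholder import path: the real module name is not shown in this diff.
from app.db.integration_service import DatabaseIntegrationService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Pull the last week of IT-channel traffic into ChromaDB.
stats = DatabaseIntegrationService.sync_messages_to_chromadb(days_ago=7, limit=500)
print(stats)  # {'total_messages': ..., 'new_messages': ..., 'already_existing': ..., 'failed': ...}

# Hybrid search with reranking, limited to one channel.
hits = DatabaseIntegrationService.search_knowledge_base(
    "printer offline after Windows update",
    n_results=5,
    filter_channel="IT Support",
)
for hit in hits:
    print(hit["id"], hit.get("metadata", {}).get("subject"))
```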

app/db/zulip_service.py Normal file (209 lines)

@@ -0,0 +1,209 @@
"""
Service for querying messages from the Zulip database.
"""
from datetime import datetime, timedelta
from sqlalchemy import and_, or_
from app.db import get_db_session
from app.models.zulip import Message, Stream, Recipient, UserProfile, IT_RECIPIENT_IDS
class ZulipDatabaseService:
"""Service for querying messages from the Zulip database."""
@staticmethod
def get_messages_from_it_channels(days_ago=None, limit=1000, since=None):
"""
Get recent messages from IT channels.
Args:
days_ago (int): Number of days to look back (optional)
limit (int): Maximum number of messages to return
since (datetime): Get messages after this datetime (optional)
Returns:
list: List of Message objects
"""
session = get_db_session()
# Build the query based on parameters
query = session.query(Message).filter(
Message.recipient_id.in_(IT_RECIPIENT_IDS)
)
# Add date filter if specified
if since:
query = query.filter(Message.date_sent >= since)
elif days_ago:
start_date = datetime.now() - timedelta(days=days_ago)
query = query.filter(Message.date_sent >= start_date)
# Get results
messages = query.order_by(Message.id.desc()).limit(limit).all()
return messages
@staticmethod
def get_messages_newer_than_id(message_id, limit=100):
"""
Get messages with ID greater than the specified ID.
Args:
message_id (int): Get messages with ID greater than this
limit (int): Maximum number of messages to return
Returns:
list: List of Message objects
"""
session = get_db_session()
messages = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.id > message_id
)
).order_by(Message.id.asc()).limit(limit).all()
return messages
@staticmethod
def get_message_by_id(message_id):
"""
Get a specific message by ID.
Args:
message_id (int): ID of the message to retrieve
Returns:
Message: Message object or None if not found
"""
session = get_db_session()
return session.query(Message).filter(Message.id == message_id).first()
@staticmethod
def search_messages(search_term, days_ago=365, limit=100):
"""
Search for messages containing a specific term.
Args:
search_term (str): Term to search for
days_ago (int): Number of days to look back
limit (int): Maximum number of messages to return
Returns:
list: List of Message objects matching the search
"""
session = get_db_session()
start_date = datetime.now() - timedelta(days=days_ago)
# Simple case-insensitive ILIKE search over content and subject
messages = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.date_sent >= start_date,
or_(
Message.content.ilike(f'%{search_term}%'),
Message.subject.ilike(f'%{search_term}%')
)
)
).order_by(Message.date_sent.desc()).limit(limit).all()
return messages
@staticmethod
def get_channel_name_for_message(message):
"""
Get the channel name for a message.
Args:
message (Message): Message object
Returns:
str: Channel name or "Unknown Channel" if not found
"""
session = get_db_session()
try:
if not message or not message.recipient_id:
return "Unknown Channel"
# First, get the recipient to determine type
recipient = session.query(Recipient).filter(
Recipient.id == message.recipient_id
).first()
if not recipient:
return "Unknown Channel"
# Check recipient type (1 = stream, 2 = user, 3 = huddle)
if recipient.type != 1:
# For direct messages or huddles
return "Direct Message" if recipient.type == 2 else "Group Message"
# For stream messages, get the stream name
stream = session.query(Stream).filter(
Stream.recipient_id == message.recipient_id
).first()
# Return the name or a default value
return stream.name if stream and stream.name else "Unknown Channel"
except Exception as e:
# Log the error but don't crash - return a default value
print(f"Error getting channel name for message {message.id if message else 'unknown'}: {e}")
return "Unknown Channel"
@staticmethod
def get_sender_name_for_message(message):
"""
Get the sender name for a message.
Args:
message (Message): Message object
Returns:
str: Sender full name or 'Unknown User' if not found
"""
session = get_db_session()
try:
if not message or not message.sender_id:
return "Unknown User"
user = session.query(UserProfile).filter(
UserProfile.id == message.sender_id
).first()
return user.full_name if user and user.full_name else "Unknown User"
except Exception as e:
# Log the error but don't crash - return a default value
print(f"Error getting sender name for message {message.id if message else 'unknown'}: {e}")
return "Unknown User"
@staticmethod
def count_messages_up_to_id(message_id, since=None):
"""
Count messages with ID less than or equal to the specified ID.
Args:
message_id (int): Count messages with ID <= this
since (datetime): Only count messages after this datetime (optional)
Returns:
int: Count of messages
"""
session = get_db_session()
# Build the query
query = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.id <= message_id
)
)
# Add date filter if specified
if since:
query = query.filter(Message.date_sent >= since)
# Count the messages
count = query.count()
return count
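A short sketch of the query helpers above, assuming the database has been initialised as in the earlier sketches:

```python
from flask import Flask

from app.config import load_config
from app.db import init_db
from app.db.zulip_service import ZulipDatabaseService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Last 24 hours of traffic from the monitored IT channels.
recent = ZulipDatabaseService.get_messages_from_it_channels(days_ago=1, limit=50)
for msg in recent:
    channel = ZulipDatabaseService.get_channel_name_for_message(msg)
    sender = ZulipDatabaseService.get_sender_name_for_message(msg)
    print(f"[{channel}] {sender}: {msg.subject}")

# Incremental polling: fetch anything newer than the oldest message in this batch.
if recent:
    newer = ZulipDatabaseService.get_messages_newer_than_id(recent[-1].id, limit=100)
    print(f"{len(newer)} newer messages")
```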

app/models/__init__.py Normal file (10 lines)

@@ -0,0 +1,10 @@
"""
Models module for the application.
Contains SQLAlchemy model definitions for Zulip database tables.
"""
from app.db import Base
# Import models to make them available through the models module
from app.models.zulip import Recipient, Stream, Message, UserProfile, IT_CHANNELS, IT_RECIPIENT_IDS
# This will be populated as we define models in the next steps

app/models/zulip.py Normal file (96 lines)

@@ -0,0 +1,96 @@
"""
SQLAlchemy models for the Zulip database tables.
"""
from sqlalchemy import Column, Integer, String, Text, Boolean, SmallInteger, DateTime, ForeignKey, BigInteger
from sqlalchemy.orm import relationship
from app.db import Base
class Recipient(Base):
"""
Model for zerver_recipient table in Zulip DB.
Recipients can be of different types (e.g., stream, user, huddle).
"""
__tablename__ = 'zerver_recipient'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
type_id = Column(Integer)
type = Column(SmallInteger) # 1 for stream, 2 for user, 3 for huddle
# Relationships
messages = relationship("Message", back_populates="recipient")
stream = relationship("Stream", back_populates="recipient", uselist=False)
class Stream(Base):
"""
Model for zerver_stream table in Zulip DB.
Represents a Zulip channel (called stream in Zulip terminology).
"""
__tablename__ = 'zerver_stream'
__table_args__ = {'schema': 'zulip'}
id = Column(BigInteger, primary_key=True)
name = Column(String)
date_created = Column(DateTime)
deactivated = Column(Boolean)
description = Column(String)
rendered_description = Column(Text)
invite_only = Column(Boolean)
recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
realm_id = Column(Integer)
# Relationships
recipient = relationship("Recipient", back_populates="stream")
class Message(Base):
"""
Model for zerver_message table in Zulip DB.
Represents a message sent in Zulip.
"""
__tablename__ = 'zerver_message'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
sender_id = Column(Integer, ForeignKey('zulip.zerver_userprofile.id'))
recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
subject = Column(String)
content = Column(Text)
rendered_content = Column(Text)
date_sent = Column(DateTime)
type = Column(SmallInteger) # 1 for stream message, 2 for private message
has_attachment = Column(Boolean)
has_image = Column(Boolean)
has_link = Column(Boolean)
is_channel_message = Column(Boolean)
realm_id = Column(Integer)
# Relationships
sender = relationship("UserProfile", back_populates="messages")
recipient = relationship("Recipient", back_populates="messages")
class UserProfile(Base):
"""
Model for zerver_userprofile table in Zulip DB.
Represents a Zulip user.
"""
__tablename__ = 'zerver_userprofile'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
email = Column(String)
full_name = Column(String)
is_active = Column(Boolean)
realm_id = Column(Integer)
# Relationships
messages = relationship("Message", back_populates="sender")
# Constants for the channels we're monitoring
IT_CHANNELS = {
"IT Discussions": 5, # id = 5, recipient_id = 16
"IT Knowledge": 17, # id = 17, recipient_id = 47
"IT Support": 16 # id = 16, recipient_id = 43
}
# Recipient IDs for the channels we're monitoring
IT_RECIPIENT_IDS = [16, 47, 43]
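To show how these models fit together, a hedged query sketch: a stream message and its channel share a recipient row, so the join goes through recipient_id, and the sender comes from zerver_userprofile (database setup repeated so the snippet stands alone):

```python
from flask import Flask

from app.config import load_config
from app.db import init_db, get_db_session
from app.models.zulip import Message, Stream, UserProfile

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)
session = get_db_session()

rows = (
    session.query(Message.subject, Stream.name, UserProfile.full_name)
    .join(Stream, Stream.recipient_id == Message.recipient_id)   # message -> channel
    .join(UserProfile, UserProfile.id == Message.sender_id)      # message -> sender
    .filter(Stream.name.in_(["IT Discussions", "IT Knowledge", "IT Support"]))
    .order_by(Message.date_sent.desc())
    .limit(10)
    .all()
)
for subject, stream_name, sender in rows:
    print(f"{stream_name} | {subject} | {sender}")
```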

app/utils/__init__.py Normal file (33 lines)

@@ -0,0 +1,33 @@
"""
Utilities module for the application.
Contains helper functions and utilities for the application.
"""
import importlib
import sys
import numpy as np
def patch_chromadb_numpy():
"""
Restore NumPy's np.NaN alias so ChromaDB keeps working under NumPy 2.0.
NumPy 2.0 removed the np.NaN spelling, which ChromaDB's brute_force_index
module still references; this function points np.NaN back at np.nan.
"""
try:
# Get the module where the error occurs
from chromadb.segment.impl.vector import brute_force_index
# Restore the np.NaN alias so the module's existing references keep resolving
if not hasattr(np, 'NaN'):
np.NaN = np.nan
print("NumPy compatibility patch applied for ChromaDB")
return True
except ImportError:
print("Could not patch ChromaDB: module not found")
return False
except Exception as e:
print(f"Error patching ChromaDB: {e}")
return False
# This module will be populated with utility functions in later steps
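A tiny sketch of the intended ordering, mirroring how create_app() applies the patch before touching ChromaDB; ./chromadb is the default CHROMADB_PATH from app/config:

```python
from app.utils import patch_chromadb_numpy

# Make NumPy expose the legacy np.NaN alias before any ChromaDB index code needs it.
patch_chromadb_numpy()

import chromadb

client = chromadb.PersistentClient(path="./chromadb")
print([c.name for c in client.list_collections()])
```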

app/utils/ai_service.py Normal file (372 lines)

@@ -0,0 +1,372 @@
"""
AI service for OpenAI API integration.
This module provides a class for generating responses using the OpenAI API.
It handles authentication, prompt engineering, error handling, and retries.
"""
import os
import time
import logging
import hashlib
import functools
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
from openai import OpenAI
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("ai_service")
# Simple in-memory cache for responses
RESPONSE_CACHE = {}
CACHE_TTL = 3600 # 1 hour in seconds
class OpenAIService:
"""Service for generating responses using the OpenAI API."""
def __init__(self, api_key: Optional[str] = None,
model_name: str = "gpt-4o",
enable_cache: bool = True,
cache_ttl: int = CACHE_TTL,
rate_limit: int = 60): # 60 requests per minute
"""
Initialize the OpenAI service.
Args:
api_key: API key for OpenAI. If None, uses OPENAI_API_KEY environment variable.
model_name: Name of the OpenAI model to use.
enable_cache: Whether to enable response caching.
cache_ttl: Time-to-live for cached responses in seconds.
rate_limit: Maximum number of requests allowed per minute.
"""
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
if not self.api_key:
raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
self.model_name = model_name
self.enable_cache = enable_cache
self.cache_ttl = cache_ttl
self.rate_limit = rate_limit
# Rate limiting state
self.request_timestamps = []
# Configure OpenAI API
self.client = OpenAI(api_key=self.api_key)
logger.info(f"Initialized OpenAIService with model: {model_name}")
def _check_rate_limit(self):
"""
Check if the rate limit has been reached.
Waits if necessary to stay within the rate limit.
"""
current_time = time.time()
# Remove timestamps older than 60 seconds
self.request_timestamps = [ts for ts in self.request_timestamps if current_time - ts < 60]
# Check if we've reached the rate limit
if len(self.request_timestamps) >= self.rate_limit:
# Calculate how long to wait
oldest_timestamp = min(self.request_timestamps)
sleep_time = 60 - (current_time - oldest_timestamp)
if sleep_time > 0:
logger.warning(f"Rate limit reached. Waiting {sleep_time:.2f} seconds...")
time.sleep(sleep_time)
# Add current timestamp to the list
self.request_timestamps.append(time.time())
def _detect_language(self, text: str) -> str:
"""
Detect the language of a text string.
Args:
text: The text to detect the language of.
Returns:
A language code, e.g. 'en' for English, 'ka' for Georgian.
"""
try:
# Use a very small prompt to detect language
if not text:
return 'en' # Default to English for empty text
# Simple language detection using a dedicated small request
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": "You are a language detection service. Respond with only the ISO language code ('en' for English, 'ka' for Georgian, etc.)."},
{"role": "user", "content": f"Detect the language of this text: {text[:100]}"}
],
max_tokens=10,
temperature=0
)
language_code = response.choices[0].message.content.strip().lower()
logger.info(f"Detected language: {language_code}")
# Validate and default to English for any issues
if language_code not in ['en', 'ka']:
return 'en'
return language_code
except Exception as e:
logger.error(f"Error detecting language: {e}")
return 'en' # Default to English on error
def _generate_cache_key(self, query: str, context: List[Dict[str, Any]]) -> str:
"""
Generate a cache key for the query and context.
Args:
query: The query string.
context: The context documents.
Returns:
A string hash key for caching.
"""
# Create a string representation of the context
context_str = ""
for doc in context:
if 'content' in doc:
context_str += doc['content'][:100] # Use just the beginning for performance
# Create a hash of the query and context
key_str = query + context_str
return hashlib.md5(key_str.encode('utf-8')).hexdigest()
def _get_cached_response(self, cache_key: str) -> Optional[str]:
"""
Get a cached response if available and not expired.
Args:
cache_key: The cache key.
Returns:
The cached response, or None if not found or expired.
"""
if not self.enable_cache:
return None
if cache_key in RESPONSE_CACHE:
timestamp, response = RESPONSE_CACHE[cache_key]
# Check if the cache entry has expired
if time.time() - timestamp < self.cache_ttl:
logger.info("Using cached response")
return response
# Remove expired cache entry
del RESPONSE_CACHE[cache_key]
return None
def _cache_response(self, cache_key: str, response: str):
"""
Cache a response.
Args:
cache_key: The cache key.
response: The response to cache.
"""
if not self.enable_cache:
return
RESPONSE_CACHE[cache_key] = (time.time(), response)
# Clean up expired cache entries if cache is getting large
if len(RESPONSE_CACHE) > 1000: # Arbitrary limit
self._cleanup_cache()
def _cleanup_cache(self):
"""Clean up expired cache entries."""
current_time = time.time()
keys_to_delete = []
for key, (timestamp, _) in RESPONSE_CACHE.items():
if current_time - timestamp >= self.cache_ttl:
keys_to_delete.append(key)
for key in keys_to_delete:
del RESPONSE_CACHE[key]
logger.info(f"Cleaned up {len(keys_to_delete)} expired cache entries")
def generate_response(self, query: str, context: List[Dict[str, Any]],
max_retries: int = 3, temperature: float = 0.7) -> str:
"""
Generate a response using the OpenAI API.
Args:
query: The user's query.
context: A list of relevant context documents from ChromaDB.
Each document should be a dict with 'content' and 'metadata' keys.
max_retries: Maximum number of retry attempts for API failures.
temperature: Controls randomness in the response. Lower is more deterministic.
Returns:
The generated response text.
"""
# Check rate limit
self._check_rate_limit()
# Detect language
language = self._detect_language(query)
# Check cache
cache_key = self._generate_cache_key(query, context)
cached_response = self._get_cached_response(cache_key)
if cached_response:
return cached_response
# Construct the messages using the context
messages = self._construct_messages(query, context, language)
# Try to generate response with retries
retry_count = 0
while retry_count <= max_retries:
try:
logger.info(f"Attempting to generate response (attempt {retry_count+1}/{max_retries+1})")
# Generate with OpenAI API
response = self.client.chat.completions.create(
model=self.model_name,
messages=messages,
temperature=temperature,
max_tokens=4096,
top_p=0.8
)
# Extract the response text
response_text = response.choices[0].message.content
# Cache the response
self._cache_response(cache_key, response_text)
# Return the response text
return response_text
except Exception as e:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff
# Log more details about the error
logger.error(f"API call error: {type(e).__name__}: {str(e)}")
if retry_count <= max_retries:
logger.warning(f"API call failed: {str(e)}. Retrying in {wait_time} seconds. (Attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
else:
logger.error(f"API call failed after {max_retries} retries: {str(e)}")
# Return a simple response when all retries fail
return f"I'm here to help with IT questions. However, I'm currently experiencing technical difficulties. Please try again later or contact IT support directly."
def _construct_messages(self, query: str, context: List[Dict[str, Any]], language: str = 'en') -> List[Dict[str, str]]:
"""
Construct message list with the query and context.
Args:
query: The user's query.
context: A list of relevant context documents from ChromaDB.
language: The language code (e.g., 'en', 'ka').
Returns:
List of message dictionaries for the OpenAI API.
"""
# System instruction based on language
if language == 'ka':
system_instruction = """როგორც IT_Bot, თქვენი როლია ორგანიზაციაში IT ტექნიკური დახმარების გაწევა:
## როლი და მიზანი:
- თქვენ ხართ ორგანიზაციის IT დახმარების ბოტი, რომელიც ეხმარება თანამშრომლებს ტექნიკური საკითხების გადაჭრაში.
- გამოიყენეთ მოცემული ისტორიული კონტექსტი ზუსტი და სასარგებლო პასუხების გასაცემად.
- როდესაც კონტექსტი ამბობს რომ რაიმე პრობლემა შეიძლება არსებობდეს, ჩათვალეთ რომ ეს მართლაც პრობლემაა.
## პასუხების მიდგომა:
1. გამოიყენეთ მოცემული კონტექსტი პასუხების შესაქმნელად. თუ კონტექსტში მოცემულია კონკრეტული IT საკითხები და მათი გადაწყვეტა, გამოიყენეთ ეს ინფორმაცია.
2. თუ კონტექსტი შეიცავს ინფორმაციას მსგავსი პრობლემის შესახებ, გააანალიზეთ, როგორ გადაიჭრა ეს პრობლემა წარსულში.
3. მითითებები და ცოდნა მოცემული კონტექსტიდან პრიორიტეტული უნდა იყოს ზოგად ცოდნასთან შედარებით.
4. თუ კითხვა არ უკავშირდება IT თემებს, მიუთითეთ მომხმარებელს, რომ დაუკავშირდეს IT მხარდაჭერას.
5. დეტალური, ნაბიჯ-ნაბიჯ ინსტრუქციები მიაწოდეთ, როცა სთხოვენ ტექნიკური პრობლემის გადაჭრას.
## პასუხის ფორმატი:
- მკაფიო, ზუსტი და კონკრეტული პასუხები გაეცით.
- პასუხები დააფორმატეთ ადვილად წასაკითხად, გამოიყენეთ პუნქტები და ქვესათაურები, როცა საჭიროა.
- მიაწოდეთ კონკრეტული ბრძანებები, კოდის მაგალითები ან ინსტრუქციები, როცა საჭიროა.
- არ გამოიყენოთ [Reference X] ფორმატი პასუხებში - ინფორმაცია პირდაპირ ჩასვით პასუხში წყაროზე მითითების გარეშე."""
else: # Default to English
system_instruction = """As IT_Bot, your role is to provide technical IT support within the organization:
## Role and Purpose:
- You are an IT support bot for the organization, helping employees resolve technical issues.
- Use the provided historical context to give accurate and helpful responses.
- When context mentions that there may be an issue with something, assume there is an issue.
## Response Approach:
1. Use the provided context to craft your answers. If the context contains specific IT issues and resolutions, use that information.
2. If the context contains information about similar problems, analyze how the problem was resolved in the past.
3. Guidance and knowledge from the provided context should take precedence over general knowledge.
4. If a question is unrelated to IT topics, direct the user to contact IT support.
5. Provide detailed, step-by-step instructions when asked about resolving a technical issue.
## Response Format:
- Respond with clear, precise, and specific answers.
- Format answers for easy reading, using bullet points and subheadings when appropriate.
- Provide specific commands, code examples, or instructions when relevant.
- IMPORTANT: DO NOT use reference numbers like [Reference X] in your responses. Instead, directly incorporate the relevant information into your answer without citing sources."""
# Process the context data
context_text = ""
if context:
# Context documents are assumed to arrive already ordered by relevance
context_text = "Reference information from IT knowledge base:\n\n"
for i, doc in enumerate(context):
if 'content' in doc:
# Create a more structured reference entry
content = doc['content']
# Build a descriptive reference header with metadata
ref_details = []
if 'metadata' in doc and doc['metadata']:
metadata = doc['metadata']
if 'subject' in metadata and metadata['subject']:
ref_details.append(f"Topic: {metadata['subject']}")
if 'channel' in metadata and metadata['channel']:
ref_details.append(f"Channel: {metadata['channel']}")
if 'sender' in metadata and metadata['sender']:
ref_details.append(f"From: {metadata['sender']}")
if 'timestamp' in metadata and metadata['timestamp']:
try:
# Try to format the timestamp in a more readable way
date_str = metadata['timestamp'][:10] # Just use the date part
ref_details.append(f"Date: {date_str}")
except:
pass
# Create a detailed reference header with all the metadata
ref_header = f"Context {i+1}"
if ref_details:
ref_header += f": {' | '.join(ref_details)}"
# Format each reference entry
context_text += f"[{ref_header}]\n{content}\n\n"
# Create messages array for the chat completions API
messages = [
{"role": "system", "content": system_instruction}
]
# Add context as a separate message from the system if available
if context_text:
messages.append({"role": "system", "content": context_text})
# Add the user query
messages.append({"role": "user", "content": query})
return messages
# For backwards compatibility, provide GeminiService as an alias for OpenAIService
GeminiService = OpenAIService
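A usage sketch for this service, assuming OPENAI_API_KEY is set in the environment; the context list follows the 'content'/'metadata' shape that _construct_messages reads, with invented example values:

```python
from app.utils.ai_service import OpenAIService

service = OpenAIService()  # reads OPENAI_API_KEY from the environment

# Context documents in the shape produced by the ChromaDB search layer.
context = [
    {
        "content": "Resetting the VPN adapter fixed the repeated drops reported last month.",
        "metadata": {
            "subject": "VPN instability",
            "channel": "IT Support",
            "sender": "Example User",
            "timestamp": "2025-05-01T10:15:00",
        },
    }
]

answer = service.generate_response(
    "My VPN keeps disconnecting every hour, what should I try?",
    context,
    temperature=0.3,
)
print(answer)
```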

app/utils/bot_service.py Normal file (402 lines)

@@ -0,0 +1,402 @@
"""
Zulip bot service for handling interactions with Zulip.
"""
import os
import re
import logging
import threading
import time
import hashlib
import tempfile
from typing import Optional, List, Dict, Any
import zulip
from app.db.chroma_service import ChromaDBService
from app.utils.ai_service import GeminiService
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("bot_service")
class ZulipBotService:
"""Service for handling Zulip bot interactions."""
# Singleton instance
_instance = None
_lock = threading.Lock()
_process_id = os.getpid() # Store the process ID when this module is loaded
def __new__(cls, *args, **kwargs):
with cls._lock:
current_pid = os.getpid()
if cls._instance is None or cls._process_id != current_pid:
logger.info(f"Creating new ZulipBotService singleton instance for process {current_pid}")
cls._instance = super(ZulipBotService, cls).__new__(cls)
cls._instance._initialized = False
cls._process_id = current_pid # Update the stored process ID
return cls._instance
def __init__(self,
email: Optional[str] = None,
api_key: Optional[str] = None,
site: Optional[str] = None,
chroma_service: Optional[ChromaDBService] = None,
ai_service: Optional[GeminiService] = None):
"""Initialize the Zulip bot service."""
with self._lock:
# Skip initialization if already initialized (singleton pattern)
if self._initialized:
return
# Load config from environment variables if not provided
self.email = email or os.getenv("ZULIP_BOT_EMAIL")
self.api_key = api_key or os.getenv("ZULIP_BOT_API_KEY")
self.site = site or os.getenv("ZULIP_SITE")
if not all([self.email, self.api_key, self.site]):
raise ValueError("Missing Zulip configuration. Set ZULIP_BOT_EMAIL, ZULIP_BOT_API_KEY, and ZULIP_SITE env variables.")
# Initialize Zulip client
self.client = zulip.Client(
email=self.email,
api_key=self.api_key,
site=self.site
)
# Initialize services
self.chroma_service = chroma_service or ChromaDBService()
self.ai_service = ai_service or GeminiService()
# Thread for message handling
self.thread = None
self.running = False
# Simple set to track processed message IDs
self.processed_message_ids = set()
# Bot identification pattern - exact match for IT_Bot mention in Zulip format
self.bot_mention_pattern = re.compile(r'@\*\*IT_Bot\*\*')
# Default response for empty queries
self.default_response = "Hello. If you have a technical question, please ask. If you require assistance with non-technical matters, please contact IT support."
# Track backoff state for rate limiting
self._backoff_time = 1 # Start with 1 second backoff
self._consecutive_rate_limit_errors = 0
self._max_backoff_time = 60 # Maximum backoff of 60 seconds
# Mark as initialized
self._initialized = True
logger.info("Initialized ZulipBotService")
def start(self):
"""Start the bot service in a separate thread."""
with self._lock:
if self.thread and self.thread.is_alive():
logger.warning("Bot service is already running")
return
self.running = True
self.thread = threading.Thread(target=self._message_loop)
self.thread.daemon = True
self.thread.start()
logger.info("Started ZulipBotService")
def stop(self):
"""Stop the bot service."""
with self._lock:
if not self.thread or not self.thread.is_alive():
logger.warning("Bot service is not running")
return
self.running = False
self.thread.join(timeout=5.0)
logger.info("Stopped ZulipBotService")
def _message_loop(self):
"""Main message handling loop."""
# How far back to check for mentions (in seconds)
# Default to 60 seconds, but can be adjusted
lookback_period = 60
while self.running:
try:
# Get messages that mention the bot
new_messages = self._check_for_mentions(lookback_period)
# Process new messages
for message in new_messages:
self._process_message(message)
# Add a small delay between processing messages
time.sleep(0.5)
# Clean up old processed message IDs periodically
if len(self.processed_message_ids) > 1000:
self.processed_message_ids = set(list(self.processed_message_ids)[-1000:])
# Wait before checking again (reduces API usage)
time.sleep(5.0)
except Exception as e:
logger.error(f"Error in message loop: {str(e)}")
# Apply backoff on errors to avoid hammering the API
if "API usage exceeded rate limit" in str(e):
self._consecutive_rate_limit_errors += 1
backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
logger.info(f"Rate limit hit, backing off for {backoff_time} seconds")
time.sleep(backoff_time)
self._backoff_time = backoff_time
else:
# For other errors, just wait a bit
time.sleep(3)
def _check_for_mentions(self, lookback_period):
"""
Check for new messages that mention the bot.
Args:
lookback_period: How far back to check for mentions (in seconds)
Returns:
List of messages that mention the bot
"""
# Calculate the timestamp for the lookback period
lookback_timestamp = int(time.time() - lookback_period)
try:
# If we've had rate limit errors, apply backoff
if self._consecutive_rate_limit_errors > 0:
backoff_delay = min(self._backoff_time, self._max_backoff_time)
logger.info(f"Rate limit backoff: waiting {backoff_delay} seconds before API call")
time.sleep(backoff_delay)
# Get all messages that mention the bot
# Use the request endpoint for more control
request = {
"anchor": "newest",
"num_before": 100,
"num_after": 0,
"narrow": [
{"operator": "is", "operand": "mentioned"},
{"operator": "streams", "operand": "public"}
],
"client_gravatar": False,
"apply_markdown": False
}
result = self.client.get_messages(request)
# Reset backoff if request was successful
if result.get("result") == "success":
if self._consecutive_rate_limit_errors > 0:
logger.info("Successful API call, resetting rate limit backoff")
self._consecutive_rate_limit_errors = 0
self._backoff_time = 1
else:
logger.error(f"Failed to get messages: {result.get('msg', 'Unknown error')}")
return []
# Filter messages
new_messages = []
for message in result.get("messages", []):
# Skip if we've already processed this message
if message["id"] in self.processed_message_ids:
continue
# Skip messages sent before the lookback window
if message.get("timestamp", 0) < lookback_timestamp:
continue
# Skip messages from the bot itself
if message.get("sender_email") == self.email:
continue
# Check if the bot is actually mentioned in the content
if self.bot_mention_pattern.search(message.get("content", "")):
# Add to processed set and new message list
self.processed_message_ids.add(message["id"])
new_messages.append(message)
if new_messages:
logger.info(f"Found {len(new_messages)} new mention(s) of the bot")
return new_messages
except Exception as e:
if "API usage exceeded rate limit" in str(e):
self._consecutive_rate_limit_errors += 1
self._backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
logger.error(f"Error checking for mentions: {str(e)} (backoff: {self._backoff_time}s)")
else:
logger.error(f"Error checking for mentions: {str(e)}")
return []
def _process_message(self, message):
"""
Process a message and send a response.
Args:
message: The message to process.
"""
try:
# Extract content
content = message.get("content", "")
# Log detailed information
logger.info(f"Processing message ID: {message.get('id')}")
# Extract user query (remove the bot mention)
query = self.bot_mention_pattern.sub("", content).strip()
# Log the incoming message
logger.info(f"Extracted query: {query[:50]}...")
# If query is empty, provide the default response
if not query:
logger.info(f"Empty query received, sending default response")
self._send_response(message, self.default_response)
return
# Retrieve relevant context from ChromaDB
context = self._retrieve_context(query)
# Generate response using the AI service
response_text = self.ai_service.generate_response(query, context)
# Send the response
self._send_response(message, response_text)
except Exception as e:
logger.error(f"Error processing message: {str(e)}")
self._send_response(message,
"I apologize, but I encountered an error while processing your request. "
"Please try again or contact the IT support team if the issue persists.")
def _retrieve_context(self, query, n_results=40):
"""
Retrieve relevant context from ChromaDB with enhanced relevance.
Args:
query: The user's query.
n_results: Number of results to retrieve.
Returns:
A list of relevant context documents.
"""
try:
# Search for similar documents in ChromaDB
search_results = self.chroma_service.search_similar(query, n_results=n_results)
if not search_results:
logger.warning(f"No context found for query: {query[:50]}...")
return []
# Extract documents and metadata
documents = []
# Check if there are documents in the results
if search_results.get("documents") and len(search_results.get("documents", [])) > 0:
# Get the documents and their metadata
docs = search_results.get("documents", [[]])[0]
metas = search_results.get("metadatas", [[]])[0]
# Record each document's rank position so it can be stored as relevance metadata
for i, (doc, metadata) in enumerate(zip(docs, metas)):
# Create a document with its metadata
if isinstance(doc, list) and len(doc) > 0:
doc = doc[0] # Handle nested lists
# Include relevance position in metadata
if metadata:
metadata["relevance_position"] = i + 1
# Store document with enhanced metadata
documents.append({
"content": doc,
"metadata": metadata,
})
logger.info(f"Retrieved {len(documents)} context documents for query: {query[:30]}...")
return documents
except Exception as e:
logger.error(f"Error retrieving context: {str(e)}")
return []
def _send_response(self, original_message, response_text):
"""
Send a response to a message.
Args:
original_message: The original message being responded to.
response_text: The text of the response to send.
"""
try:
message_type = original_message.get("type")
if message_type == "stream":
# For stream messages, respond in the same stream and topic
response = {
"type": "stream",
"to": original_message.get("display_recipient"),
"subject": original_message.get("subject"),
"content": response_text
}
else:
# For private messages, respond to the sender
response = {
"type": "private",
"to": [original_message.get("sender_email")],
"content": response_text
}
result = self.client.send_message(response)
if result.get("result") != "success":
error_msg = result.get("msg", "Unknown error")
logger.error(f"Failed to send response: {error_msg}")
else:
logger.info(f"Sent response to message: {original_message.get('id')}")
except Exception as e:
logger.error(f"Error sending response: {str(e)}")
def send_test_message(self, recipient, content):
"""
Send a test message to verify the bot is working.
Args:
recipient: The recipient of the message (email for private, channel name for stream).
content: The content of the message.
Returns:
The result of the API call.
"""
if "@" in recipient:
# Private message
message = {
"type": "private",
"to": [recipient],
"content": content
}
else:
# Stream message
message = {
"type": "stream",
"to": recipient,
"subject": "Bot Test",
"content": content
}
result = self.client.send_message(message)
logger.info(f"Sent test message to {recipient}, result: {result.get('result')}")
return result
def reset_cache(self):
"""Reset message cache."""
with self._lock:
logger.info("Resetting message caches")
self.processed_message_ids = set()
return "Message cache reset successfully"

View File

@ -0,0 +1,8 @@
"""
Contextual Retrieval package for enhancing RAG systems.
This package implements advanced retrieval techniques based on Anthropic's Contextual Retrieval:
- Contextual Embeddings: Adding rich context to chunks before embedding
- Contextual BM25: Using BM25 for exact matching with context-enhanced chunks
- Reranking: Further improving results by reranking retrieved chunks
"""

View File

@ -0,0 +1,181 @@
"""
BM25 Service for exact keyword matching in retrieval.
This service implements the BM25 algorithm for better lexical search,
complementing the semantic search provided by vector embeddings.
"""
import os
import pickle
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
from rank_bm25 import BM25Okapi
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download NLTK resources
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt', quiet=True)
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords', quiet=True)
class BM25Service:
"""Service for BM25-based search."""
# BM25 index and corpus
_bm25 = None
_corpus = []
_doc_ids = []
_index_path = os.path.join("chromadb", "bm25_index.pkl")
@staticmethod
def preprocess_text(text: str) -> List[str]:
"""
Preprocess text for BM25 indexing.
Args:
text (str): Text to preprocess
Returns:
List[str]: List of preprocessed tokens
"""
# Convert to lowercase
text = text.lower()
# Remove special characters and digits
text = re.sub(r'[^\w\s]', ' ', text)
text = re.sub(r'\d+', ' ', text)
# Tokenize
tokens = word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
return tokens
@staticmethod
def index_documents(documents: List[str], doc_ids: List[str]) -> None:
"""
Create a BM25 index for a list of documents.
Args:
documents (List[str]): List of document contents
doc_ids (List[str]): List of document IDs
"""
# Preprocess documents
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in documents]
# Create BM25 index
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
BM25Service._corpus = documents
BM25Service._doc_ids = doc_ids
# Save index to disk
BM25Service.save_index()
@staticmethod
def add_document(document: str, doc_id: str) -> None:
"""
Add a single document to the BM25 index.
Args:
document (str): Document content
doc_id (str): Document ID
"""
# Create index if it doesn't exist
if BM25Service._bm25 is None:
BM25Service.load_index()
if BM25Service._bm25 is None:
BM25Service.index_documents([document], [doc_id])
return
# Add document to corpus
BM25Service._corpus.append(document)
BM25Service._doc_ids.append(doc_id)
# Preprocess document
tokenized_doc = BM25Service.preprocess_text(document)
# Rebuild index
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in BM25Service._corpus]
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
# Save index to disk
BM25Service.save_index()
@staticmethod
def search(query: str, top_k: int = 5) -> List[Tuple[str, float]]:
"""
Search for documents using BM25.
Args:
query (str): Query text
top_k (int): Number of results to return
Returns:
List[Tuple[str, float]]: List of (doc_id, score) tuples
"""
# Load index if it doesn't exist
if BM25Service._bm25 is None:
BM25Service.load_index()
if BM25Service._bm25 is None:
return []
# Preprocess query
tokenized_query = BM25Service.preprocess_text(query)
# Get scores
scores = BM25Service._bm25.get_scores(tokenized_query)
# Get top-k documents
top_indices = np.argsort(scores)[::-1][:top_k]
# Return (doc_id, score) pairs
results = []
for idx in top_indices:
if idx < len(BM25Service._doc_ids):
results.append((BM25Service._doc_ids[idx], scores[idx]))
return results
@staticmethod
def save_index() -> None:
"""Save BM25 index to disk."""
try:
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(BM25Service._index_path), exist_ok=True)
# Save index
with open(BM25Service._index_path, 'wb') as f:
pickle.dump({
'bm25': BM25Service._bm25,
'corpus': BM25Service._corpus,
'doc_ids': BM25Service._doc_ids
}, f)
except Exception as e:
print(f"Error saving BM25 index: {e}")
@staticmethod
def load_index() -> None:
"""Load BM25 index from disk."""
try:
if os.path.exists(BM25Service._index_path):
with open(BM25Service._index_path, 'rb') as f:
data = pickle.load(f)
BM25Service._bm25 = data.get('bm25')
BM25Service._corpus = data.get('corpus', [])
BM25Service._doc_ids = data.get('doc_ids', [])
except Exception as e:
print(f"Error loading BM25 index: {e}")
# Initialize with empty index
BM25Service._bm25 = None
BM25Service._corpus = []
BM25Service._doc_ids = []
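
# --- Usage sketch (illustrative only) ---
# A minimal example of indexing a couple of documents and querying them; the texts and
# IDs are made up, and index_documents() persists the index under chromadb/ as a side effect.
if __name__ == "__main__":
    BM25Service.index_documents(
        ["How to reset a forgotten VPN password", "Printer drivers for the third floor"],
        ["msg-101", "msg-102"],
    )
    for doc_id, score in BM25Service.search("vpn password reset", top_k=2):
        print(doc_id, round(float(score), 3))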

View File

@ -0,0 +1,112 @@
"""
Context Service for generating rich contextual descriptions for messages.
This service uses LLMs to generate contextual descriptions for messages,
which improves retrieval by providing more context to the embedding process.
"""
import os
import time
from typing import Dict, List, Optional, Union
from openai import OpenAI
from app.config import Config
class ContextService:
"""Service for generating rich contextual descriptions for messages."""
# Initialize OpenAI client
client = OpenAI(api_key=Config.OPENAI_API_KEY)
# Cache for context generation to reduce API calls
_context_cache = {}
@staticmethod
def generate_context(content: str, metadata: Dict) -> str:
"""
Generate a rich contextual description for a message.
Args:
content (str): The original message content
metadata (Dict): Metadata about the message (channel, subject, sender, timestamp)
Returns:
str: A rich contextual description
"""
# Create a cache key from content and metadata
cache_key = f"{content[:100]}_{metadata.get('channel')}_{metadata.get('subject')}"
# Check if we have this context cached
if cache_key in ContextService._context_cache:
return ContextService._context_cache[cache_key]
try:
# Create messages for context generation
messages = [
{
"role": "system",
"content": "You are a context generation assistant. Generate a short, succinct context description for the given message. The context should situate this message within its domain and highlight key information that would be helpful for retrieval. Keep the context under 100 words."
},
{
"role": "user",
"content": f"""
Message details:
- Channel: {metadata.get('channel', 'Unknown')}
- Subject: {metadata.get('subject', 'Unknown')}
- Sender: {metadata.get('sender', 'Unknown')}
- Timestamp: {metadata.get('timestamp', 'Unknown')}
Message content:
{content}
"""
}
]
# Generate the context using OpenAI
response = ContextService.client.chat.completions.create(
model="gpt-4o",
messages=messages,
max_tokens=150,
temperature=0.3
)
# Extract the response text
context = response.choices[0].message.content.strip()
# If the context is too long, truncate it
if len(context) > 500:
context = context[:497] + "..."
# Cache the result
ContextService._context_cache[cache_key] = context
return context
except Exception as e:
print(f"Error generating context: {e}")
# Fallback to a simple context based on metadata
channel = metadata.get('channel', 'Unknown')
subject = metadata.get('subject', 'Unknown')
fallback_context = f"This message is from the {channel} channel and discusses {subject}."
# Cache the fallback
ContextService._context_cache[cache_key] = fallback_context
return fallback_context
@staticmethod
def contextualize_content(content: str, metadata: Dict) -> str:
"""
Add rich contextual description to a message.
Args:
content (str): The original message content
metadata (Dict): Metadata about the message
Returns:
str: The content with context prepended
"""
# Generate the context
context = ContextService.generate_context(content, metadata)
# Add the context to the content
return f"CONTEXT: {context}\n\nCONTENT: {content}"

View File

@ -0,0 +1,160 @@
"""
Hybrid Search Service that combines vector search and BM25 search.
This service implements hybrid search by combining results from vector-based
semantic search and BM25 lexical search using weighted score fusion.
"""
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.utils.contextual_retrieval.reranker_service import RerankerService
import logging
# Set up logging
logger = logging.getLogger("hybrid_search")
class HybridSearchService:
"""Service for hybrid search combining vector search and BM25."""
@staticmethod
def hybrid_search(query: str, n_results: int = 5, filter_criteria: Optional[Dict] = None,
rerank: bool = True, semantic_weight: float = 0.7) -> List[Dict]:
"""
Perform hybrid search using vector search and BM25.
Args:
query (str): Query text
n_results (int): Number of results to return
filter_criteria (Dict): Metadata filter criteria
rerank (bool): Whether to apply reranking
semantic_weight (float): Weight for semantic search (0-1)
Returns:
List[Dict]: Search results
"""
try:
# Get more results than requested for fusion
vector_n = n_results * 3
bm25_n = n_results * 3
# Perform vector search - use _internal_call=True to prevent circular imports
vector_results = ChromaDBService.search_similar(
query_text=query,
n_results=vector_n,
filter_criteria=filter_criteria,
_internal_call=True # This prevents circular calls
)
# Extract vector search results
vec_docs = []
if vector_results and 'documents' in vector_results and len(vector_results['documents']) > 0:
for i in range(len(vector_results['documents'][0])):
vec_docs.append({
'id': vector_results['ids'][0][i],
'content': vector_results['documents'][0][i],
'metadata': vector_results['metadatas'][0][i],
'vector_score': 1.0 - min(vector_results['distances'][0][i], 1.0),
'rank': i + 1 # 1-based rank
})
# Perform BM25 search
bm25_results = BM25Service.search(query, top_k=bm25_n)
# Extract BM25 search results and normalize scores
bm25_docs = []
if bm25_results:
# Get max score for normalization
max_score = max([score for _, score in bm25_results]) if bm25_results else 1.0
# Create a set of doc IDs already in vector results to avoid duplicate lookups
existing_doc_ids = {doc['id'] for doc in vec_docs}
for i, (doc_id, score) in enumerate(bm25_results):
# Skip duplicate lookups
if doc_id in existing_doc_ids:
continue
# Get document content from ChromaDB (if available)
try:
doc_data = ChromaDBService.get_message_by_id(doc_id)
if doc_data:
bm25_docs.append({
'id': doc_id,
'content': doc_data['content'],
'metadata': doc_data['metadata'],
'bm25_score': score / max_score if max_score > 0 else 0,
'rank': i + 1 # 1-based rank
})
except Exception as e:
logger.warning(f"Error retrieving document {doc_id}: {e}")
continue
# Combine results using weighted score fusion
fused_docs = HybridSearchService._fuse_results(vec_docs, bm25_docs, semantic_weight)
# Apply reranking if requested
if rerank and len(fused_docs) > 0:
try:
return RerankerService.rerank(query, fused_docs, top_k=n_results)
except Exception as e:
logger.warning(f"Reranking failed: {e}, returning non-reranked results")
return fused_docs[:n_results]
# Otherwise just return the top n fused results
return fused_docs[:n_results]
except Exception as e:
logger.error(f"Error in hybrid search: {e}")
# Return empty results on error
return []
@staticmethod
def _fuse_results(vec_docs: List[Dict], bm25_docs: List[Dict],
semantic_weight: float = 0.7) -> List[Dict]:
"""
Fuse results from vector search and BM25 search.
Args:
vec_docs (List[Dict]): Vector search results
bm25_docs (List[Dict]): BM25 search results
semantic_weight (float): Weight for semantic search (0-1)
Returns:
List[Dict]: Fused search results
"""
# Create a map of document IDs to documents
doc_map = {}
# Process vector search results
for doc in vec_docs:
doc_id = doc['id']
if doc_id not in doc_map:
doc_map[doc_id] = doc.copy()
doc_map[doc_id]['combined_score'] = doc.get('vector_score', 0) * semantic_weight
else:
# Update existing document
doc_map[doc_id]['vector_score'] = doc.get('vector_score', 0)
doc_map[doc_id]['combined_score'] = (
doc_map[doc_id].get('combined_score', 0) +
doc.get('vector_score', 0) * semantic_weight
)
# Process BM25 search results
for doc in bm25_docs:
doc_id = doc['id']
if doc_id not in doc_map:
doc_map[doc_id] = doc.copy()
doc_map[doc_id]['combined_score'] = doc.get('bm25_score', 0) * (1 - semantic_weight)
else:
# Update existing document
doc_map[doc_id]['bm25_score'] = doc.get('bm25_score', 0)
doc_map[doc_id]['combined_score'] = (
doc_map[doc_id].get('combined_score', 0) +
doc.get('bm25_score', 0) * (1 - semantic_weight)
)
# Convert map to list and sort by combined score
results = list(doc_map.values())
results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
return results
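
# --- Worked example of the score fusion (illustrative only) ---
# With the default semantic_weight of 0.7, a document found only by vector search with
# vector_score 0.8 ends up with combined_score 0.8 * 0.7 = 0.56, a document found only by
# BM25 with a normalized bm25_score of 0.9 gets 0.9 * 0.3 = 0.27, and a document found by
# both accumulates both terms. The documents below are made up.
if __name__ == "__main__":
    vec_docs = [{"id": "a", "content": "...", "metadata": {}, "vector_score": 0.8, "rank": 1}]
    bm25_docs = [
        {"id": "a", "content": "...", "metadata": {}, "bm25_score": 0.5, "rank": 2},
        {"id": "b", "content": "...", "metadata": {}, "bm25_score": 0.9, "rank": 1},
    ]
    for doc in HybridSearchService._fuse_results(vec_docs, bm25_docs, semantic_weight=0.7):
        print(doc["id"], round(doc["combined_score"], 3))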

View File

@ -0,0 +1,249 @@
"""
Reranker Service for improving search results by reranking candidate documents.
This service uses a custom reranking approach combining multiple signals
to improve the relevance of search results.
"""
import re
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
import logging
# Set up logging
logger = logging.getLogger("reranker_service")
class RerankerService:
"""Service for reranking search results using a custom approach."""
# Cache for reranked results
_rerank_cache = {}
@staticmethod
def rerank(query: str, documents: List[Dict], top_k: int = 20) -> List[Dict]:
"""
Rerank documents based on relevance to query using a multi-factor approach.
Args:
query (str): Query text
documents (List[Dict]): List of document dictionaries with 'id' and 'content'
top_k (int): Number of results to return
Returns:
List[Dict]: Reranked documents
"""
# Return all documents if there are fewer than top_k
if len(documents) <= top_k:
return documents
# Create cache key
cache_key = f"{query}_{sorted([doc.get('id', '') for doc in documents])}"
# Check if we have this reranking cached
if cache_key in RerankerService._rerank_cache:
return RerankerService._rerank_cache[cache_key][:top_k]
try:
# Prepare query
query_terms = RerankerService._tokenize(query)
query_lower = query.lower()
# Calculate multi-factor relevance score for each document
scored_docs = []
for doc in documents:
content = doc.get('content', '')
content_lower = content.lower()
# 1. Term frequency scoring (similar to BM25)
term_score = RerankerService._calculate_term_score(content_lower, query_terms)
# 2. Exact phrase matching
phrase_score = RerankerService._calculate_phrase_score(content_lower, query_lower)
# 3. Semantic similarity (use existing score if available)
semantic_score = RerankerService._get_semantic_score(doc)
# 4. Document position bonus
position_score = RerankerService._calculate_position_score(content_lower, query_terms)
# 5. Document length normalization
length_factor = RerankerService._calculate_length_factor(content)
# Calculate final combined score
# Weights can be adjusted based on performance
final_score = (
0.35 * term_score +
0.30 * phrase_score +
0.25 * semantic_score +
0.10 * position_score
) * length_factor
scored_doc = doc.copy()
scored_doc['score'] = final_score
scored_doc['_term_score'] = term_score
scored_doc['_phrase_score'] = phrase_score
scored_doc['_semantic_score'] = semantic_score
scored_doc['_position_score'] = position_score
scored_docs.append(scored_doc)
# Sort by final score (highest first)
scored_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
# Take the top_k
result = scored_docs[:top_k]
# Clean up diagnostic scores before returning
for doc in result:
doc.pop('_term_score', None)
doc.pop('_phrase_score', None)
doc.pop('_semantic_score', None)
doc.pop('_position_score', None)
# Cache the results
RerankerService._rerank_cache[cache_key] = result
return result
except Exception as e:
logger.error(f"Error reranking documents: {e}")
# Fallback: simple sorting based on combined_score if available
documents.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
return documents[:top_k]
@staticmethod
def _tokenize(text: str) -> List[str]:
"""
Tokenize a string into terms.
Args:
text (str): Text to tokenize
Returns:
List[str]: List of tokens
"""
# Simple tokenization by splitting on whitespace and removing punctuation
tokens = re.findall(r'\b\w+\b', text.lower())
return tokens
@staticmethod
def _calculate_term_score(content: str, query_terms: List[str]) -> float:
"""
Calculate term frequency score.
Args:
content (str): Document content
query_terms (List[str]): Query terms
Returns:
float: Term frequency score
"""
score = 0
content_tokens = RerankerService._tokenize(content)
# Simple term frequency calculation
for term in query_terms:
term_count = content_tokens.count(term)
score += term_count
# Normalize by document length
if len(content_tokens) > 0:
score = score / len(content_tokens)
return score
@staticmethod
def _calculate_phrase_score(content: str, query: str) -> float:
"""
Calculate exact phrase matching score.
Args:
content (str): Document content
query (str): Original query
Returns:
float: Phrase matching score
"""
# Count exact matches of the query in the content
exact_matches = content.count(query)
# Weight exact phrase matches heavily
score = exact_matches * 2.0 # Higher weight for exact matches
# Check for partial matches if no exact matches
if exact_matches == 0 and len(query) > 5:
# Generate query n-grams (only for longer queries)
query_parts = [query[i:i+4] for i in range(0, len(query)-3)]
for part in query_parts:
if len(part) >= 4: # Only consider meaningful parts
score += 0.2 * content.count(part)
return min(score, 10.0) # Cap to avoid extremely high scores
@staticmethod
def _get_semantic_score(doc: Dict) -> float:
"""
Extract semantic similarity score from document.
Args:
doc (Dict): Document
Returns:
float: Semantic similarity score
"""
# Use vector_score if available (from vector search)
if 'vector_score' in doc:
return doc['vector_score']
# Use combined_score as fallback
if 'combined_score' in doc:
return doc['combined_score']
return 0.5 # Default middle value if no scores available
@staticmethod
def _calculate_position_score(content: str, query_terms: List[str]) -> float:
"""
Calculate score based on position of match in document.
Earlier matches often indicate higher relevance.
Args:
content (str): Document content
query_terms (List[str]): Query terms
Returns:
float: Position score
"""
score = 0
# Check for terms in the first 20% of the document
first_section = content[:int(len(content) * 0.2)]
for term in query_terms:
if term in first_section:
score += 0.5
return min(score, 1.0) # Normalize to maximum of 1.0
@staticmethod
def _calculate_length_factor(content: str) -> float:
"""
Calculate length normalization factor.
Prevents extremely short documents from ranking too high.
Args:
content (str): Document content
Returns:
float: Length normalization factor
"""
token_count = len(RerankerService._tokenize(content))
# Penalize very short documents
if token_count < 10:
return 0.7
# Slightly favor mid-sized documents
if 20 <= token_count <= 300:
return 1.1
return 1.0 # Neutral factor for other documents
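
# --- Usage sketch (illustrative only) ---
# A minimal example of reranking a handful of candidates; the contents and IDs are made up.
# rerank() returns its input unchanged when there are no more than top_k candidates, so
# top_k is deliberately smaller than the candidate list here.
if __name__ == "__main__":
    candidates = [
        {"id": "a", "content": "Steps to reset a forgotten VPN password", "combined_score": 0.4},
        {"id": "b", "content": "Office printer maintenance schedule", "combined_score": 0.6},
        {"id": "c", "content": "The VPN password reset portal is back online", "combined_score": 0.5},
    ]
    for doc in RerankerService.rerank("vpn password reset", candidates, top_k=2):
        print(doc["id"], round(doc["score"], 3))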

111
app/utils/embeddings.py Normal file
View File

@ -0,0 +1,111 @@
"""
Embeddings utilities using Ollama and Nomic.
"""
import os
import requests
import numpy as np
from typing import List, Optional, Union
import ollama
from app.config import Config
class EmbeddingService:
"""Service for generating embeddings using Ollama and Nomic."""
@staticmethod
def get_ollama_embeddings(texts: List[str], model: Optional[str] = None) -> List[List[float]]:
"""
Generate embeddings using Ollama.
Args:
texts: List of texts to generate embeddings for
model: Ollama model to use for embeddings (default from config)
Returns:
List of embeddings as float arrays
"""
if model is None:
# Use model from config
model = Config.OLLAMA_MODEL
# Create an Ollama client pointed at the configured host
client = ollama.Client(host=Config.OLLAMA_HOST)
embeddings = []
for text in texts:
try:
# Call Ollama API for embeddings
response = client.embeddings(model=model, prompt=text)
embedding = response.get("embedding", [])
embeddings.append(embedding)
except Exception as e:
print(f"Error generating Ollama embedding: {e}")
# Return a zero embedding as fallback
embeddings.append([0.0] * 768) # typical dimension for text embeddings
return embeddings
@staticmethod
def get_nomic_embeddings(texts: List[str]) -> List[List[float]]:
"""
Generate embeddings using Nomic.
Args:
texts: List of texts to generate embeddings for
Returns:
List of embeddings as float arrays
"""
try:
# The new version of Nomic requires a Cohere API key, so we'll fall back to Ollama
# if we don't have one configured
cohere_api_key = Config.COHERE_API_KEY
if not cohere_api_key:
print("No Cohere API key found for Nomic embeddings, falling back to Ollama")
return EmbeddingService.get_ollama_embeddings(texts)
# Dynamically import nomic embedders to avoid startup errors if not available
from nomic.embedders import CohereEmbedder
# Create a Nomic embedding model using CohereEmbedder with API key
embedding_model = CohereEmbedder(cohere_api_key=cohere_api_key)
# Generate embeddings for the texts
embeddings = []
for text in texts:
embedding = embedding_model.embed(text)
embeddings.append(embedding)
return embeddings
except Exception as e:
print(f"Error generating Nomic embeddings: {e}")
# Fall back to Ollama embeddings
print("Falling back to Ollama embeddings")
return EmbeddingService.get_ollama_embeddings(texts)
@staticmethod
def get_embeddings(texts: Union[str, List[str]], use_nomic: Optional[bool] = None) -> List[List[float]]:
"""
Generate embeddings using either Nomic or Ollama.
Args:
texts: Text or list of texts to generate embeddings for
use_nomic: Whether to use Nomic (True) or Ollama (False), defaults to config setting
Returns:
List of embeddings as float arrays
"""
# Convert single text to list
if isinstance(texts, str):
texts = [texts]
# If use_nomic is not specified, use the config setting
if use_nomic is None:
use_nomic = Config.USE_NOMIC_EMBEDDINGS
# Generate embeddings using chosen method
if use_nomic:
return EmbeddingService.get_nomic_embeddings(texts)
else:
return EmbeddingService.get_ollama_embeddings(texts)
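
# --- Usage sketch (illustrative only) ---
# A minimal example of embedding two short texts. This assumes an Ollama server is reachable
# at the configured host with the embedding model pulled; if a call fails, the service falls
# back to zero vectors.
if __name__ == "__main__":
    vectors = EmbeddingService.get_embeddings(["VPN setup guide", "Printer troubleshooting"])
    for vector in vectors:
        print(len(vector), vector[:3])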

217
app/utils/sync_service.py Normal file
View File

@ -0,0 +1,217 @@
"""
Message synchronization service.
Handles periodic fetching of new messages from Zulip and adds them to ChromaDB.
"""
import os
import time
import logging
import threading
import pickle
from datetime import datetime, timedelta
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
# Configure logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_service")
class MessageSyncService:
"""Service for synchronizing messages from Zulip to ChromaDB."""
# File to store the last synced message ID
_SYNC_STATE_FILE = "sync_state.pickle"
def __init__(self, sync_interval=60, state_dir=None):
"""
Initialize the message sync service.
Args:
sync_interval (int): Sync interval in seconds (default: 60)
state_dir (str): Directory to store sync state file (default: current directory)
"""
self.sync_interval = sync_interval
self.is_running = False
self.sync_thread = None
self.state_dir = state_dir or os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
self.last_sync_time = None
self.last_message_id = None
self.batch_size = 50 # Default batch size
# Load the last synced state if available
self._load_sync_state()
def _set_batch_size(self, batch_size):
"""Set the batch size for syncing messages."""
if batch_size > 0:
self.batch_size = batch_size
logger.info(f"Set batch size to {batch_size}")
else:
logger.warning(f"Invalid batch size: {batch_size}, using default")
def _get_state_file_path(self):
"""Get the full path to the sync state file."""
return os.path.join(self.state_dir, self._SYNC_STATE_FILE)
def _load_sync_state(self):
"""Load the last sync state from disk."""
try:
state_file = self._get_state_file_path()
if os.path.exists(state_file):
with open(state_file, 'rb') as f:
state = pickle.load(f)
self.last_sync_time = state.get('last_sync_time')
self.last_message_id = state.get('last_message_id')
logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
else:
logger.info("No previous sync state found, starting fresh")
except Exception as e:
logger.error(f"Error loading sync state: {e}")
def _save_sync_state(self):
"""Save the current sync state to disk."""
try:
state = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id
}
state_file = self._get_state_file_path()
with open(state_file, 'wb') as f:
pickle.dump(state, f)
logger.info(f"Saved sync state: {state}")
except Exception as e:
logger.error(f"Error saving sync state: {e}")
def _sync_messages(self):
"""
Sync new messages from Zulip to ChromaDB.
This method fetches new messages from the Zulip database that haven't been
synchronized yet and adds them to ChromaDB.
"""
try:
# Set default sync time if not set yet
if not self.last_sync_time:
# Start with messages from the last 7 days if no previous sync
self.last_sync_time = datetime.now() - timedelta(days=7)
# Get messages newer than the last sync time
logger.info(f"Fetching messages since {self.last_sync_time} or ID > {self.last_message_id}")
# Get new messages
messages = []
if self.last_message_id:
# Get messages with ID greater than the last processed message ID
messages = ZulipDatabaseService.get_messages_newer_than_id(self.last_message_id, limit=self.batch_size)
else:
# Get messages from IT channels since the last sync time
messages = ZulipDatabaseService.get_messages_from_it_channels(
since=self.last_sync_time,
limit=self.batch_size
)
if not messages:
logger.info("No new messages found to sync")
return
logger.info(f"Found {len(messages)} new messages to sync")
# Add messages to ChromaDB
synced_count = 0
already_exists_count = 0
highest_message_id = self.last_message_id or 0
# Get a list of unique message IDs
unique_message_ids = set(message.id for message in messages)
logger.info(f"Found {len(unique_message_ids)} unique message IDs out of {len(messages)} messages")
for message in messages:
message_id = message.id
# Update highest message ID seen
if message_id > highest_message_id:
highest_message_id = message_id
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Check if this message already exists in ChromaDB to avoid duplicates
if ChromaDBService.message_exists(message_id):
already_exists_count += 1
logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
continue
# Add the message to ChromaDB
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
synced_count += 1
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
# Update the last sync time and message ID
self.last_sync_time = datetime.now()
if highest_message_id > (self.last_message_id or 0):
self.last_message_id = highest_message_id
# Save the sync state
self._save_sync_state()
logger.info(f"Sync completed. Added {synced_count} new messages to ChromaDB. Skipped {already_exists_count} existing messages. Last message ID: {self.last_message_id}")
except Exception as e:
logger.error(f"Error syncing messages: {e}")
def _sync_loop(self):
"""Main sync loop."""
while self.is_running:
try:
self._sync_messages()
# Sleep for the specified interval
for _ in range(self.sync_interval):
if not self.is_running:
break
time.sleep(1)
except Exception as e:
logger.error(f"Error in sync loop: {e}")
# Sleep a bit before retrying to avoid tight error loops
time.sleep(5)
def start(self):
"""Start the message sync service."""
if self.is_running:
logger.warning("Sync service is already running")
return
logger.info(f"Starting message sync service with interval {self.sync_interval} seconds")
self.is_running = True
self.sync_thread = threading.Thread(target=self._sync_loop)
self.sync_thread.daemon = True
self.sync_thread.start()
def stop(self):
"""Stop the message sync service."""
if not self.is_running:
logger.warning("Sync service is not running")
return
logger.info("Stopping message sync service")
self.is_running = False
if self.sync_thread:
self.sync_thread.join(timeout=10)
logger.info("Sync service stopped")
def sync_now(self):
"""Manually trigger a sync operation."""
logger.info("Manual sync triggered")
self._sync_messages()
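
# --- Usage sketch (illustrative only) ---
# A minimal sketch of running the sync service directly, assuming the Zulip database and
# ChromaDB are reachable with the configuration from the environment and the module is run
# with the project root on PYTHONPATH.
if __name__ == "__main__":
    sync_service = MessageSyncService(sync_interval=300)
    sync_service.sync_now()   # one immediate pass
    sync_service.start()      # then keep syncing every 5 minutes in the background
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        sync_service.stop()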

BIN
chromadb/bm25_index.pkl Normal file

Binary file not shown.

BIN
chromadb/chroma.sqlite3 Normal file

Binary file not shown.

141
compare_all_messages.py Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
"""
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
"""
import os
import sys
import logging
from collections import defaultdict
from datetime import datetime
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("compare_all_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message
def main():
"""Main function to compare Zulip messages with ChromaDB entries."""
logger.info("Starting simple comparison of ALL messages")
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
print("\n====================================================")
print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
print(f"Started at: {datetime.now()}")
print("====================================================\n")
try:
# Get Zulip DB session
session = get_db_session()
# Get ALL messages from Zulip
print("Fetching all messages from Zulip...")
zulip_messages = session.query(Message).all()
zulip_ids = set(str(msg.id) for msg in zulip_messages)
# Get channel counts
channel_counts = defaultdict(int)
for message in zulip_messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name is None:
channel_name = "Unknown Channel"
channel_counts[channel_name] += 1
# Print Zulip stats
print(f"\nZulip has {len(zulip_messages)} total messages across {len(channel_counts)} channels")
# Get ChromaDB collection
collection = get_chroma_collection()
if not collection:
print("ERROR: Failed to get ChromaDB collection")
return
# Get all entries from ChromaDB
print("Fetching all entries from ChromaDB...")
chroma_result = collection.get(include=['metadatas'])
if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
print("No entries found in ChromaDB")
return
# Get unique ChromaDB IDs
chroma_ids = set(chroma_result['ids'])
# Get channel counts for ChromaDB
chroma_channel_counts = defaultdict(int)
for i, _ in enumerate(chroma_result['ids']):
if chroma_result.get('metadatas') and len(chroma_result['metadatas']) > i:
metadata = chroma_result['metadatas'][i]
channel = metadata.get('channel', 'Unknown')
chroma_channel_counts[channel] += 1
# Print ChromaDB stats
print(f"ChromaDB has {len(chroma_result['ids'])} total entries")
print(f"ChromaDB has {len(chroma_ids)} unique entries")
# Calculate missing and extra
missing_from_chromadb = zulip_ids - chroma_ids
extra_in_chromadb = chroma_ids - zulip_ids
# Calculate overall sync percentage
sync_percentage = (len(chroma_ids) / len(zulip_ids) * 100) if zulip_ids else 0
# Print comparison results
print("\n====================================================")
print("COMPARISON RESULTS")
print("====================================================")
print(f"Zulip total messages: {len(zulip_messages)}")
print(f"ChromaDB total entries: {len(chroma_result['ids'])}")
print(f"ChromaDB unique entries: {len(chroma_ids)}")
print(f"Sync percentage: {sync_percentage:.2f}%")
print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")
# Print channel comparison
print("\nCHANNEL COMPARISON:")
print("-" * 70)
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
print("-" * 70)
all_channels = sorted(set(channel_counts.keys()) | set(chroma_channel_counts.keys()))
for channel in all_channels:
zulip_count = channel_counts.get(channel, 0)
chroma_count = chroma_channel_counts.get(channel, 0)
diff = zulip_count - chroma_count
percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0
print(f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} {diff:<10} {percentage:.2f}%")
# Print recommendations
print("\n====================================================")
print("RECOMMENDATIONS")
print("====================================================")
if sync_percentage < 100:
print("- Run ./sync_all_messages.py to sync missing messages")
else:
print("- All messages are synced!")
print(f"\nComparison completed at: {datetime.now()}")
except Exception as e:
print(f"Error during comparison: {e}")
logger.error(f"Error during comparison: {e}")
if __name__ == "__main__":
main()

333
compare_messages.py Executable file
View File

@ -0,0 +1,333 @@
#!/usr/bin/env python
"""
Script to compare the number of messages in Zulip channels to ChromaDB.
This script will gather statistics on message counts from both Zulip DB and ChromaDB,
then generate a report showing discrepancies between the two.
"""
import os
import sys
import logging
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import argparse
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("compare_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message, Stream, Recipient, UserProfile
from sqlalchemy import and_, not_, or_
from app.config import Config
def get_excluded_user_ids():
"""Get the user IDs of IT_Bot and ai_bot."""
session = get_db_session()
excluded_users = session.query(UserProfile).filter(
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
).all()
excluded_user_ids = [user.id for user in excluded_users]
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
return excluded_user_ids
def get_sandbox_recipient_id():
"""Get the recipient ID for the sandbox channel."""
session = get_db_session()
sandbox_stream = session.query(Stream).filter(
Stream.name == 'sandbox'
).first()
if sandbox_stream:
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
return sandbox_stream.recipient_id
else:
logger.warning("Sandbox channel not found")
return None
def get_zulip_message_counts(days=30):
"""
Get message counts from Zulip database for all channels except sandbox,
also excluding IT_Bot and ai_bot messages.
Args:
days: Number of days to look back
Returns:
dict: Channel name to message count mapping
"""
logger.info(f"Getting message counts from Zulip DB for the last {days} days")
try:
session = get_db_session()
# Get excluded user IDs (IT_Bot and ai_bot)
excluded_user_ids = get_excluded_user_ids()
# Get sandbox recipient ID to exclude
sandbox_recipient_id = get_sandbox_recipient_id()
# Build filters
since_date = datetime.now() - timedelta(days=days)
filters = [Message.date_sent >= since_date]
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if sandbox_recipient_id:
filters.append(Message.recipient_id != sandbox_recipient_id)
# Get all messages
messages = session.query(Message).filter(and_(*filters)).all()
# Get all channels except sandbox
streams = session.query(Stream).filter(
Stream.deactivated == False
).all()
# Filter out sandbox
included_streams = [stream for stream in streams
if stream.recipient_id != sandbox_recipient_id]
# Print the list of channels being analyzed
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
channels.sort(key=lambda x: x[0])
logger.info(f"Analyzing messages from {len(channels)} channels:")
for channel_name, recipient_id in channels:
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
# Count messages by channel
channel_counts = defaultdict(int)
message_ids = set()
for message in messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name and channel_name != "sandbox":
channel_counts[channel_name] += 1
message_ids.add(str(message.id)) # Convert to string for comparison with ChromaDB
# Print the message counts by channel
logger.info(f"Message counts by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
return {
'channel_counts': dict(channel_counts),
'total_count': len(messages),
'unique_count': len(message_ids),
'message_ids': message_ids
}
except Exception as e:
logger.error(f"Error getting Zulip message counts: {e}")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
def get_chromadb_message_counts():
"""
Get message counts from ChromaDB.
Returns:
dict: Statistics about ChromaDB messages
"""
logger.info("Getting message counts from ChromaDB")
try:
collection = get_chroma_collection()
if not collection:
logger.error("Failed to get ChromaDB collection")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
# Get all entries
result = collection.get(include=['metadatas'])
if not result or 'ids' not in result or not result['ids']:
logger.info("No entries found in ChromaDB")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
# Count messages by channel
channel_counts = defaultdict(int)
message_ids = set()
for i, message_id in enumerate(result['ids']):
# Extract channel from metadata
if result.get('metadatas') and len(result['metadatas']) > i:
metadata = result['metadatas'][i]
channel = metadata.get('channel', 'Unknown')
if channel != "sandbox":
channel_counts[channel] += 1
# Add to message_ids set
message_ids.add(message_id)
# Count duplicates
id_counts = Counter(result['ids'])
duplicates = {message_id: count for message_id, count in id_counts.items() if count > 1}
# Print the message counts by channel
logger.info(f"ChromaDB message counts by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
return {
'channel_counts': dict(channel_counts),
'total_count': len(result['ids']),
'unique_count': len(message_ids),
'message_ids': message_ids,
'duplicate_count': len(duplicates),
'duplicates': duplicates
}
except Exception as e:
logger.error(f"Error getting ChromaDB message counts: {e}")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
def compare_counts(zulip_counts, chromadb_counts, days):
"""
Compare message counts between Zulip and ChromaDB.
Args:
zulip_counts: Counts from Zulip DB
chromadb_counts: Counts from ChromaDB
days: Number of days looked back
Returns:
dict: Comparison statistics
"""
logger.info("Comparing message counts")
# Get message IDs in Zulip but not in ChromaDB
zulip_ids = set(zulip_counts['message_ids'])
chroma_ids = set(chromadb_counts['message_ids'])
# Convert all IDs to strings for comparison
zulip_ids = {str(id) for id in zulip_ids}
chroma_ids = {str(id) for id in chroma_ids}
missing_from_chromadb = zulip_ids - chroma_ids
# Get message IDs in ChromaDB but not in Zulip (within the timeframe)
extra_in_chromadb = chroma_ids - zulip_ids
# Channel comparison
channel_comparison = {}
all_channels = set(zulip_counts['channel_counts'].keys()) | set(chromadb_counts['channel_counts'].keys())
for channel in all_channels:
zulip_count = zulip_counts['channel_counts'].get(channel, 0)
chromadb_count = chromadb_counts['channel_counts'].get(channel, 0)
difference = zulip_count - chromadb_count
channel_comparison[channel] = {
'zulip_count': zulip_count,
'chromadb_count': chromadb_count,
'difference': difference,
'percentage': (chromadb_count / zulip_count * 100) if zulip_count > 0 else 0
}
return {
'channel_comparison': channel_comparison,
'missing_from_chromadb': missing_from_chromadb,
'missing_count': len(missing_from_chromadb),
'extra_in_chromadb': extra_in_chromadb,
'extra_count': len(extra_in_chromadb),
'zulip_total': zulip_counts['total_count'],
'chromadb_total': chromadb_counts['total_count'],
'zulip_unique': zulip_counts['unique_count'],
'chromadb_unique': chromadb_counts['unique_count'],
'duplicate_count': chromadb_counts.get('duplicate_count', 0),
'days': days
}
def print_comparison_report(comparison):
"""
Print a report of the comparison.
Args:
comparison: Comparison statistics
"""
print("\n" + "=" * 80)
print(f"ZULIP TO CHROMADB COMPARISON REPORT (Last {comparison['days']} days)")
print("=" * 80)
print("\nSUMMARY:")
print(f"Zulip total messages: {comparison['zulip_total']}")
print(f"Zulip unique messages: {comparison['zulip_unique']}")
print(f"ChromaDB total entries: {comparison['chromadb_total']}")
print(f"ChromaDB unique entries: {comparison['chromadb_unique']}")
print(f"Duplicate entries in ChromaDB: {comparison['duplicate_count']}")
sync_percentage = (comparison['chromadb_unique'] / comparison['zulip_unique'] * 100) if comparison['zulip_unique'] > 0 else 0
print(f"Overall sync rate: {sync_percentage:.2f}%")
print(f"Messages in Zulip but missing from ChromaDB: {comparison['missing_count']}")
print(f"Entries in ChromaDB not found in recent Zulip data: {comparison['extra_count']}")
print("\nCHANNEL BREAKDOWN:")
print("-" * 80)
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'Sync %':<10}")
print("-" * 80)
for channel, stats in sorted(comparison['channel_comparison'].items()):
print(f"{channel:<25} {stats['zulip_count']:<10} {stats['chromadb_count']:<10} {stats['difference']:<10} {stats['percentage']:.2f}%")
if comparison['missing_count'] > 0:
print("\nMISSING MESSAGE IDS (Sample):")
print(", ".join(str(mid) for mid in list(comparison['missing_from_chromadb'])[:10]))
if comparison['duplicate_count'] > 0:
print("\nDUPLICATE ENTRIES DETECTED")
print(f"Total messages with duplicates: {comparison['duplicate_count']}")
print("\n" + "=" * 80)
print("RECOMMENDATIONS:")
if comparison['duplicate_count'] > 0:
print("- Run ./fix_duplicate_entries.py to remove duplicate entries")
if comparison['missing_count'] > 0:
print("- Run python sync_all_channels.py --force --days {0} to sync missing messages".format(comparison['days']))
if sync_percentage < 95:
print("- Investigate sync service settings and DB connection issues")
print("=" * 80 + "\n")
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(description="Compare Zulip channel messages to ChromaDB entries")
parser.add_argument("--days", type=int, default=30, help="Number of days to look back in Zulip history")
args = parser.parse_args()
logger.info("Starting message comparison")
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
# Get message counts
zulip_counts = get_zulip_message_counts(days=args.days)
chromadb_counts = get_chromadb_message_counts()
# Compare counts
comparison = compare_counts(zulip_counts, chromadb_counts, args.days)
# Print report
print_comparison_report(comparison)
logger.info("Comparison completed")
if __name__ == "__main__":
main()

23
ecosystem.config.js Normal file
View File

@ -0,0 +1,23 @@
module.exports = {
apps: [
{
name: 'zulip-bot',
script: './run_app.sh',
interpreter: '/bin/bash',
instances: 1,
autorestart: true,
watch: false,
max_memory_restart: '500M',
env: {
NODE_ENV: 'production',
FLASK_APP: 'app',
FLASK_RUN_PORT: 5100
},
log_date_format: 'YYYY-MM-DD HH:mm:ss',
error_file: 'logs/zulip-bot-error.log',
out_file: 'logs/zulip-bot-out.log',
merge_logs: true,
time: true
}
]
};

5618
logs/zulip-bot-error.log Normal file

File diff suppressed because it is too large

47
logs/zulip-bot-out.log Normal file
View File

@ -0,0 +1,47 @@
2025-05-14T17:36:16: Checking for processes on port 5100...
2025-05-14T17:36:16: No process found on port 5100
2025-05-14T17:36:16: Activating virtual environment...
2025-05-14T17:36:16: Starting Flask app on port 5100...
2025-05-14T17:36:17: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:36:17: * Serving Flask app 'app'
2025-05-14T17:36:17: * Debug mode: on
2025-05-14T17:38:41: Flask app stopped
2025-05-14T17:38:41: Checking for processes on port 5100...
2025-05-14T17:38:41: No process found on port 5100
2025-05-14T17:38:41: Activating virtual environment...
2025-05-14T17:38:41: Starting Flask app on port 5100...
2025-05-14T17:38:42: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:42: * Serving Flask app 'app'
2025-05-14T17:38:42: * Debug mode: on
2025-05-14T17:38:42: Flask app stopped
2025-05-14T17:38:42: Checking for processes on port 5100...
2025-05-14T17:38:42: Killing process 2093957 on port 5100
2025-05-14T17:38:42: Activating virtual environment...
2025-05-14T17:38:42: Starting Flask app on port 5100...
2025-05-14T17:38:43: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:43: * Serving Flask app 'app'
2025-05-14T17:38:43: * Debug mode: on
2025-05-14T17:38:51: Flask app stopped
2025-05-14T17:38:51: Checking for processes on port 5100...
2025-05-14T17:38:51: No process found on port 5100
2025-05-14T17:38:51: Activating virtual environment...
2025-05-14T17:38:51: Starting Flask app on port 5100...
2025-05-14T17:38:52: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:52: * Serving Flask app 'app'
2025-05-14T17:38:52: * Debug mode: on
2025-05-15T09:29:44: Flask app stopped
2025-05-15T09:29:44: Checking for processes on port 5100...
2025-05-15T09:29:44: No process found on port 5100
2025-05-15T09:29:44: Activating virtual environment...
2025-05-15T09:29:44: Starting Flask app on port 5100...
2025-05-15T09:29:45: NumPy compatibility patch applied for ChromaDB
2025-05-15T09:29:45: * Serving Flask app 'app'
2025-05-15T09:29:45: * Debug mode: on
2025-05-15T09:29:46: Flask app stopped
2025-05-15T09:29:46: Checking for processes on port 5100...
2025-05-15T09:29:46: No process found on port 5100
2025-05-15T09:29:46: Activating virtual environment...
2025-05-15T09:29:46: Starting Flask app on port 5100...
2025-05-15T09:29:47: NumPy compatibility patch applied for ChromaDB
2025-05-15T09:29:47: * Serving Flask app 'app'
2025-05-15T09:29:47: * Debug mode: on

38
pm2_start.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Make sure the run_app.sh script is executable
chmod +x run_app.sh
# Check if PM2 is installed
if ! command -v pm2 &> /dev/null; then
echo "PM2 is not installed. Installing..."
npm install -g pm2
fi
# Start the application with PM2
echo "Starting Zulip Bot service with PM2..."
pm2 start ecosystem.config.js
# Save the current PM2 configuration
echo "Saving PM2 configuration..."
pm2 save
# Configure PM2 to start on boot (may require sudo)
echo "Setting up PM2 to start on system boot..."
startup_output=$(sudo pm2 startup)
if echo "$startup_output" | grep -q "sudo"; then
    # If the output contains a sudo command, show it so the user can run it manually
    sudo_cmd=$(echo "$startup_output" | grep "sudo" | tail -n 1)
    echo "Run the following command with sudo privileges to enable PM2 on startup:"
    echo "$sudo_cmd"
else
    echo "PM2 startup configuration completed."
fi
echo "PM2 service setup complete. Zulip Bot is now running as a service."
echo "To check status: pm2 status"
echo "To view logs: pm2 logs zulip-bot"
echo "To restart: pm2 restart zulip-bot"
echo "To stop: pm2 stop zulip-bot"

81
project_config.md Normal file
View File

@ -0,0 +1,81 @@
# Project Configuration (LTM)
*This file contains the stable, long-term context for the project.*
*It should be updated infrequently, primarily when core goals, tech, or patterns change.*
---
## Core Goal
Develop a Python-based Flask application that integrates with Zulip to:
* Connect to a Zulip PostgreSQL database.
* Retrieve messages from the specified channels: **IT Discussions, IT Knowledge, IT Support**.
* Embed these messages into ChromaDB for efficient retrieval.
* Implement a Zulip bot named **IT\_Bot** that responds to user queries when mentioned using the format `@**IT_Bot**`.
* Generate context-based responses using the Gemini API.
---
## Tech Stack
* **Backend:** Python, Flask
* **Database:** PostgreSQL (Zulip DB), ChromaDB
* **AI Integration:** Gemini API
* **Bot Framework:** Zulip Bot API
* **Environment Management:** Virtualenv or Conda
* **Version Control:** Git
---
## Critical Patterns & Conventions
* **Database Access:**
* Store database credentials securely (e.g., environment variables or a secrets manager).
* Use SQLAlchemy ORM for structured queries.
* **Message Retrieval:**
* Implement periodic tasks to pull messages from the channels.
* Ensure idempotent operations to prevent duplicates in ChromaDB.
* **Embedding Strategy:**
* Embed messages with metadata (e.g., channel name, timestamp, user ID); see the sketch after this list.
* **Bot Activation:**
* The bot listens for `@**IT_Bot**` mentions.
* Upon activation, relevant context is fetched from ChromaDB, and a response is generated using the Gemini API.
* **Error Handling:**
* Implement structured logging.
* Gracefully handle API rate limits and database connection errors.
* **Security:**
* Store credentials and API keys in environment variables.
* Implement rate limiting to prevent abuse.
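
A minimal, illustrative sketch of the metadata embedding convention above (the `collection` handle and variable names are assumptions, not the project's actual API):

```
# Illustrative only: store a message together with its metadata in ChromaDB
metadata = {
    "channel": "IT Support",
    "subject": "VPN access",
    "sender": "Jane Doe",
    "timestamp": "2025-05-14T17:36:00",
}
collection.add(
    ids=[str(message_id)],
    documents=[message_content],
    metadatas=[metadata],
)
```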
---
## Key Constraints
* **Channels Monitored:** IT Discussions, IT Knowledge, IT Support
* **Response Trigger:** Mentions of `@**IT_Bot**`
* **Language Support:** English, Georgian
* **Message Volume:** Approximately 500 messages per day.
* **Deployment:** Local network server
* **Zulip Bot Config:**
```
[api]
email=IT_bot-bot@zulip.lci.ge
key=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
site=https://zulip.lci.ge
```
* **Database Connection:** `zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip`
* **Gemini API Key:** `AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s`

14
requirements.txt Normal file
View File

@ -0,0 +1,14 @@
Flask==2.2.3
Werkzeug==2.2.3
SQLAlchemy==2.0.9
psycopg2-binary==2.9.6
python-dotenv==1.0.0
chromadb==0.4.6
zulip==0.8.2
google-generativeai==0.3.1
ollama==0.1.5
nomic==2.0.3
cohere==5.15.0
rank-bm25==0.2.2
nltk==3.8.1
openai==1.30.4

87
reset_chromadb.py Executable file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Script to reset the ChromaDB completely and properly.
This fixes issues with the vector database that cause "Add of existing embedding ID" warnings.
"""
import os
import shutil
import logging
import chromadb
from chromadb.utils import embedding_functions
from app.utils.embeddings import EmbeddingService
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("reset_chromadb")
def main():
"""Main function to reset ChromaDB."""
try:
# Default ChromaDB path used in the application
chromadb_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb")
logger.info(f"Preparing to reset ChromaDB at {chromadb_path}")
# First check if the directory exists
if not os.path.exists(chromadb_path):
logger.info("ChromaDB directory doesn't exist yet. Creating a fresh one.")
os.makedirs(chromadb_path, exist_ok=True)
logger.info("ChromaDB directory created successfully.")
return
# Backup the existing ChromaDB directory
backup_path = f"{chromadb_path}_backup"
logger.info(f"Creating backup of ChromaDB at {backup_path}")
# Remove old backup if it exists
if os.path.exists(backup_path):
logger.info("Removing old backup")
shutil.rmtree(backup_path)
# Create backup
shutil.copytree(chromadb_path, backup_path)
logger.info("Backup created successfully")
# Delete the ChromaDB directory
logger.info("Removing existing ChromaDB directory")
shutil.rmtree(chromadb_path)
# Create fresh ChromaDB
logger.info("Creating fresh ChromaDB")
os.makedirs(chromadb_path, exist_ok=True)
# Initialize a fresh ChromaDB client and create a new collection
logger.info("Initializing fresh ChromaDB client")
client = chromadb.PersistentClient(
path=chromadb_path,
settings=chromadb.Settings(
allow_reset=True,
anonymized_telemetry=False
)
)
# Create a custom embedding function
class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
def __call__(self, texts):
return EmbeddingService.get_ollama_embeddings(texts)
# Create a fresh collection
logger.info("Creating fresh collection")
collection = client.create_collection(
name="zulip_messages",
metadata={
"hnsw:space": "cosine"
},
embedding_function=CustomEmbeddingFunction()
)
logger.info("ChromaDB reset completed successfully")
logger.info(f"To restore the backup if needed, delete {chromadb_path} and rename {backup_path} to {chromadb_path}")
except Exception as e:
logger.error(f"Error resetting ChromaDB: {e}")
logger.error("ChromaDB reset failed. Please check the error and try again.")
if __name__ == "__main__":
main()

26
run_app.sh Executable file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Kill any process using port 5100
echo "Checking for processes on port 5100..."
pid=$(lsof -ti:5100)
if [ -n "$pid" ]; then
echo "Killing process $pid on port 5100"
kill -9 $pid
else
echo "No process found on port 5100"
fi
# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate
# Set Flask environment variables
export FLASK_APP=app
export FLASK_RUN_PORT=5100
# Run the Flask app
echo "Starting Flask app on port 5100..."
flask run --port=5100 --no-reload
# This script won't reach here unless the flask app is interrupted
echo "Flask app stopped"

13
setup.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Create a virtual environment
python3.11 -m venv venv
# Activate the virtual environment
source venv/bin/activate
# Install the required packages
pip install -r requirements.txt
echo "Setup completed successfully!"
echo "To activate the virtual environment, run: source venv/bin/activate"

580
sync_all_channels.py Executable file
View File

@ -0,0 +1,580 @@
#!/usr/bin/env python
"""
Script to sync messages from all Zulip channels (except sandbox) to ChromaDB.
This script also excludes messages from IT_Bot and ai_bot users.
"""
import os
import sys
import argparse
import logging
import signal
import time
from datetime import datetime, timedelta
import pickle
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.models.zulip import Message, Stream, Recipient, UserProfile
from sqlalchemy import and_, not_, or_
from app.db import get_db_session
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_all_channels")
# Global flag for graceful shutdown
is_shutting_down = False
# Signal handler for CTRL+C
def signal_handler(sig, frame):
global is_shutting_down
logger.info("Received shutdown signal, completing current operation before exiting...")
is_shutting_down = True
# Register signal handler
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
class AllChannelSyncService:
"""Service for syncing messages from all channels except sandbox."""
# File to store the last synced message ID
_SYNC_STATE_FILE = "all_channels_sync_state.pickle"
def __init__(self, batch_size=200, include_direct_messages=False):
"""
Initialize the sync service.
Args:
batch_size (int): Number of messages to process in each batch
include_direct_messages (bool): Whether to include direct messages
"""
self.batch_size = batch_size
self.last_sync_time = None
self.last_message_id = None
self.state_dir = os.path.dirname(os.path.abspath(__file__))
self.channels_to_sync = []
self.include_direct_messages = include_direct_messages
# Load the last synced state if available
self._load_sync_state()
def _get_state_file_path(self):
"""Get the full path to the sync state file."""
return os.path.join(self.state_dir, self._SYNC_STATE_FILE)
def _load_sync_state(self):
"""Load the last sync state from disk."""
try:
state_file = self._get_state_file_path()
if os.path.exists(state_file):
with open(state_file, 'rb') as f:
state = pickle.load(f)
self.last_sync_time = state.get('last_sync_time')
self.last_message_id = state.get('last_message_id')
logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
else:
logger.info("No previous sync state found, starting fresh")
except Exception as e:
logger.error(f"Error loading sync state: {e}")
def _save_sync_state(self, channel_counts=None):
"""Save the current sync state to disk."""
try:
state = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id
}
if channel_counts:
state['channel_counts'] = channel_counts
state_file = self._get_state_file_path()
# Save to a temporary file first, then rename to avoid corruption if interrupted
temp_file = state_file + '.temp'
with open(temp_file, 'wb') as f:
pickle.dump(state, f)
f.flush()
os.fsync(f.fileno()) # Ensure data is written to disk
# Rename the temp file to the actual state file (atomic operation)
os.rename(temp_file, state_file)
logger.info(f"Saved sync state: {state}")
except Exception as e:
logger.error(f"Error saving sync state: {e}")
def get_excluded_user_ids(self):
"""Get the user IDs of IT_Bot and ai_bot."""
session = get_db_session()
excluded_users = session.query(UserProfile).filter(
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
).all()
excluded_user_ids = [user.id for user in excluded_users]
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
return excluded_user_ids
def get_sandbox_recipient_id(self):
"""Get the recipient ID for the sandbox channel."""
session = get_db_session()
sandbox_stream = session.query(Stream).filter(
Stream.name == 'sandbox'
).first()
if sandbox_stream:
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
return sandbox_stream.recipient_id
else:
logger.warning("Sandbox channel not found")
return None
def get_channels_to_sync(self):
"""Get all active channels except sandbox with their recipient IDs."""
session = get_db_session()
sandbox_recipient_id = self.get_sandbox_recipient_id()
# Get all active streams
streams = session.query(Stream).filter(
Stream.deactivated == False
).all()
# Filter out sandbox
included_streams = [stream for stream in streams
if stream.recipient_id != sandbox_recipient_id]
# Create a list of channels to sync with their recipient IDs
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
# Sort by channel name
channels.sort(key=lambda x: x[0])
# Print the list of channels
logger.info(f"Found {len(channels)} channels to sync:")
for channel_name, recipient_id in channels:
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
self.channels_to_sync = channels
# Return just the recipient IDs for filtering
recipient_ids = [recipient_id for _, recipient_id in channels]
return recipient_ids
def get_messages_newer_than_id(self, message_id, excluded_user_ids, excluded_recipient_id):
"""Get messages with ID greater than the specified ID."""
session = get_db_session()
# Build filters
filters = [Message.id > message_id]
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
messages = session.query(Message).filter(
and_(*filters)
).order_by(Message.id.asc()).limit(self.batch_size).all()
return messages
def get_messages_for_timeframe(self, since, excluded_user_ids, excluded_recipient_id, limit=1000, all_messages=False):
"""
Get messages from the specified timeframe.
Args:
since (datetime): Get messages after this datetime
excluded_user_ids (list): User IDs to exclude
excluded_recipient_id (int): Recipient ID to exclude
limit (int): Maximum number of messages to return
all_messages (bool): If True, ignore the since parameter and get all messages
Returns:
list: List of Message objects
"""
session = get_db_session()
# Build filters
filters = []
# Add date filter if specified and not getting all messages
if since and not all_messages:
filters.append(Message.date_sent >= since)
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
# Get results
query = session.query(Message)
if filters:
query = query.filter(and_(*filters))
messages = query.order_by(Message.id.asc()).limit(limit).all()  # ascending so ID-based batch continuation does not skip older messages
return messages
def get_channel_message_counts(self, since, excluded_user_ids, excluded_recipient_id, all_messages=False):
"""Get message counts by channel for the specified timeframe."""
session = get_db_session()
# Build filters
filters = []
# Add date filter if specified and not getting all messages
if since and not all_messages:
filters.append(Message.date_sent >= since)
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
# Get all messages
query = session.query(Message)
if filters:
query = query.filter(and_(*filters))
messages = query.all()
# Count messages by channel
channel_counts = {}
for message in messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name:
if channel_name not in channel_counts:
channel_counts[channel_name] = 0
channel_counts[channel_name] += 1
# Sort by channel name
sorted_counts = {k: channel_counts[k] for k in sorted(channel_counts.keys())}
# Print the message counts by channel
logger.info(f"Message counts by channel:")
for channel, count in sorted_counts.items():
logger.info(f"- {channel}: {count} messages")
return sorted_counts
def sync_messages(self, days=None, force=False, max_messages=5000, all_messages=False):
"""
Sync messages from all Zulip channels to ChromaDB.
Args:
days (int): Number of days to look back for messages (default: use sync state)
force (bool): Whether to force sync all messages from the lookback period
max_messages (int): Maximum total number of messages to sync
all_messages (bool): If True, ignore date filtering and sync all messages
"""
global is_shutting_down
try:
# Get excluded user IDs (IT_Bot and ai_bot)
excluded_user_ids = self.get_excluded_user_ids()
# Get sandbox recipient ID to exclude
excluded_recipient_id = self.get_sandbox_recipient_id()
# Get all channels to sync and their recipient IDs
self.get_channels_to_sync()
# Reset sync state if forced
if force:
if all_messages:
self.last_sync_time = None
self.last_message_id = None
logger.info("Force syncing ALL messages regardless of date")
elif days:
self.last_sync_time = datetime.now() - timedelta(days=days)
self.last_message_id = None
logger.info(f"Force syncing messages from the last {days} days")
# Set default sync time if not set yet and not syncing all messages
if not self.last_sync_time and not all_messages and not force:
# Start with messages from the last 30 days if no previous sync
self.last_sync_time = datetime.now() - timedelta(days=30 if not days else days)
logger.info(f"No previous sync time, starting from {self.last_sync_time}")
# Count total messages to sync if forcing
total_messages = 0
if force:
since_date = None if all_messages else (datetime.now() - timedelta(days=days if days else 30))
all_messages_count = self.get_messages_for_timeframe(
since=since_date,
excluded_user_ids=excluded_user_ids,
excluded_recipient_id=excluded_recipient_id,
limit=max_messages,
all_messages=all_messages
)
total_messages = len(all_messages_count)
logger.info(f"Found a total of {total_messages} messages to sync")
# Get message counts by channel
self.get_channel_message_counts(since_date, excluded_user_ids, excluded_recipient_id, all_messages=all_messages)
# Run multiple batches of sync
total_synced = 0
already_exists_count = 0
highest_message_id = self.last_message_id or 0
batch_count = 0
# Track synced messages by channel
channel_sync_counts = {}
# Time to save state
last_save_time = time.time()
save_interval = 10 # Save state every 10 seconds
while not is_shutting_down:
batch_count += 1
logger.info(f"Running batch {batch_count}, synced {total_synced} messages so far")
# Get new messages
messages = []
if self.last_message_id:
# Get messages with ID greater than the last processed message ID
messages = self.get_messages_newer_than_id(
self.last_message_id,
excluded_user_ids,
excluded_recipient_id
)
else:
# Get messages since the last sync time or all messages
messages = self.get_messages_for_timeframe(
since=self.last_sync_time,
excluded_user_ids=excluded_user_ids,
excluded_recipient_id=excluded_recipient_id,
limit=self.batch_size,
all_messages=all_messages
)
if not messages:
logger.info("No new messages found to sync")
break
logger.info(f"Found {len(messages)} new messages to sync in batch {batch_count}")
# Process each message
synced_in_batch = 0
for message in messages:
# Check if we need to shutdown
if is_shutting_down:
logger.info("Shutdown requested, saving state and exiting...")
break
message_id = message.id
# Update highest message ID seen
if message_id > highest_message_id:
highest_message_id = message_id
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Skip excluded channels and users
if channel_name == "sandbox":
continue
if sender_name in ["IT_Bot", "ai_bot"]:
continue
# Skip direct messages unless explicitly included
if not self.include_direct_messages and channel_name in ["Direct Message", "Group Message"]:
logger.debug(f"Skipping {channel_name} message {message_id} (use --include-direct-messages to include)")
continue
# Check if this message already exists in ChromaDB to avoid duplicates
if ChromaDBService.message_exists(message_id):
already_exists_count += 1
logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
continue
# Handle None channel names
if channel_name is None:
channel_name = "Unknown Channel"
logger.warning(f"Found message {message_id} with None channel name, using '{channel_name}' instead")
# Add the message to ChromaDB
try:
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
synced_in_batch += 1
total_synced += 1
# Update channel counts
if channel_name not in channel_sync_counts:
channel_sync_counts[channel_name] = 0
channel_sync_counts[channel_name] += 1
# Update the last message ID after each successful addition
self.last_message_id = message_id
# Save state periodically
current_time = time.time()
if current_time - last_save_time > save_interval:
self.last_sync_time = datetime.now()
self._save_sync_state(channel_sync_counts)
last_save_time = current_time
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
except Exception as e:
logger.error(f"Error adding message {message_id} to ChromaDB: {e}")
# Continue with next message
# Update the last sync time and message ID at the end of the batch
self.last_sync_time = datetime.now()
if highest_message_id > (self.last_message_id or 0):
self.last_message_id = highest_message_id
# Save the sync state after each batch
self._save_sync_state(channel_sync_counts)
last_save_time = time.time()
logger.info(f"Batch {batch_count} completed. Added {synced_in_batch} new messages to ChromaDB. " +
f"Total synced: {total_synced}. Last message ID: {self.last_message_id}")
# Check if we've reached the max messages limit
if total_synced >= max_messages:
logger.info(f"Reached max messages limit of {max_messages}")
break
# If this batch had fewer messages than the batch size, we're done
if len(messages) < self.batch_size:
logger.info("Fetched fewer messages than batch size, assuming all messages have been processed")
break
# Final state save with channel statistics
if is_shutting_down:
logger.info("Shutdown signal received, saving final state...")
# Print synced messages by channel
if channel_sync_counts:
logger.info("Messages synced by channel:")
try:
# Use a safe sorting method that handles None keys
sorted_items = sorted(channel_sync_counts.items(),
key=lambda item: item[0] if item[0] is not None else "")
for channel, count in sorted_items:
channel_name = channel if channel is not None else "Unknown Channel"
logger.info(f"- {channel_name}: {count} messages")
except Exception as e:
logger.warning(f"Error displaying channel stats: {e}")
# Fallback display without sorting
for channel, count in channel_sync_counts.items():
channel_name = channel if channel is not None else "Unknown Channel"
logger.info(f"- {channel_name}: {count} messages")
# Return the final stats
stats = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id,
'total_synced': total_synced,
'batches': batch_count,
'already_exists': already_exists_count,
'channel_counts': channel_sync_counts
}
logger.info(f"Sync completed. Current state: {stats}")
return stats
except Exception as e:
logger.error(f"Error syncing messages: {e}")
# Save state on error
self._save_sync_state()
return None
def main():
"""Main entry point."""
# Parse command line arguments
parser = argparse.ArgumentParser(description="Sync messages from all Zulip channels to ChromaDB")
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
parser.add_argument("--max-messages", type=int, default=10000, help="Maximum total number of messages to sync")
parser.add_argument("--include-direct-messages", action="store_true", help="Include direct and group messages in sync")
parser.add_argument("--all-messages", action="store_true", help="Sync all messages regardless of date")
args = parser.parse_args()
# Create the Flask app
app = create_app()
with app.app_context():
try:
# Initialize sync service
sync_service = AllChannelSyncService(
batch_size=args.batch_size,
include_direct_messages=args.include_direct_messages
)
# Sync messages
stats = sync_service.sync_messages(
days=args.days,
force=args.force,
max_messages=args.max_messages,
all_messages=args.all_messages
)
if stats:
channel_counts = stats.get('channel_counts', {})
print(f"\nSync completed at {datetime.now()}")
print(f"Last sync time: {stats['last_sync_time']}")
print(f"Last message ID: {stats['last_message_id']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Number of batches: {stats['batches']}")
print(f"Messages already in DB: {stats['already_exists']}")
if channel_counts:
print("\nMessages synced by channel:")
try:
# Use a safe sorting method that handles None keys
sorted_items = sorted(channel_counts.items(),
key=lambda item: item[0] if item[0] is not None else "")
for channel, count in sorted_items:
channel_name = channel if channel is not None else "Unknown Channel"
print(f"- {channel_name}: {count} messages")
except Exception as e:
# Fallback display without sorting
for channel, count in channel_counts.items():
channel_name = channel if channel is not None else "Unknown Channel"
print(f"- {channel_name}: {count} messages")
except KeyboardInterrupt:
print("\nSync process interrupted by user. State has been saved.")
logger.info("Sync process interrupted by user. State has been saved.")
if __name__ == "__main__":
main()

157
sync_all_messages.py Executable file
View File

@ -0,0 +1,157 @@
#!/usr/bin/env python
"""
Script to sync ALL messages from Zulip to ChromaDB with NO restrictions.
This script will sync everything - all channels, all users, all time periods.
"""
import os
import sys
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_all_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.models.zulip import Message
def sync_all_messages():
"""
Sync ALL messages from Zulip to ChromaDB with no restrictions.
All messages are processed in a single pass.
Returns:
dict: Statistics about the sync
"""
logger.info("Starting unrestricted sync of ALL messages in one pass")
session = get_db_session()
total_synced = 0
already_exists = 0
channel_counts = {}
# Get all messages at once
logger.info("Fetching ALL messages from Zulip database")
messages = session.query(Message).order_by(Message.id).all()
total_messages = len(messages)
logger.info(f"Found {total_messages} total messages in Zulip database")
# Process all messages
logger.info("Processing all messages")
for i, message in enumerate(messages):
message_id = message.id
# Log progress at intervals
if i % 500 == 0 and i > 0:
logger.info(f"Progress: {i}/{total_messages} messages processed ({(i/total_messages)*100:.1f}%)")
# Get message details
try:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Handle None channel names
if channel_name is None:
channel_name = "Unknown Channel"
logger.warning(f"Message {message_id} has None channel name, using '{channel_name}' instead")
# Check if message already exists in ChromaDB
if ChromaDBService.message_exists(message_id):
already_exists += 1
continue
# Add message to ChromaDB
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
total_synced += 1
# Update channel counts
if channel_name not in channel_counts:
channel_counts[channel_name] = 0
channel_counts[channel_name] += 1
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
except Exception as e:
logger.error(f"Error processing message {message_id}: {e}")
# Print channel statistics
if channel_counts:
logger.info("Messages synced by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
# Return statistics
return {
'total_messages': total_messages,
'total_synced': total_synced,
'already_exists': already_exists,
'channel_counts': channel_counts
}
def main():
"""Main entry point."""
try:
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
print("\n====================================================")
print("STARTING UNRESTRICTED SYNC OF ALL ZULIP MESSAGES")
print(f"Started at: {datetime.now()}")
print("====================================================\n")
# Sync all messages
start_time = datetime.now()
stats = sync_all_messages()
end_time = datetime.now()
duration = end_time - start_time
# Print summary
print("\n====================================================")
print("SYNC COMPLETE")
print(f"Started at: {start_time}")
print(f"Completed at: {end_time}")
print(f"Duration: {duration}")
print(f"Total messages in Zulip: {stats['total_messages']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Messages already in ChromaDB: {stats['already_exists']}")
# Print channel counts
if stats['channel_counts']:
print("\nMessages synced by channel:")
for channel, count in sorted(stats['channel_counts'].items()):
print(f"- {channel}: {count} messages")
print("====================================================\n")
except KeyboardInterrupt:
print("\nSync process interrupted by user")
logger.info("Sync process interrupted by user")
except Exception as e:
print(f"\nError during sync: {e}")
logger.error(f"Error during sync: {e}")
if __name__ == "__main__":
main()

132
sync_and_verify.sh Executable file
View File

@ -0,0 +1,132 @@
#!/bin/bash
# Script to sync all messages from all channels (except sandbox) and verify
# they're in ChromaDB
# Set up logging
LOG_FILE="logs/sync_and_verify_$(date +%Y%m%d_%H%M%S).log"
mkdir -p logs
# Make sure scripts are executable
chmod +x sync_all_channels.py
chmod +x compare_messages.py
chmod +x fix_unknown_channels.py
echo "======================================================"
echo " ZULIP CHANNEL SYNC AND VERIFY PROCESS"
echo " $(date)"
echo " Logging to: $LOG_FILE"
echo "======================================================"
echo ""
echo "=====================================================" | tee -a "$LOG_FILE"
echo "SYNC AND VERIFY PROCESS - $(date)" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
# Activate virtual environment if it exists
if [ -d "venv" ]; then
echo "Activating virtual environment..." | tee -a "$LOG_FILE"
source venv/bin/activate
fi
# Set parameters for the sync
DAYS_TO_SYNC=365 # Used for verification only
MAX_MESSAGES=250
FORCE_SYNC=true
INCLUDE_DIRECT_MESSAGES=true
ALL_MESSAGES=true # Sync all messages regardless of date
TOTAL_BATCHES=1000 # Number of batches to run
echo "Configuration:" | tee -a "$LOG_FILE"
echo "- Maximum messages per batch: $MAX_MESSAGES" | tee -a "$LOG_FILE"
echo "- Force sync: $FORCE_SYNC" | tee -a "$LOG_FILE"
echo "- Include direct messages: $INCLUDE_DIRECT_MESSAGES" | tee -a "$LOG_FILE"
echo "- Sync all messages: $ALL_MESSAGES" | tee -a "$LOG_FILE"
echo "- Number of batches: $TOTAL_BATCHES" | tee -a "$LOG_FILE"
echo "- Days for verification: $DAYS_TO_SYNC" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Step 1: Sync messages in multiple batches
echo "" | tee -a "$LOG_FILE"
echo "Step 1: Syncing messages from all channels (except sandbox)..." | tee -a "$LOG_FILE"
echo "This will exclude messages from IT_Bot and ai_bot" | tee -a "$LOG_FILE"
echo "Running $TOTAL_BATCHES batches of $MAX_MESSAGES messages each" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Build the base command
SYNC_CMD="python sync_all_channels.py --max-messages $MAX_MESSAGES"
if [ "$INCLUDE_DIRECT_MESSAGES" = true ]; then
SYNC_CMD="$SYNC_CMD --include-direct-messages"
fi
if [ "$ALL_MESSAGES" = true ]; then
SYNC_CMD="$SYNC_CMD --all-messages"
fi
# Run multiple batches
for ((i=1; i<=$TOTAL_BATCHES; i++))
do
echo "Running batch $i of $TOTAL_BATCHES..." | tee -a "$LOG_FILE"
BATCH_CMD="$SYNC_CMD"
# If ALL_MESSAGES is true, we should use --force for all batches to ensure we get historical data,
# provided that FORCE_SYNC is also enabled.
# If ALL_MESSAGES is false, then --force (if enabled by FORCE_SYNC) applies only to the first batch.
if [ "$FORCE_SYNC" = true ]; then
if [ "$ALL_MESSAGES" = true ] || [ $i -eq 1 ]; then
BATCH_CMD="$BATCH_CMD --force"
fi
fi
echo "Running: $BATCH_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run the sync command
$BATCH_CMD | tee -a "$LOG_FILE"
# Pause between batches
if [ $i -lt $TOTAL_BATCHES ]; then
echo "Pausing for 5 seconds between batches..." | tee -a "$LOG_FILE"
sleep 5
fi
done
# Step 2: Fix Unknown Channel entries
echo "" | tee -a "$LOG_FILE"
echo "Step 2: Fixing 'Unknown Channel' entries..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run the fix unknown channels script
FIX_CMD="python fix_unknown_channels.py"
echo "Running: $FIX_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
$FIX_CMD | tee -a "$LOG_FILE"
# Step 3: Verify all messages are in ChromaDB
echo "" | tee -a "$LOG_FILE"
echo "Step 3: Verifying all messages are in ChromaDB..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run comparison with the specified number of days for verification
COMPARE_CMD="python compare_messages.py --days $DAYS_TO_SYNC"
echo "Running: $COMPARE_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
$COMPARE_CMD | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
echo "Sync and verification process completed at $(date)" | tee -a "$LOG_FILE"
echo "See $LOG_FILE for complete log" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
echo ""
echo "======================================================"
echo " SYNC AND VERIFICATION PROCESS COMPLETED"
echo " $(date)"
echo " Log file: $LOG_FILE"
echo "======================================================"
# If we activated a virtual environment, deactivate it
if [ -n "$VIRTUAL_ENV" ]; then
deactivate
fi

141
sync_messages.py Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
"""
Script to manually sync messages from Zulip to ChromaDB.
This can be run standalone or as a scheduled cron job.
"""
import os
import sys
import argparse
import logging
from datetime import datetime, timedelta
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.utils.sync_service import MessageSyncService
from app.db.zulip_service import ZulipDatabaseService
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_messages")
def sync_messages(days=None, force=False, batch_size=200, max_messages=5000):
"""
Sync messages from Zulip to ChromaDB.
Args:
days (int): Number of days to look back for messages (default: use sync state)
force (bool): Whether to force sync all messages from the lookback period
batch_size (int): Number of messages to process in each batch
max_messages (int): Maximum total number of messages to sync
"""
# Create the Flask app
app = create_app()
with app.app_context():
sync_service = MessageSyncService()
if force and days:
# If force syncing for a specific number of days, reset the sync state
sync_service.last_sync_time = datetime.now() - timedelta(days=days)
sync_service.last_message_id = None
logger.info(f"Force syncing messages from the last {days} days")
# Count total messages to sync
if force:
# Query to get message count
since_date = datetime.now() - timedelta(days=days if days else 30)
all_messages = ZulipDatabaseService.get_messages_from_it_channels(
since=since_date if since_date else None,
limit=5000
)
total_messages = len(all_messages)
logger.info(f"Found a total of {total_messages} messages to sync")
# Run multiple batches of sync
total_synced = 0
batch_count = 0
# In force mode, we need to manually run multiple batches
if force:
while total_synced < min(total_messages, max_messages):
# Manual sync with our custom batch size
logger.info(f"Running batch {batch_count+1}, synced {total_synced} messages so far")
# For first batch, we already reset the sync state above
# For subsequent batches, we'll use the last_message_id that was set
# Run the sync
sync_service._set_batch_size(batch_size)
sync_service.sync_now()
# Update counters
batch_count += 1
# Check how many we've synced by looking at highest message ID
if sync_service.last_message_id:
# We've synced up to this message ID
synced_in_batch = ZulipDatabaseService.count_messages_up_to_id(
sync_service.last_message_id,
since=since_date if since_date else None
)
# Update total (use max to ensure we don't decrease if count is wrong)
total_synced = max(total_synced, synced_in_batch)
logger.info(f"Processed {synced_in_batch} messages out of {total_messages}")
# If we've synced all messages or reached our limit, break
if synced_in_batch >= total_messages or synced_in_batch >= max_messages:
break
else:
# If no message ID was set, something went wrong
logger.warning("No message ID set after sync, may not have found any messages")
break
else:
# Just run a single sync with default settings
sync_service.sync_now()
# Get the stats
stats = {
'last_sync_time': sync_service.last_sync_time,
'last_message_id': sync_service.last_message_id,
'total_synced': total_synced,
'batches': batch_count
}
logger.info(f"Sync completed. Current state: {stats}")
return stats
if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser(description="Sync messages from Zulip to ChromaDB")
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
parser.add_argument("--max-messages", type=int, default=5000, help="Maximum total number of messages to sync")
args = parser.parse_args()
# Sync messages
stats = sync_messages(
days=args.days,
force=args.force,
batch_size=args.batch_size,
max_messages=args.max_messages
)
print(f"\nSync completed at {datetime.now()}")
print(f"Last sync time: {stats['last_sync_time']}")
print(f"Last message ID: {stats['last_message_id']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Number of batches: {stats['batches']}")

BIN
sync_state.pickle Normal file

Binary file not shown.

42
update_to_openai.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
# Script to migrate from Google Gemini to OpenAI GPT-4o
echo "Migrating from Google Gemini to OpenAI GPT-4o..."
# 1. Activate the virtual environment
source venv/bin/activate
# 2. Install OpenAI package
echo "Installing OpenAI package..."
pip install openai==1.30.4
# 3. Prompt for OpenAI API key
read -p "Enter your OpenAI API key: " openai_api_key
# 4. Update the .env file
echo "Updating .env file..."
if grep -q "OPENAI_API_KEY" .env; then
# Replace existing OPENAI_API_KEY
sed -i "s/OPENAI_API_KEY=.*/OPENAI_API_KEY=$openai_api_key/" .env
else
# Add new OPENAI_API_KEY entry
sed -i "/GEMINI_API_KEY/i # OpenAI GPT-4o (new)\nOPENAI_API_KEY=$openai_api_key\n" .env
fi
# 5. Reset and rebuild the ChromaDB
echo "Do you want to reset and rebuild the ChromaDB? (y/n)"
read -p "> " rebuild_db
if [[ $rebuild_db == "y" || $rebuild_db == "Y" ]]; then
echo "Resetting ChromaDB..."
./reset_chromadb.py
echo "Rebuilding database (syncing past 7 days of messages)..."
python sync_messages.py --force --days 7
fi
echo "Migration completed successfully!"
echo "Please restart your application to apply the changes:"
echo " 1. Stop the current process"
echo " 2. Run ./run_app.sh to start with OpenAI integration"

247
venv/bin/Activate.ps1 Normal file
View File

@ -0,0 +1,247 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

63
venv/bin/activate Normal file
View File

@ -0,0 +1,63 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# Call hash to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
hash -r 2> /dev/null
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV=/home/adminuser/zulip_bots/venv
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/"bin":$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1='(venv) '"${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT='(venv) '
export VIRTUAL_ENV_PROMPT
fi
# Call hash to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
hash -r 2> /dev/null

26
venv/bin/activate.csh Normal file
View File

@ -0,0 +1,26 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV /home/adminuser/zulip_bots/venv
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = '(venv) '"$prompt"
setenv VIRTUAL_ENV_PROMPT '(venv) '
endif
alias pydoc python -m pydoc
rehash

69
venv/bin/activate.fish Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
set -e _OLD_FISH_PROMPT_OVERRIDE
# prevents error when using nested fish instances (Issue #93858)
if functions -q _old_fish_prompt
functions -e fish_prompt
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV /home/adminuser/zulip_bots/venv
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT '(venv) '
end

8
venv/bin/coloredlogs Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from coloredlogs.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/distro Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from distro.distro import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/dotenv Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from dotenv.__main__ import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/f2py Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from numpy.f2py.f2py2e import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/fastavro Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from fastavro.__main__ import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/flask Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from flask.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/httpx Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from httpx import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/huggingface-cli Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from huggingface_hub.commands.huggingface_cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/humanfriendly Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from humanfriendly.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/isympy Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from isympy import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/markdown-it Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from markdown_it.cli.parse import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/nltk Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from nltk.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/nomic Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from nomic.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/normalizer Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli.cli_detect())

8
venv/bin/numpy-config Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from numpy._configtool import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/onnxruntime_test Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from onnxruntime.tools.onnxruntime_test import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/openai Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from openai.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3.11 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pygmentize Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pygments.cmdline import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pyrsa-decrypt Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import decrypt
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(decrypt())

8
venv/bin/pyrsa-encrypt Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import encrypt
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(encrypt())

8
venv/bin/pyrsa-keygen Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import keygen
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(keygen())

8
venv/bin/pyrsa-priv2pub Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.util import private_to_public
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(private_to_public())

8
venv/bin/pyrsa-sign Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import sign
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(sign())

8
venv/bin/pyrsa-verify Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import verify
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(verify())

1
venv/bin/python Symbolic link
View File

@ -0,0 +1 @@
python3.11

1
venv/bin/python3 Symbolic link
View File

@ -0,0 +1 @@
python3.11

1
venv/bin/python3.11 Symbolic link
View File

@ -0,0 +1 @@
/usr/bin/python3.11

8
venv/bin/tqdm Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from tqdm.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/uvicorn Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from uvicorn.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/watchfiles Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from watchfiles.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

Some files were not shown because too many files have changed in this diff