first commit

ilia.gurielidze 2025-05-16 18:00:22 +04:00
commit 8484b0b882
20833 changed files with 3827911 additions and 0 deletions

.cursorignore Normal file (1 line)

@@ -0,0 +1 @@
venv/

.env Normal file (26 lines)

@@ -0,0 +1,26 @@
# Flask configuration
FLASK_ENV=development
SECRET_KEY=dev-secret-key
# Zulip DB configuration
ZULIP_DB_URI=postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip
# ChromaDB configuration
CHROMA_DB_PATH=./chromadb
# Embedding model configuration
USE_NOMIC_EMBEDDINGS=true
OLLAMA_MODEL=nomic-embed-text
OLLAMA_HOST=http://localhost:11434
# AI model configuration
# OpenAI GPT-4o (new)
OPENAI_API_KEY=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
# Gemini API (legacy)
GEMINI_API_KEY=AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s
# Zulip Bot configuration
ZULIP_BOT_EMAIL=IT_bot-bot@zulip.lci.ge
ZULIP_BOT_API_KEY=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
ZULIP_SITE=https://zulip.lci.ge

README.md Normal file (79 lines)

@@ -0,0 +1,79 @@
# Zulip Bot Service
This is a Zulip bot service that provides AI-powered responses to user queries.
## Running as a Service with PM2
The application can be set up to run as a background service using PM2, which ensures it starts automatically on system boot and restarts if it crashes.
### Prerequisites
1. Node.js and npm installed on your system
2. PM2 installed globally (`npm install -g pm2`)
3. Python 3.11+ and required dependencies
### Installation
1. Make sure all required environment variables are set in your `.env` file:
```
ZULIP_BOT_EMAIL=your-bot@example.com
ZULIP_BOT_API_KEY=your-api-key
ZULIP_SITE=https://your-zulip-instance.com
OPENAI_API_KEY=your-openai-api-key
```
2. Make the setup script executable:
```bash
chmod +x pm2_start.sh
```
3. Run the setup script:
```bash
./pm2_start.sh
```
4. The script will:
- Install PM2 if not already installed
- Start the bot as a background service
- Configure PM2 to run at system startup
- Provide instructions for any required sudo commands
### Managing the Service
- **Check status**: `pm2 status`
- **View logs**: `pm2 logs zulip-bot`
- **Restart**: `pm2 restart zulip-bot`
- **Stop**: `pm2 stop zulip-bot`
- **Start (if stopped)**: `pm2 start zulip-bot`
### Troubleshooting
If the service fails to start:
1. Check logs for errors: `pm2 logs zulip-bot`
2. Ensure all environment variables are properly set
3. Verify that the Flask app works by running it directly: `./run_app.sh`
## Manual Setup
If you prefer to run the bot without PM2:
1. Activate the virtual environment:
```bash
source venv/bin/activate
```
2. Run the Flask app:
```bash
./run_app.sh
```
## Development
For development purposes, you can run the Flask app in debug mode:
```bash
export FLASK_APP=app
export FLASK_DEBUG=1
flask run --port=5100
```

app/__init__.py Normal file (146 lines)

@@ -0,0 +1,146 @@
"""
Main application entry point for the Zulip Bot application.
"""
import os
from flask import Flask, request, jsonify
from app.config import load_config
def create_app(config_name=None):
"""Create and configure the Flask application."""
app = Flask(__name__)
# Load configuration
config = load_config(config_name)
app.config.from_object(config)
# Set DEBUG mode for the app
app.config['DEBUG'] = True
# Override any environment flags to disable safety filters
os.environ['GEMINI_NO_SAFETY'] = 'true'
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
# Initialize database connections
from app.db import init_db
init_db(app)
# Check if we're in the main process or a Flask reloader worker
# When Flask reloads in debug mode, it sets an environment variable
# We only want to start services in the main process to avoid duplication
is_flask_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true'
is_main_process = not os.environ.get('WERKZEUG_RUN_MAIN')
# Only start services in the main process or if --no-reload is used
# This prevents duplicate services when using Flask's debug mode
should_start_services = is_flask_reloader_process or is_main_process
# Initialize message sync service and bot service regardless of process
# but only start them in the appropriate process
from app.utils.sync_service import MessageSyncService
sync_service = MessageSyncService(sync_interval=60) # Sync every 60 seconds
# Store sync_service in app context so it can be accessed elsewhere
app.sync_service = sync_service
# Initialize Zulip bot service
from app.utils.bot_service import ZulipBotService
bot_service = ZulipBotService()
# Store bot_service in app context so it can be accessed elsewhere
app.bot_service = bot_service
# Start the services in a better way (avoiding deprecated before_first_request)
# But only if this is the main process or Flask reloader's main thread
with app.app_context():
# Add logging to help diagnose any issues
app.logger.info(f"App initialization, should_start_services={should_start_services}, "
f"is_main_process={is_main_process}, is_flask_reloader_process={is_flask_reloader_process}")
if should_start_services:
# Start the sync service
app.logger.info("Starting sync service...")
sync_service.start()
# Start the bot service and log the result
app.logger.info("Starting Zulip bot service...")
if bot_service.thread and bot_service.thread.is_alive():
app.logger.info("Bot service is already running, not starting again")
else:
bot_service.start()
app.logger.info("Bot service started successfully")
else:
app.logger.info("Skipping service startup in Flask reloader process")
# Register a shutdown function to stop the services
@app.teardown_appcontext
def stop_services(exception=None):
if hasattr(app, 'sync_service'):
app.sync_service.stop()
if hasattr(app, 'bot_service'):
app.bot_service.stop()
# Register blueprints
# This will be implemented later
@app.route('/health')
def health_check():
"""Simple health check endpoint."""
return jsonify({'status': 'ok'})
@app.route('/sync/now')
def trigger_sync():
"""Trigger an immediate sync."""
if hasattr(app, 'sync_service'):
app.sync_service.sync_now()
return jsonify({'status': 'sync_triggered'})
return jsonify({'status': 'error', 'message': 'Sync service not available'}), 500
@app.route('/bot/status')
def bot_status():
"""Get the status of the bot service."""
if hasattr(app, 'bot_service') and app.bot_service.thread and app.bot_service.thread.is_alive():
return jsonify({'status': 'running'})
return jsonify({'status': 'stopped'})
@app.route('/bot/start', methods=['POST'])
def start_bot():
"""Start the bot service."""
if hasattr(app, 'bot_service'):
app.bot_service.start()
return jsonify({'status': 'started'})
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
@app.route('/bot/stop', methods=['POST'])
def stop_bot():
"""Stop the bot service."""
if hasattr(app, 'bot_service'):
app.bot_service.stop()
return jsonify({'status': 'stopped'})
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
@app.route('/bot/test', methods=['POST'])
def test_bot():
"""Send a test message to verify the bot is working."""
if not hasattr(app, 'bot_service'):
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
data = request.get_json()
if not data or 'recipient' not in data or 'content' not in data:
return jsonify({'status': 'error', 'message': 'Missing required fields: recipient, content'}), 400
result = app.bot_service.send_test_message(data['recipient'], data['content'])
return jsonify({'status': 'sent', 'result': result})
@app.route('/bot/reset-cache', methods=['POST'])
def reset_bot_cache():
"""Reset the bot's message cache to fix issues with message processing."""
if not hasattr(app, 'bot_service'):
return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500
result = app.bot_service.reset_cache()
return jsonify({'status': 'success', 'message': result})
return app
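For orientation, a minimal sketch of an entry point that could drive this factory; `run.py` is a hypothetical name (the actual `run_app.sh` is not shown in this commit), and port 5100 simply mirrors the development command in README.md:

```python
# run.py -- hypothetical entry point; the real run_app.sh is not part of this diff.
from app import create_app

# create_app() loads config, connects the databases and starts the sync/bot threads.
app = create_app()

if __name__ == "__main__":
    # Port 5100 matches the `flask run --port=5100` example in README.md.
    app.run(host="0.0.0.0", port=5100, debug=app.config.get("DEBUG", False))
```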

app/config/__init__.py Normal file (106 lines)

@@ -0,0 +1,106 @@
"""
Configuration module for the application.
Loads environment variables and provides configuration values.
"""
import os
from dotenv import load_dotenv
# Load environment variables from .env file (if it exists)
load_dotenv()
class Config:
"""Configuration class for the application."""
# Zulip API settings
ZULIP_EMAIL = os.getenv("ZULIP_EMAIL", "IT_bot-bot@zulip.lci.ge")
ZULIP_API_KEY = os.getenv("ZULIP_API_KEY", "ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z")
ZULIP_SITE = os.getenv("ZULIP_SITE", "https://zulip.lci.ge")
# Zulip database settings
ZULIP_DB_HOST = os.getenv("ZULIP_DB_HOST", "zulip.lci.ge")
ZULIP_DB_PORT = os.getenv("ZULIP_DB_PORT", "5432")
ZULIP_DB_NAME = os.getenv("ZULIP_DB_NAME", "zulip")
ZULIP_DB_USER = os.getenv("ZULIP_DB_USER", "zulip")
ZULIP_DB_PASSWORD = os.getenv("ZULIP_DB_PASSWORD", "BlackMoonSky89")
# Database URL
SQLALCHEMY_DATABASE_URI = f"postgresql://{ZULIP_DB_USER}:{ZULIP_DB_PASSWORD}@{ZULIP_DB_HOST}:{ZULIP_DB_PORT}/{ZULIP_DB_NAME}"
# ChromaDB settings
CHROMADB_PATH = os.getenv("CHROMADB_PATH", "./chromadb")
CHROMADB_COLLECTION = os.getenv("CHROMADB_COLLECTION", "zulip_messages")
# Channels to monitor (IT Discussions, IT Knowledge, IT Support)
CHANNELS_TO_MONITOR = [
"IT Discussions",
"IT Knowledge",
"IT Support"
]
# AI model settings
# OpenAI settings (primary)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")
# Gemini API settings (legacy)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s")
# Embedding settings
USE_NOMIC_EMBEDDINGS = os.getenv("USE_NOMIC_EMBEDDINGS", "False").lower() == "true"
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "4sCOTMgEg5rXeXU0XMmPeucSBMl5xd4FMhyV2UDW")
# Ollama settings
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "nomic-embed-text")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
# Flask settings
SECRET_KEY = os.getenv("SECRET_KEY", "your_secret_key_here")
DEBUG = os.getenv("DEBUG", "True").lower() == "true"
# Bot settings
BOT_NAME = "IT_Bot"
BOT_TRIGGER = f"@**{BOT_NAME}**"
# Rate limiting settings
RATE_LIMIT_PERIOD = int(os.getenv("RATE_LIMIT_PERIOD", "60")) # 60 seconds
RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "10")) # 10 requests per period
class DevelopmentConfig(Config):
"""Development configuration."""
DEBUG = True
class ProductionConfig(Config):
"""Production configuration."""
DEBUG = False
class TestingConfig(Config):
"""Testing configuration."""
TESTING = True
# Use a test database
SQLALCHEMY_DATABASE_URI = os.getenv('TEST_SQLALCHEMY_DATABASE_URI', 'postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip_test')
# Use a test ChromaDB path
CHROMADB_PATH = os.getenv('TEST_CHROMADB_PATH', './chromadb_test')
# Configuration dictionary
config_dict = {
'development': DevelopmentConfig,
'production': ProductionConfig,
'testing': TestingConfig,
'default': DevelopmentConfig
}
def load_config(config_name=None):
"""
Load the appropriate configuration based on environment variables or the provided config_name.
Args:
config_name (str, optional): Name of the configuration to load. Defaults to None.
Returns:
Config: Configuration object
"""
if not config_name:
config_name = os.getenv('FLASK_ENV', 'default')
return config_dict.get(config_name, config_dict['default'])
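As a quick illustration of the selection logic above (using only the classes defined in this module), an explicit name overrides the environment and unknown names fall back to the default entry:

```python
import os

from app.config import load_config, DevelopmentConfig, ProductionConfig

# An explicit argument wins outright.
assert load_config("production") is ProductionConfig

# Without an argument, FLASK_ENV decides...
os.environ["FLASK_ENV"] = "development"
assert load_config() is DevelopmentConfig

# ...and anything not listed in config_dict falls back to the 'default' entry.
os.environ["FLASK_ENV"] = "staging"
assert load_config() is DevelopmentConfig
```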

app/db/__init__.py Normal file (105 lines)

@@ -0,0 +1,105 @@
"""
Database module for the application.
Handles connections to PostgreSQL (Zulip DB) and ChromaDB.
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
import chromadb
# SQLAlchemy base class for models
Base = declarative_base()
# Global variables for SQLAlchemy
db_engine = None
db_session = None
# Global variable for ChromaDB
chroma_client = None
chroma_collection = None
def init_db(app):
"""
Initialize database connections.
Args:
app: Flask application object
"""
global db_engine, db_session, chroma_client, chroma_collection
# Initialize SQLAlchemy engine and session
db_engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
db_session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=db_engine))
# Set query property for models
Base.query = db_session.query_property()
# Initialize ChromaDB
try:
# Set allow_reset to True to prevent "Add of existing embedding ID" warnings
chroma_client = chromadb.PersistentClient(
path=app.config['CHROMADB_PATH'],
settings=chromadb.Settings(
allow_reset=True,
anonymized_telemetry=False,
is_persistent=True
)
)
# Import here to avoid circular imports
from app.db.chroma_service import CustomEmbeddingFunction
# Create embedding function with setting from config
try:
# Always use Ollama since it's more reliable
embedding_function = CustomEmbeddingFunction(use_nomic=False)
# Get or create ChromaDB collection for Zulip messages with custom embedding function
chroma_collection = chroma_client.get_or_create_collection(
name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
metadata={
"hnsw:space": "cosine",
"hnsw:allow_replace_deleted": True # Allow replacing deleted vectors
},
embedding_function=embedding_function
)
except Exception as e:
print(f"Error with embedding function: {e}")
print("Creating collection without embedding function")
# Create collection without embedding function
chroma_collection = chroma_client.get_or_create_collection(
name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
metadata={
"hnsw:space": "cosine",
"hnsw:allow_replace_deleted": True # Allow replacing deleted vectors
}
)
except Exception as e:
print(f"Critical error initializing ChromaDB: {e}")
print("ChromaDB functionality will not be available")
chroma_client = None
chroma_collection = None
# Register teardown function to remove database sessions
@app.teardown_appcontext
def shutdown_session(exception=None):
"""Remove the database session at the end of the request."""
db_session.remove()
def get_db_session():
"""
Get the current database session.
Returns:
SQLAlchemy session object
"""
return db_session
def get_chroma_collection():
"""
Get the ChromaDB collection for Zulip messages.
Returns:
ChromaDB collection object
"""
return chroma_collection
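A minimal sketch of how a standalone script could bring these connections up and use the two accessors; it assumes only what this module and app/config define (init_db reads SQLALCHEMY_DATABASE_URI, CHROMADB_PATH and CHROMADB_COLLECTION from the Flask config):

```python
from flask import Flask

from app.config import load_config
from app.db import init_db, get_db_session, get_chroma_collection

# A bare Flask app is enough here: init_db only needs the config values.
app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Both accessors return module-level globals populated by init_db().
session = get_db_session()
collection = get_chroma_collection()  # None if ChromaDB failed to initialise

print("SQLAlchemy session ready:", session is not None)
if collection is not None:
    print("ChromaDB documents:", collection.count())
```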

app/db/chroma_service.py Normal file (418 lines)

@@ -0,0 +1,418 @@
"""
Service for storing and retrieving embedded messages in ChromaDB.
"""
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
import chromadb
from chromadb.utils import embedding_functions
from app.db import get_chroma_collection
from app.utils.embeddings import EmbeddingService
from app.utils.contextual_retrieval.context_service import ContextService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.config import Config
import logging
# Set up logging
logger = logging.getLogger("chroma_service")
class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
"""Custom embedding function using our EmbeddingService."""
def __init__(self, use_nomic: bool = True):
"""
Initialize the custom embedding function.
Args:
use_nomic: Whether to use Nomic (True) or Ollama (False) for embeddings
"""
self.use_nomic = use_nomic
def __call__(self, texts: List[str]) -> List[List[float]]:
"""
Generate embeddings for a list of texts.
Args:
texts: List of texts to generate embeddings for
Returns:
List of embeddings as float arrays
"""
return EmbeddingService.get_embeddings(texts, use_nomic=self.use_nomic)
class ChromaDBService:
"""Service for storing and retrieving embedded messages in ChromaDB."""
# Use Ollama embeddings by default for reliability
_embedding_function = CustomEmbeddingFunction(use_nomic=False)
@staticmethod
def format_message_content(content, channel_name, subject, sender_name, date_sent):
"""
Format message content with metadata but without contextual enrichment.
Args:
content (str): Original message content
channel_name (str): Name of the channel
subject (str): Subject of the message
sender_name (str): Name of the sender
date_sent (datetime): Date the message was sent
Returns:
str: Formatted message content with basic metadata
"""
# Format date in a readable format
date_str = date_sent.strftime("%Y-%m-%d %H:%M:%S")
# Replace None values with empty strings
content = content or ""
channel_name = channel_name or "Unknown Channel"
subject = subject or "No Subject"
sender_name = sender_name or "Unknown Sender"
# Return plain content with minimal metadata prefix
return f"Channel: {channel_name} | Subject: {subject} | Sent by: {sender_name} | Date: {date_str}\n\n{content}"
@staticmethod
def sanitize_metadata(metadata):
"""
Sanitize metadata to ensure no None values.
Args:
metadata (dict): Metadata dictionary
Returns:
dict: Sanitized metadata with no None values
"""
sanitized = {}
for key, value in metadata.items():
if value is None:
if key == "channel":
sanitized[key] = "Unknown Channel"
elif key == "subject":
sanitized[key] = "No Subject"
elif key == "sender":
sanitized[key] = "Unknown Sender"
elif key == "timestamp":
sanitized[key] = datetime.now().isoformat()
else:
sanitized[key] = ""
else:
sanitized[key] = value
return sanitized
@staticmethod
def add_message(message_id, content, channel_name, subject, sender_name, date_sent):
"""
Add a message to the ChromaDB collection with contextual information.
Args:
message_id (str): ID of the message
content (str): Content of the message
channel_name (str): Name of the channel
subject (str): Subject of the message
sender_name (str): Name of the sender
date_sent (datetime): Date the message was sent
Returns:
bool: True if successful, False otherwise
"""
try:
# Check if message already exists to avoid duplicates
if ChromaDBService.message_exists(message_id):
logger.info(f"Message ID {message_id} already exists in ChromaDB, skipping")
return True
collection = get_chroma_collection()
# Create metadata and sanitize to prevent None values
metadata = {
"channel": channel_name,
"subject": subject,
"sender": sender_name,
"timestamp": date_sent.isoformat() if date_sent else datetime.now().isoformat(),
"source": "zulip"
}
# Sanitize metadata to replace None values
metadata = ChromaDBService.sanitize_metadata(metadata)
# Format the content to include structured context information
formatted_content = ChromaDBService.format_message_content(
content, channel_name, subject, sender_name, date_sent
)
# Generate embeddings using our custom embedding function
embeddings = ChromaDBService._embedding_function([formatted_content])
# Add to ChromaDB
collection.add(
ids=[str(message_id)],
documents=[formatted_content],
metadatas=[metadata],
embeddings=embeddings if embeddings else None
)
# Also add to BM25 index for hybrid search
BM25Service.add_document(formatted_content, str(message_id))
logger.info(f"Successfully added message ID {message_id} to ChromaDB")
return True
except Exception as e:
logger.error(f"Error adding message to ChromaDB: {e}")
return False
@staticmethod
def search_similar(query_text, n_results=5, filter_criteria=None, use_hybrid=True, _internal_call=False):
"""
Search for similar messages in ChromaDB with improved contextual relevance.
Args:
query_text (str): Text to search for
n_results (int): Number of results to return
filter_criteria (dict): Metadata filter criteria
use_hybrid (bool): Whether to use hybrid search or just vector search
_internal_call (bool): Internal parameter to prevent circular calls
Returns:
dict: Search results from ChromaDB
"""
try:
logger.info("Using temporary ChromaDB client to prevent duplicate embeddings")
collection = get_chroma_collection()
# If hybrid search is disabled or this is an internal call from HybridSearchService,
# fall back to vector-only search to prevent circular references
if not use_hybrid or _internal_call:
try:
# Generate query embedding locally instead of using the collection's embedding function
query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]
# Perform search with embeddings using API directly to prevent collection modifications
# Create a temporary read-only client just for search to avoid modifying the main collection
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Perform search with embeddings
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as e:
logger.error(f"Error with vector search: {e}")
logger.info("Falling back to direct text query")
# Fallback to direct text query if embeddings fail
# But use a similar approach with a temporary client
try:
# Create temporary client just for search
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Use CustomEmbeddingFunction for just this query
from app.db.chroma_service import CustomEmbeddingFunction
embedding_func = CustomEmbeddingFunction(use_nomic=False)
# Get embedding for query
query_embedding = embedding_func([query_text])[0]
# Search using the embedding
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as text_query_error:
logger.error(f"Error with text query: {text_query_error}")
# Last resort: fetch documents matching the filter and return the first n without ranking
all_docs = collection.get(where=filter_criteria, include=["metadatas", "documents", "embeddings"])
# Return an empty result structure if no docs found
if not all_docs or not all_docs.get('ids'):
return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
return {"ids": [all_docs['ids'][:n_results]],
"documents": [all_docs['documents'][:n_results]],
"metadatas": [all_docs['metadatas'][:n_results]],
"distances": [[1.0] * min(n_results, len(all_docs['ids']))]}
# Use BM25 + vector search from hybrid search module
# We're not calling it directly here to avoid circular imports
try:
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
# Use hybrid search
results = HybridSearchService.hybrid_search(
query=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
rerank=True # Enable reranking
)
# Convert to ChromaDB query result format
formatted_results = {
'ids': [[doc['id'] for doc in results]],
'documents': [[doc['content'] for doc in results]],
'metadatas': [[doc.get('metadata', {}) for doc in results]],
'distances': [[1.0 - doc.get('combined_score', 0) for doc in results]]
}
return formatted_results
except ImportError:
logger.warning("Hybrid search module not available, falling back to vector search")
# Fall back to vector search if hybrid search module not available
# Create temporary client for search
temp_client = chromadb.PersistentClient(
path=Config.CHROMADB_PATH,
settings=chromadb.Settings(
anonymized_telemetry=False,
is_persistent=True,
allow_reset=False
)
)
# Get the existing collection without an embedding function
temp_collection = temp_client.get_collection(
name=Config.CHROMADB_COLLECTION or "zulip_messages"
)
# Generate embedding
query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]
# Perform search
results = temp_collection.query(
query_embeddings=[query_embedding],
n_results=n_results,
where=filter_criteria,
include=["metadatas", "documents", "distances"]
)
# Close temporary client
del temp_client
return results
except Exception as e:
logger.error(f"Error searching ChromaDB: {e}")
# Return an empty result set rather than None
return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
@staticmethod
def delete_message(message_id):
"""
Delete a message from ChromaDB.
Args:
message_id (str): ID of the message to delete
Returns:
bool: True if successful, False otherwise
"""
try:
collection = get_chroma_collection()
collection.delete(ids=[str(message_id)])
# Also update BM25 index - for simplicity, we'll rebuild it from ChromaDB
# In a production scenario, you might want a more efficient approach
all_results = collection.get()
if all_results and all_results['ids']:
BM25Service.index_documents(all_results['documents'], all_results['ids'])
return True
except Exception as e:
logger.error(f"Error deleting message from ChromaDB: {e}")
return False
@staticmethod
def get_message_by_id(message_id):
"""
Get a message from ChromaDB by ID.
Args:
message_id (str): ID of the message to retrieve
Returns:
dict: Message data or None if not found
"""
try:
collection = get_chroma_collection()
result = collection.get(ids=[str(message_id)])
if result['ids'] and len(result['ids']) > 0:
return {
'id': result['ids'][0],
'content': result['documents'][0],
'metadata': result['metadatas'][0]
}
return None
except RecursionError:
logger.error(f"Recursion error when getting message ID {message_id} from ChromaDB")
return None
except Exception as e:
logger.error(f"Error getting message from ChromaDB: {e}")
return None
@staticmethod
def message_exists(message_id):
"""
Check if a message exists in ChromaDB.
Args:
message_id (str): ID of the message to check
Returns:
bool: True if exists, False otherwise
"""
try:
collection = get_chroma_collection()
result = collection.get(ids=[str(message_id)], include=[])
return len(result['ids']) > 0
except Exception as e:
logger.error(f"Error checking if message exists in ChromaDB: {e}")
return False
@staticmethod
def switch_embedding_method(use_nomic: bool):
"""
Switch between Nomic and Ollama embedding methods.
Args:
use_nomic: Whether to use Nomic (True) or Ollama (False)
"""
ChromaDBService._embedding_function = CustomEmbeddingFunction(use_nomic=use_nomic)
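An illustrative add/search round trip through this service; the setup mirrors the sketch after app/db/__init__.py so the example stands alone, and the message values are invented:

```python
from datetime import datetime

from flask import Flask

from app.config import load_config
from app.db import init_db
from app.db.chroma_service import ChromaDBService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Index one message; add_message() skips IDs that already exist.
ChromaDBService.add_message(
    message_id=12345,
    content="VPN drops every hour; resetting the adapter helps temporarily.",
    channel_name="IT Support",
    subject="VPN instability",
    sender_name="Example User",
    date_sent=datetime.now(),
)

# Hybrid (BM25 + vector) search is the default; results use ChromaDB's
# nested-list query format.
results = ChromaDBService.search_similar(
    "VPN keeps disconnecting",
    n_results=5,
    filter_criteria={"channel": "IT Support"},
)
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, doc[:80])
```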

@@ -0,0 +1,120 @@
"""
Database integration service.
Combines functionality from both Zulip and ChromaDB services.
"""
from datetime import datetime
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService
class DatabaseIntegrationService:
"""
Service for integrating between Zulip DB and ChromaDB.
Handles the synchronization of messages from Zulip to ChromaDB.
"""
@staticmethod
def sync_messages_to_chromadb(days_ago=30, limit=1000):
"""
Sync recent messages from Zulip to ChromaDB.
Args:
days_ago (int): Number of days to look back
limit (int): Maximum number of messages to sync
Returns:
dict: Statistics about the sync operation
"""
# Get messages from Zulip
messages = ZulipDatabaseService.get_messages_from_it_channels(days_ago=days_ago, limit=limit)
stats = {
"total_messages": len(messages),
"new_messages": 0,
"already_existing": 0,
"failed": 0
}
# Process each message
for message in messages:
# Check if message already exists in ChromaDB
if ChromaDBService.message_exists(message.id):
stats["already_existing"] += 1
continue
# Get channel name for the message
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
# Get sender name (we don't have that information readily available from the query)
# In a real implementation, we would join with the UserProfile table
sender_name = f"User ID: {message.sender_id}"
# Add message to ChromaDB
success = ChromaDBService.add_message(
message_id=message.id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
stats["new_messages"] += 1
else:
stats["failed"] += 1
return stats
@staticmethod
def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
"""
Search for messages in the knowledge base using hybrid search.
Args:
query_text (str): Text to search for
n_results (int): Number of results to return
filter_channel (str): Optional channel name to filter results
use_hybrid (bool): Whether to use hybrid search or just vector search
use_reranking (bool): Whether to apply reranking to the results
Returns:
list: List of search results
"""
# Prepare filter criteria
filter_criteria = None
if filter_channel:
filter_criteria = {"channel": filter_channel}
# Decide which search method to use
if use_hybrid:
# Use the hybrid search service
results = HybridSearchService.hybrid_search(
query=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
rerank=use_reranking
)
return results
else:
# Use the standard ChromaDB search
results = ChromaDBService.search_similar(
query_text=query_text,
n_results=n_results,
filter_criteria=filter_criteria,
use_hybrid=False
)
# Format results
formatted_results = []
if results and results['ids'] and len(results['ids'][0]) > 0:
for i in range(len(results['ids'][0])):
formatted_results.append({
'id': results['ids'][0][i],
'content': results['documents'][0][i],
'metadata': results['metadatas'][0][i],
'score': results['distances'][0][i] if 'distances' in results else None
})
return formatted_results
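A usage sketch for this integration layer. Note that the file name for this module is not visible in the diff header above, so the import path app.db.integration_service below is only a placeholder:

```python
from flask import Flask

from app.config import load_config
from app.db import init_db
# Placeholder import path: the real module name is not shown in this diff.
from app.db.integration_service import DatabaseIntegrationService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Pull the last week of IT-channel traffic into ChromaDB.
stats = DatabaseIntegrationService.sync_messages_to_chromadb(days_ago=7, limit=500)
print(stats)  # {'total_messages': ..., 'new_messages': ..., 'already_existing': ..., 'failed': ...}

# Hybrid search with reranking, limited to one channel.
hits = DatabaseIntegrationService.search_knowledge_base(
    "printer offline after Windows update",
    n_results=5,
    filter_channel="IT Support",
)
for hit in hits:
    print(hit["id"], hit.get("metadata", {}).get("subject"))
```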

app/db/zulip_service.py Normal file (209 lines)

@@ -0,0 +1,209 @@
"""
Service for querying messages from the Zulip database.
"""
from datetime import datetime, timedelta
from sqlalchemy import and_, or_
from app.db import get_db_session
from app.models.zulip import Message, Stream, Recipient, UserProfile, IT_RECIPIENT_IDS
class ZulipDatabaseService:
"""Service for querying messages from the Zulip database."""
@staticmethod
def get_messages_from_it_channels(days_ago=None, limit=1000, since=None):
"""
Get recent messages from IT channels.
Args:
days_ago (int): Number of days to look back (optional)
limit (int): Maximum number of messages to return
since (datetime): Get messages after this datetime (optional)
Returns:
list: List of Message objects
"""
session = get_db_session()
# Build the query based on parameters
query = session.query(Message).filter(
Message.recipient_id.in_(IT_RECIPIENT_IDS)
)
# Add date filter if specified
if since:
query = query.filter(Message.date_sent >= since)
elif days_ago:
start_date = datetime.now() - timedelta(days=days_ago)
query = query.filter(Message.date_sent >= start_date)
# Get results
messages = query.order_by(Message.id.desc()).limit(limit).all()
return messages
@staticmethod
def get_messages_newer_than_id(message_id, limit=100):
"""
Get messages with ID greater than the specified ID.
Args:
message_id (int): Get messages with ID greater than this
limit (int): Maximum number of messages to return
Returns:
list: List of Message objects
"""
session = get_db_session()
messages = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.id > message_id
)
).order_by(Message.id.asc()).limit(limit).all()
return messages
@staticmethod
def get_message_by_id(message_id):
"""
Get a specific message by ID.
Args:
message_id (int): ID of the message to retrieve
Returns:
Message: Message object or None if not found
"""
session = get_db_session()
return session.query(Message).filter(Message.id == message_id).first()
@staticmethod
def search_messages(search_term, days_ago=365, limit=100):
"""
Search for messages containing a specific term.
Args:
search_term (str): Term to search for
days_ago (int): Number of days to look back
limit (int): Maximum number of messages to return
Returns:
list: List of Message objects matching the search
"""
session = get_db_session()
start_date = datetime.now() - timedelta(days=days_ago)
# Simple case-insensitive ILIKE search over content and subject
messages = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.date_sent >= start_date,
or_(
Message.content.ilike(f'%{search_term}%'),
Message.subject.ilike(f'%{search_term}%')
)
)
).order_by(Message.date_sent.desc()).limit(limit).all()
return messages
@staticmethod
def get_channel_name_for_message(message):
"""
Get the channel name for a message.
Args:
message (Message): Message object
Returns:
str: Channel name or "Unknown Channel" if not found
"""
session = get_db_session()
try:
if not message or not message.recipient_id:
return "Unknown Channel"
# First, get the recipient to determine type
recipient = session.query(Recipient).filter(
Recipient.id == message.recipient_id
).first()
if not recipient:
return "Unknown Channel"
# Check recipient type (1 = stream, 2 = user, 3 = huddle)
if recipient.type != 1:
# For direct messages or huddles
return "Direct Message" if recipient.type == 2 else "Group Message"
# For stream messages, get the stream name
stream = session.query(Stream).filter(
Stream.recipient_id == message.recipient_id
).first()
# Return the name or a default value
return stream.name if stream and stream.name else "Unknown Channel"
except Exception as e:
# Log the error but don't crash - return a default value
print(f"Error getting channel name for message {message.id if message else 'unknown'}: {e}")
return "Unknown Channel"
@staticmethod
def get_sender_name_for_message(message):
"""
Get the sender name for a message.
Args:
message (Message): Message object
Returns:
str: Sender full name or 'Unknown User' if not found
"""
session = get_db_session()
try:
if not message or not message.sender_id:
return "Unknown User"
user = session.query(UserProfile).filter(
UserProfile.id == message.sender_id
).first()
return user.full_name if user and user.full_name else "Unknown User"
except Exception as e:
# Log the error but don't crash - return a default value
print(f"Error getting sender name for message {message.id if message else 'unknown'}: {e}")
return "Unknown User"
@staticmethod
def count_messages_up_to_id(message_id, since=None):
"""
Count messages with ID less than or equal to the specified ID.
Args:
message_id (int): Count messages with ID <= this
since (datetime): Only count messages after this datetime (optional)
Returns:
int: Count of messages
"""
session = get_db_session()
# Build the query
query = session.query(Message).filter(
and_(
Message.recipient_id.in_(IT_RECIPIENT_IDS),
Message.id <= message_id
)
)
# Add date filter if specified
if since:
query = query.filter(Message.date_sent >= since)
# Count the messages
count = query.count()
return count
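A short sketch of the query helpers above, assuming the database has been initialised as in the earlier sketches:

```python
from flask import Flask

from app.config import load_config
from app.db import init_db
from app.db.zulip_service import ZulipDatabaseService

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)

# Last 24 hours of traffic from the monitored IT channels.
recent = ZulipDatabaseService.get_messages_from_it_channels(days_ago=1, limit=50)
for msg in recent:
    channel = ZulipDatabaseService.get_channel_name_for_message(msg)
    sender = ZulipDatabaseService.get_sender_name_for_message(msg)
    print(f"[{channel}] {sender}: {msg.subject}")

# Incremental polling: fetch anything newer than the oldest message in this batch.
if recent:
    newer = ZulipDatabaseService.get_messages_newer_than_id(recent[-1].id, limit=100)
    print(f"{len(newer)} newer messages")
```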

app/models/__init__.py Normal file (10 lines)

@@ -0,0 +1,10 @@
"""
Models module for the application.
Contains SQLAlchemy model definitions for Zulip database tables.
"""
from app.db import Base
# Import models to make them available through the models module
from app.models.zulip import Recipient, Stream, Message, UserProfile, IT_CHANNELS, IT_RECIPIENT_IDS
# This will be populated as we define models in the next steps

app/models/zulip.py Normal file (96 lines)

@@ -0,0 +1,96 @@
"""
SQLAlchemy models for the Zulip database tables.
"""
from sqlalchemy import Column, Integer, String, Text, Boolean, SmallInteger, DateTime, ForeignKey, BigInteger
from sqlalchemy.orm import relationship
from app.db import Base
class Recipient(Base):
"""
Model for zerver_recipient table in Zulip DB.
Recipients can be of different types (e.g., stream, user, huddle).
"""
__tablename__ = 'zerver_recipient'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
type_id = Column(Integer)
type = Column(SmallInteger) # 1 for stream, 2 for user, 3 for huddle
# Relationships
messages = relationship("Message", back_populates="recipient")
stream = relationship("Stream", back_populates="recipient", uselist=False)
class Stream(Base):
"""
Model for zerver_stream table in Zulip DB.
Represents a Zulip channel (called stream in Zulip terminology).
"""
__tablename__ = 'zerver_stream'
__table_args__ = {'schema': 'zulip'}
id = Column(BigInteger, primary_key=True)
name = Column(String)
date_created = Column(DateTime)
deactivated = Column(Boolean)
description = Column(String)
rendered_description = Column(Text)
invite_only = Column(Boolean)
recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
realm_id = Column(Integer)
# Relationships
recipient = relationship("Recipient", back_populates="stream")
class Message(Base):
"""
Model for zerver_message table in Zulip DB.
Represents a message sent in Zulip.
"""
__tablename__ = 'zerver_message'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
sender_id = Column(Integer, ForeignKey('zulip.zerver_userprofile.id'))
recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
subject = Column(String)
content = Column(Text)
rendered_content = Column(Text)
date_sent = Column(DateTime)
type = Column(SmallInteger) # 1 for stream message, 2 for private message
has_attachment = Column(Boolean)
has_image = Column(Boolean)
has_link = Column(Boolean)
is_channel_message = Column(Boolean)
realm_id = Column(Integer)
# Relationships
sender = relationship("UserProfile", back_populates="messages")
recipient = relationship("Recipient", back_populates="messages")
class UserProfile(Base):
"""
Model for zerver_userprofile table in Zulip DB.
Represents a Zulip user.
"""
__tablename__ = 'zerver_userprofile'
__table_args__ = {'schema': 'zulip'}
id = Column(Integer, primary_key=True)
email = Column(String)
full_name = Column(String)
is_active = Column(Boolean)
realm_id = Column(Integer)
# Relationships
messages = relationship("Message", back_populates="sender")
# Constants for the channels we're monitoring
IT_CHANNELS = {
"IT Discussions": 5, # id = 5, recipient_id = 16
"IT Knowledge": 17, # id = 17, recipient_id = 47
"IT Support": 16 # id = 16, recipient_id = 43
}
# Recipient IDs for the channels we're monitoring
IT_RECIPIENT_IDS = [16, 47, 43]
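To show how these models fit together, a hedged query sketch: a stream message and its channel share a recipient row, so the join goes through recipient_id, and the sender comes from zerver_userprofile (database setup repeated so the snippet stands alone):

```python
from flask import Flask

from app.config import load_config
from app.db import init_db, get_db_session
from app.models.zulip import Message, Stream, UserProfile

app = Flask(__name__)
app.config.from_object(load_config())
init_db(app)
session = get_db_session()

rows = (
    session.query(Message.subject, Stream.name, UserProfile.full_name)
    .join(Stream, Stream.recipient_id == Message.recipient_id)   # message -> channel
    .join(UserProfile, UserProfile.id == Message.sender_id)      # message -> sender
    .filter(Stream.name.in_(["IT Discussions", "IT Knowledge", "IT Support"]))
    .order_by(Message.date_sent.desc())
    .limit(10)
    .all()
)
for subject, stream_name, sender in rows:
    print(f"{stream_name} | {subject} | {sender}")
```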

app/utils/__init__.py Normal file (33 lines)

@@ -0,0 +1,33 @@
"""
Utilities module for the application.
Contains helper functions and utilities for the application.
"""
import importlib
import sys
import numpy as np
def patch_chromadb_numpy():
"""
Restore NumPy's np.NaN alias so ChromaDB keeps working under NumPy 2.0.
NumPy 2.0 removed the np.NaN spelling, which ChromaDB's brute_force_index
module still references; this function points np.NaN back at np.nan.
"""
try:
# Get the module where the error occurs
from chromadb.segment.impl.vector import brute_force_index
# Restore the np.NaN alias so the module's existing references keep resolving
if not hasattr(np, 'NaN'):
np.NaN = np.nan
print("NumPy compatibility patch applied for ChromaDB")
return True
except ImportError:
print("Could not patch ChromaDB: module not found")
return False
except Exception as e:
print(f"Error patching ChromaDB: {e}")
return False
# This module will be populated with utility functions in later steps
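A tiny sketch of the intended ordering, mirroring how create_app() applies the patch before touching ChromaDB; ./chromadb is the default CHROMADB_PATH from app/config:

```python
from app.utils import patch_chromadb_numpy

# Make NumPy expose the legacy np.NaN alias before any ChromaDB index code needs it.
patch_chromadb_numpy()

import chromadb

client = chromadb.PersistentClient(path="./chromadb")
print([c.name for c in client.list_collections()])
```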

app/utils/ai_service.py Normal file (372 lines)

@@ -0,0 +1,372 @@
"""
AI service for OpenAI API integration.
This module provides a class for generating responses using the OpenAI API.
It handles authentication, prompt engineering, error handling, and retries.
"""
import os
import time
import logging
import hashlib
import functools
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
from openai import OpenAI
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("ai_service")
# Simple in-memory cache for responses
RESPONSE_CACHE = {}
CACHE_TTL = 3600 # 1 hour in seconds
class OpenAIService:
"""Service for generating responses using the OpenAI API."""
def __init__(self, api_key: Optional[str] = None,
model_name: str = "gpt-4o",
enable_cache: bool = True,
cache_ttl: int = CACHE_TTL,
rate_limit: int = 60): # 60 requests per minute
"""
Initialize the OpenAI service.
Args:
api_key: API key for OpenAI. If None, uses OPENAI_API_KEY environment variable.
model_name: Name of the OpenAI model to use.
enable_cache: Whether to enable response caching.
cache_ttl: Time-to-live for cached responses in seconds.
rate_limit: Maximum number of requests allowed per minute.
"""
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
if not self.api_key:
raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
self.model_name = model_name
self.enable_cache = enable_cache
self.cache_ttl = cache_ttl
self.rate_limit = rate_limit
# Rate limiting state
self.request_timestamps = []
# Configure OpenAI API
self.client = OpenAI(api_key=self.api_key)
logger.info(f"Initialized OpenAIService with model: {model_name}")
def _check_rate_limit(self):
"""
Check if the rate limit has been reached.
Waits if necessary to stay within the rate limit.
"""
current_time = time.time()
# Remove timestamps older than 60 seconds
self.request_timestamps = [ts for ts in self.request_timestamps if current_time - ts < 60]
# Check if we've reached the rate limit
if len(self.request_timestamps) >= self.rate_limit:
# Calculate how long to wait
oldest_timestamp = min(self.request_timestamps)
sleep_time = 60 - (current_time - oldest_timestamp)
if sleep_time > 0:
logger.warning(f"Rate limit reached. Waiting {sleep_time:.2f} seconds...")
time.sleep(sleep_time)
# Add current timestamp to the list
self.request_timestamps.append(time.time())
def _detect_language(self, text: str) -> str:
"""
Detect the language of a text string.
Args:
text: The text to detect the language of.
Returns:
A language code, e.g. 'en' for English, 'ka' for Georgian.
"""
try:
# Use a very small prompt to detect language
if not text:
return 'en' # Default to English for empty text
# Simple language detection using a dedicated small request
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": "You are a language detection service. Respond with only the ISO language code ('en' for English, 'ka' for Georgian, etc.)."},
{"role": "user", "content": f"Detect the language of this text: {text[:100]}"}
],
max_tokens=10,
temperature=0
)
language_code = response.choices[0].message.content.strip().lower()
logger.info(f"Detected language: {language_code}")
# Validate and default to English for any issues
if language_code not in ['en', 'ka']:
return 'en'
return language_code
except Exception as e:
logger.error(f"Error detecting language: {e}")
return 'en' # Default to English on error
def _generate_cache_key(self, query: str, context: List[Dict[str, Any]]) -> str:
"""
Generate a cache key for the query and context.
Args:
query: The query string.
context: The context documents.
Returns:
A string hash key for caching.
"""
# Create a string representation of the context
context_str = ""
for doc in context:
if 'content' in doc:
context_str += doc['content'][:100] # Use just the beginning for performance
# Create a hash of the query and context
key_str = query + context_str
return hashlib.md5(key_str.encode('utf-8')).hexdigest()
def _get_cached_response(self, cache_key: str) -> Optional[str]:
"""
Get a cached response if available and not expired.
Args:
cache_key: The cache key.
Returns:
The cached response, or None if not found or expired.
"""
if not self.enable_cache:
return None
if cache_key in RESPONSE_CACHE:
timestamp, response = RESPONSE_CACHE[cache_key]
# Check if the cache entry has expired
if time.time() - timestamp < self.cache_ttl:
logger.info("Using cached response")
return response
# Remove expired cache entry
del RESPONSE_CACHE[cache_key]
return None
def _cache_response(self, cache_key: str, response: str):
"""
Cache a response.
Args:
cache_key: The cache key.
response: The response to cache.
"""
if not self.enable_cache:
return
RESPONSE_CACHE[cache_key] = (time.time(), response)
# Clean up expired cache entries if cache is getting large
if len(RESPONSE_CACHE) > 1000: # Arbitrary limit
self._cleanup_cache()
def _cleanup_cache(self):
"""Clean up expired cache entries."""
current_time = time.time()
keys_to_delete = []
for key, (timestamp, _) in RESPONSE_CACHE.items():
if current_time - timestamp >= self.cache_ttl:
keys_to_delete.append(key)
for key in keys_to_delete:
del RESPONSE_CACHE[key]
logger.info(f"Cleaned up {len(keys_to_delete)} expired cache entries")
def generate_response(self, query: str, context: List[Dict[str, Any]],
max_retries: int = 3, temperature: float = 0.7) -> str:
"""
Generate a response using the OpenAI API.
Args:
query: The user's query.
context: A list of relevant context documents from ChromaDB.
Each document should be a dict with 'content' and 'metadata' keys.
max_retries: Maximum number of retry attempts for API failures.
temperature: Controls randomness in the response. Lower is more deterministic.
Returns:
The generated response text.
"""
# Check rate limit
self._check_rate_limit()
# Detect language
language = self._detect_language(query)
# Check cache
cache_key = self._generate_cache_key(query, context)
cached_response = self._get_cached_response(cache_key)
if cached_response:
return cached_response
# Construct the messages using the context
messages = self._construct_messages(query, context, language)
# Try to generate response with retries
retry_count = 0
while retry_count <= max_retries:
try:
logger.info(f"Attempting to generate response (attempt {retry_count+1}/{max_retries+1})")
# Generate with OpenAI API
response = self.client.chat.completions.create(
model=self.model_name,
messages=messages,
temperature=temperature,
max_tokens=4096,
top_p=0.8
)
# Extract the response text
response_text = response.choices[0].message.content
# Cache the response
self._cache_response(cache_key, response_text)
# Return the response text
return response_text
except Exception as e:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff
# Log more details about the error
logger.error(f"API call error: {type(e).__name__}: {str(e)}")
if retry_count <= max_retries:
logger.warning(f"API call failed: {str(e)}. Retrying in {wait_time} seconds. (Attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
else:
logger.error(f"API call failed after {max_retries} retries: {str(e)}")
# Return a simple response when all retries fail
return f"I'm here to help with IT questions. However, I'm currently experiencing technical difficulties. Please try again later or contact IT support directly."
def _construct_messages(self, query: str, context: List[Dict[str, Any]], language: str = 'en') -> List[Dict[str, str]]:
"""
Construct message list with the query and context.
Args:
query: The user's query.
context: A list of relevant context documents from ChromaDB.
language: The language code (e.g., 'en', 'ka').
Returns:
List of message dictionaries for the OpenAI API.
"""
# System instruction based on language
if language == 'ka':
system_instruction = """როგორც IT_Bot, თქვენი როლია ორგანიზაციაში IT ტექნიკური დახმარების გაწევა:
## როლი და მიზანი:
- თქვენ ხართ ორგანიზაციის IT დახმარების ბოტი, რომელიც ეხმარება თანამშრომლებს ტექნიკური საკითხების გადაჭრაში.
- გამოიყენეთ მოცემული ისტორიული კონტექსტი ზუსტი და სასარგებლო პასუხების გასაცემად.
- როდესაც კონტექსტი ამბობს რომ რაიმე პრობლემა შეიძლება არსებობდეს, ჩათვალეთ რომ ეს მართლაც პრობლემაა.
## პასუხების მიდგომა:
1. გამოიყენეთ მოცემული კონტექსტი პასუხების შესაქმნელად. თუ კონტექსტში მოცემულია კონკრეტული IT საკითხები და მათი გადაწყვეტა, გამოიყენეთ ეს ინფორმაცია.
2. თუ კონტექსტი შეიცავს ინფორმაციას მსგავსი პრობლემის შესახებ, გააანალიზეთ, როგორ გადაიჭრა ეს პრობლემა წარსულში.
3. მითითებები და ცოდნა მოცემული კონტექსტიდან პრიორიტეტული უნდა იყოს ზოგად ცოდნასთან შედარებით.
4. თუ კითხვა არ უკავშირდება IT თემებს, მიუთითეთ მომხმარებელს, რომ დაუკავშირდეს IT მხარდაჭერას.
5. დეტალური, ნაბიჯ-ნაბიჯ ინსტრუქციები მიაწოდეთ, როცა სთხოვენ ტექნიკური პრობლემის გადაჭრას.
## პასუხის ფორმატი:
- მკაფიო, ზუსტი და კონკრეტული პასუხები გაეცით.
- პასუხები დააფორმატეთ ადვილად წასაკითხად, გამოიყენეთ პუნქტები და ქვესათაურები, როცა საჭიროა.
- მიაწოდეთ კონკრეტული ბრძანებები, კოდის მაგალითები ან ინსტრუქციები, როცა საჭიროა.
- არ გამოიყენოთ [Reference X] ფორმატი პასუხებში - ინფორმაცია პირდაპირ ჩასვით პასუხში წყაროზე მითითების გარეშე."""
else: # Default to English
system_instruction = """As IT_Bot, your role is to provide technical IT support within the organization:
## Role and Purpose:
- You are an IT support bot for the organization, helping employees resolve technical issues.
- Use the provided historical context to give accurate and helpful responses.
- When context mentions that there may be an issue with something, assume there is an issue.
## Response Approach:
1. Use the provided context to craft your answers. If the context contains specific IT issues and resolutions, use that information.
2. If the context contains information about similar problems, analyze how the problem was resolved in the past.
3. Guidance and knowledge from the provided context should take precedence over general knowledge.
4. If a question is unrelated to IT topics, direct the user to contact IT support.
5. Provide detailed, step-by-step instructions when asked about resolving a technical issue.
## Response Format:
- Respond with clear, precise, and specific answers.
- Format answers for easy reading, using bullet points and subheadings when appropriate.
- Provide specific commands, code examples, or instructions when relevant.
- IMPORTANT: DO NOT use reference numbers like [Reference X] in your responses. Instead, directly incorporate the relevant information into your answer without citing sources."""
# Process the context data
context_text = ""
if context:
# Context documents are assumed to arrive already ordered by relevance
context_text = "Reference information from IT knowledge base:\n\n"
for i, doc in enumerate(context):
if 'content' in doc:
# Create a more structured reference entry
content = doc['content']
# Build a descriptive reference header with metadata
ref_details = []
if 'metadata' in doc and doc['metadata']:
metadata = doc['metadata']
if 'subject' in metadata and metadata['subject']:
ref_details.append(f"Topic: {metadata['subject']}")
if 'channel' in metadata and metadata['channel']:
ref_details.append(f"Channel: {metadata['channel']}")
if 'sender' in metadata and metadata['sender']:
ref_details.append(f"From: {metadata['sender']}")
if 'timestamp' in metadata and metadata['timestamp']:
try:
# Try to format the timestamp in a more readable way
date_str = metadata['timestamp'][:10] # Just use the date part
ref_details.append(f"Date: {date_str}")
except:
pass
# Create a detailed reference header with all the metadata
ref_header = f"Context {i+1}"
if ref_details:
ref_header += f": {' | '.join(ref_details)}"
# Format each reference entry
context_text += f"[{ref_header}]\n{content}\n\n"
# Create messages array for the chat completions API
messages = [
{"role": "system", "content": system_instruction}
]
# Add context as a separate message from the system if available
if context_text:
messages.append({"role": "system", "content": context_text})
# Add the user query
messages.append({"role": "user", "content": query})
return messages
# For backwards compatibility, provide GeminiService as an alias for OpenAIService
GeminiService = OpenAIService
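A usage sketch for this service, assuming OPENAI_API_KEY is set in the environment; the context list follows the 'content'/'metadata' shape that _construct_messages reads, with invented example values:

```python
from app.utils.ai_service import OpenAIService

service = OpenAIService()  # reads OPENAI_API_KEY from the environment

# Context documents in the shape produced by the ChromaDB search layer.
context = [
    {
        "content": "Resetting the VPN adapter fixed the repeated drops reported last month.",
        "metadata": {
            "subject": "VPN instability",
            "channel": "IT Support",
            "sender": "Example User",
            "timestamp": "2025-05-01T10:15:00",
        },
    }
]

answer = service.generate_response(
    "My VPN keeps disconnecting every hour, what should I try?",
    context,
    temperature=0.3,
)
print(answer)
```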

app/utils/bot_service.py Normal file (402 lines)

@@ -0,0 +1,402 @@
"""
Zulip bot service for handling interactions with Zulip.
"""
import os
import re
import logging
import threading
import time
import hashlib
import tempfile
from typing import Optional, List, Dict, Any
import zulip
from app.db.chroma_service import ChromaDBService
from app.utils.ai_service import GeminiService
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("bot_service")
class ZulipBotService:
"""Service for handling Zulip bot interactions."""
# Singleton instance
_instance = None
_lock = threading.Lock()
_process_id = os.getpid() # Store the process ID when this module is loaded
def __new__(cls, *args, **kwargs):
with cls._lock:
current_pid = os.getpid()
if cls._instance is None or cls._process_id != current_pid:
logger.info(f"Creating new ZulipBotService singleton instance for process {current_pid}")
cls._instance = super(ZulipBotService, cls).__new__(cls)
cls._instance._initialized = False
cls._process_id = current_pid # Update the stored process ID
return cls._instance
def __init__(self,
email: Optional[str] = None,
api_key: Optional[str] = None,
site: Optional[str] = None,
chroma_service: Optional[ChromaDBService] = None,
ai_service: Optional[GeminiService] = None):
"""Initialize the Zulip bot service."""
with self._lock:
# Skip initialization if already initialized (singleton pattern)
if self._initialized:
return
# Load config from environment variables if not provided
self.email = email or os.getenv("ZULIP_BOT_EMAIL")
self.api_key = api_key or os.getenv("ZULIP_BOT_API_KEY")
self.site = site or os.getenv("ZULIP_SITE")
if not all([self.email, self.api_key, self.site]):
raise ValueError("Missing Zulip configuration. Set ZULIP_BOT_EMAIL, ZULIP_BOT_API_KEY, and ZULIP_SITE env variables.")
# Initialize Zulip client
self.client = zulip.Client(
email=self.email,
api_key=self.api_key,
site=self.site
)
# Initialize services
self.chroma_service = chroma_service or ChromaDBService()
self.ai_service = ai_service or GeminiService()
# Thread for message handling
self.thread = None
self.running = False
# Simple set to track processed message IDs
self.processed_message_ids = set()
# Bot identification pattern - exact match for IT_Bot mention in Zulip format
self.bot_mention_pattern = re.compile(r'@\*\*IT_Bot\*\*')
# Default response for empty queries
self.default_response = "Hello. If you have a technical question, please ask. If you require assistance with non-technical matters, please contact IT support."
# Track backoff state for rate limiting
self._backoff_time = 1 # Start with 1 second backoff
self._consecutive_rate_limit_errors = 0
self._max_backoff_time = 60 # Maximum backoff of 60 seconds
# Mark as initialized
self._initialized = True
logger.info("Initialized ZulipBotService")
def start(self):
"""Start the bot service in a separate thread."""
with self._lock:
if self.thread and self.thread.is_alive():
logger.warning("Bot service is already running")
return
self.running = True
self.thread = threading.Thread(target=self._message_loop)
self.thread.daemon = True
self.thread.start()
logger.info("Started ZulipBotService")
def stop(self):
"""Stop the bot service."""
with self._lock:
if not self.thread or not self.thread.is_alive():
logger.warning("Bot service is not running")
return
self.running = False
self.thread.join(timeout=5.0)
logger.info("Stopped ZulipBotService")
def _message_loop(self):
"""Main message handling loop."""
# How far back to check for mentions (in seconds)
# Default to 60 seconds, but can be adjusted
lookback_period = 60
while self.running:
try:
# Get messages that mention the bot
new_messages = self._check_for_mentions(lookback_period)
# Process new messages
for message in new_messages:
self._process_message(message)
# Add a small delay between processing messages
time.sleep(0.5)
# Clean up old processed message IDs periodically
if len(self.processed_message_ids) > 1000:
self.processed_message_ids = set(list(self.processed_message_ids)[-1000:])
# Wait before checking again (reduces API usage)
time.sleep(5.0)
except Exception as e:
logger.error(f"Error in message loop: {str(e)}")
# Apply backoff on errors to avoid hammering the API
if "API usage exceeded rate limit" in str(e):
self._consecutive_rate_limit_errors += 1
backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
logger.info(f"Rate limit hit, backing off for {backoff_time} seconds")
time.sleep(backoff_time)
self._backoff_time = backoff_time
else:
# For other errors, just wait a bit
time.sleep(3)
def _check_for_mentions(self, lookback_period):
"""
Check for new messages that mention the bot.
Args:
lookback_period: How far back to check for mentions (in seconds)
Returns:
List of messages that mention the bot
"""
# Calculate the timestamp for the lookback period
lookback_timestamp = int(time.time() - lookback_period)
try:
# If we've had rate limit errors, apply backoff
if self._consecutive_rate_limit_errors > 0:
backoff_delay = min(self._backoff_time, self._max_backoff_time)
logger.info(f"Rate limit backoff: waiting {backoff_delay} seconds before API call")
time.sleep(backoff_delay)
# Get all messages that mention the bot
# Use the request endpoint for more control
request = {
"anchor": "newest",
"num_before": 100,
"num_after": 0,
"narrow": [
{"operator": "is", "operand": "mentioned"},
{"operator": "streams", "operand": "public"}
],
"client_gravatar": False,
"apply_markdown": False
}
result = self.client.get_messages(request)
# Reset backoff if request was successful
if result.get("result") == "success":
if self._consecutive_rate_limit_errors > 0:
logger.info("Successful API call, resetting rate limit backoff")
self._consecutive_rate_limit_errors = 0
self._backoff_time = 1
else:
logger.error(f"Failed to get messages: {result.get('msg', 'Unknown error')}")
return []
# Filter messages
new_messages = []
for message in result.get("messages", []):
# Skip if we've already processed this message
if message["id"] in self.processed_message_ids:
continue
# Skip messages sent before the lookback window
if message.get("timestamp", 0) < lookback_timestamp:
continue
# Skip messages from the bot itself
if message.get("sender_email") == self.email:
continue
# Check if the bot is actually mentioned in the content
if self.bot_mention_pattern.search(message.get("content", "")):
# Add to processed set and new message list
self.processed_message_ids.add(message["id"])
new_messages.append(message)
if new_messages:
logger.info(f"Found {len(new_messages)} new mention(s) of the bot")
return new_messages
except Exception as e:
if "API usage exceeded rate limit" in str(e):
self._consecutive_rate_limit_errors += 1
self._backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
logger.error(f"Error checking for mentions: {str(e)} (backoff: {self._backoff_time}s)")
else:
logger.error(f"Error checking for mentions: {str(e)}")
return []
def _process_message(self, message):
"""
Process a message and send a response.
Args:
message: The message to process.
"""
try:
# Extract content
content = message.get("content", "")
# Log detailed information
logger.info(f"Processing message ID: {message.get('id')}")
# Extract user query (remove the bot mention)
query = self.bot_mention_pattern.sub("", content).strip()
# Log the incoming message
logger.info(f"Extracted query: {query[:50]}...")
# If query is empty, provide the default response
if not query:
logger.info(f"Empty query received, sending default response")
self._send_response(message, self.default_response)
return
# Retrieve relevant context from ChromaDB
context = self._retrieve_context(query)
# Generate response using the AI service
response_text = self.ai_service.generate_response(query, context)
# Send the response
self._send_response(message, response_text)
except Exception as e:
logger.error(f"Error processing message: {str(e)}")
self._send_response(message,
"I apologize, but I encountered an error while processing your request. "
"Please try again or contact the IT support team if the issue persists.")
def _retrieve_context(self, query, n_results=40):
"""
Retrieve relevant context from ChromaDB with enhanced relevance.
Args:
query: The user's query.
n_results: Number of results to retrieve.
Returns:
A list of relevant context documents.
"""
try:
# Search for similar documents in ChromaDB
search_results = self.chroma_service.search_similar(query, n_results=n_results)
if not search_results:
logger.warning(f"No context found for query: {query[:50]}...")
return []
# Extract documents and metadata
documents = []
# Check if there are documents in the results
if search_results.get("documents") and len(search_results.get("documents", [])) > 0:
# Get the documents and their metadata
docs = search_results.get("documents", [[]])[0]
metas = search_results.get("metadatas", [[]])[0]
# Record each document's rank position so it can be stored as relevance metadata
for i, (doc, metadata) in enumerate(zip(docs, metas)):
# Create a document with its metadata
if isinstance(doc, list) and len(doc) > 0:
doc = doc[0] # Handle nested lists
# Include relevance position in metadata
if metadata:
metadata["relevance_position"] = i + 1
# Store document with enhanced metadata
documents.append({
"content": doc,
"metadata": metadata,
})
logger.info(f"Retrieved {len(documents)} context documents for query: {query[:30]}...")
return documents
except Exception as e:
logger.error(f"Error retrieving context: {str(e)}")
return []
def _send_response(self, original_message, response_text):
"""
Send a response to a message.
Args:
original_message: The original message being responded to.
response_text: The text of the response to send.
"""
try:
message_type = original_message.get("type")
if message_type == "stream":
# For stream messages, respond in the same stream and topic
response = {
"type": "stream",
"to": original_message.get("display_recipient"),
"subject": original_message.get("subject"),
"content": response_text
}
else:
# For private messages, respond to the sender
response = {
"type": "private",
"to": [original_message.get("sender_email")],
"content": response_text
}
result = self.client.send_message(response)
if result.get("result") != "success":
error_msg = result.get("msg", "Unknown error")
logger.error(f"Failed to send response: {error_msg}")
else:
logger.info(f"Sent response to message: {original_message.get('id')}")
except Exception as e:
logger.error(f"Error sending response: {str(e)}")
def send_test_message(self, recipient, content):
"""
Send a test message to verify the bot is working.
Args:
recipient: The recipient of the message (email for private, channel name for stream).
content: The content of the message.
Returns:
The result of the API call.
"""
if "@" in recipient:
# Private message
message = {
"type": "private",
"to": [recipient],
"content": content
}
else:
# Stream message
message = {
"type": "stream",
"to": recipient,
"subject": "Bot Test",
"content": content
}
result = self.client.send_message(message)
logger.info(f"Sent test message to {recipient}, result: {result.get('result')}")
return result
def reset_cache(self):
"""Reset message cache."""
with self._lock:
logger.info("Resetting message caches")
self.processed_message_ids = set()
return "Message cache reset successfully"

View File

@ -0,0 +1,8 @@
"""
Contextual Retrieval package for enhancing RAG systems.
This package implements advanced retrieval techniques based on Anthropic's Contextual Retrieval:
- Contextual Embeddings: Adding rich context to chunks before embedding
- Contextual BM25: Using BM25 for exact matching with context-enhanced chunks
- Reranking: Further improving results by reranking retrieved chunks
"""

View File

@ -0,0 +1,181 @@
"""
BM25 Service for exact keyword matching in retrieval.
This service implements the BM25 algorithm for better lexical search,
complementing the semantic search provided by vector embeddings.
"""
import os
import pickle
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
from rank_bm25 import BM25Okapi
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download NLTK resources
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt', quiet=True)
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords', quiet=True)
class BM25Service:
"""Service for BM25-based search."""
# BM25 index and corpus
_bm25 = None
_corpus = []
_doc_ids = []
_index_path = os.path.join("chromadb", "bm25_index.pkl")
@staticmethod
def preprocess_text(text: str) -> List[str]:
"""
Preprocess text for BM25 indexing.
Args:
text (str): Text to preprocess
Returns:
List[str]: List of preprocessed tokens
"""
# Convert to lowercase
text = text.lower()
# Remove special characters and digits
text = re.sub(r'[^\w\s]', ' ', text)
text = re.sub(r'\d+', ' ', text)
# Tokenize
tokens = word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
return tokens
@staticmethod
def index_documents(documents: List[str], doc_ids: List[str]) -> None:
"""
Create a BM25 index for a list of documents.
Args:
documents (List[str]): List of document contents
doc_ids (List[str]): List of document IDs
"""
# Preprocess documents
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in documents]
# Create BM25 index
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
BM25Service._corpus = documents
BM25Service._doc_ids = doc_ids
# Save index to disk
BM25Service.save_index()
@staticmethod
def add_document(document: str, doc_id: str) -> None:
"""
Add a single document to the BM25 index.
Args:
document (str): Document content
doc_id (str): Document ID
"""
# Create index if it doesn't exist
if BM25Service._bm25 is None:
BM25Service.load_index()
if BM25Service._bm25 is None:
BM25Service.index_documents([document], [doc_id])
return
# Add document to corpus
BM25Service._corpus.append(document)
BM25Service._doc_ids.append(doc_id)
# Preprocess document
tokenized_doc = BM25Service.preprocess_text(document)
# Rebuild index
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in BM25Service._corpus]
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
# Save index to disk
BM25Service.save_index()
@staticmethod
def search(query: str, top_k: int = 5) -> List[Tuple[str, float]]:
"""
Search for documents using BM25.
Args:
query (str): Query text
top_k (int): Number of results to return
Returns:
List[Tuple[str, float]]: List of (doc_id, score) tuples
"""
# Load index if it doesn't exist
if BM25Service._bm25 is None:
BM25Service.load_index()
if BM25Service._bm25 is None:
return []
# Preprocess query
tokenized_query = BM25Service.preprocess_text(query)
# Get scores
scores = BM25Service._bm25.get_scores(tokenized_query)
# Get top-k documents
top_indices = np.argsort(scores)[::-1][:top_k]
# Return (doc_id, score) pairs
results = []
for idx in top_indices:
if idx < len(BM25Service._doc_ids):
results.append((BM25Service._doc_ids[idx], scores[idx]))
return results
@staticmethod
def save_index() -> None:
"""Save BM25 index to disk."""
try:
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(BM25Service._index_path), exist_ok=True)
# Save index
with open(BM25Service._index_path, 'wb') as f:
pickle.dump({
'bm25': BM25Service._bm25,
'corpus': BM25Service._corpus,
'doc_ids': BM25Service._doc_ids
}, f)
except Exception as e:
print(f"Error saving BM25 index: {e}")
@staticmethod
def load_index() -> None:
"""Load BM25 index from disk."""
try:
if os.path.exists(BM25Service._index_path):
with open(BM25Service._index_path, 'rb') as f:
data = pickle.load(f)
BM25Service._bm25 = data.get('bm25')
BM25Service._corpus = data.get('corpus', [])
BM25Service._doc_ids = data.get('doc_ids', [])
except Exception as e:
print(f"Error loading BM25 index: {e}")
# Initialize with empty index
BM25Service._bm25 = None
BM25Service._corpus = []
BM25Service._doc_ids = []
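
# --- Usage sketch (illustrative only) ---
# A minimal example of indexing a couple of documents and querying them; the texts and
# IDs are made up, and index_documents() persists the index under chromadb/ as a side effect.
if __name__ == "__main__":
    BM25Service.index_documents(
        ["How to reset a forgotten VPN password", "Printer drivers for the third floor"],
        ["msg-101", "msg-102"],
    )
    for doc_id, score in BM25Service.search("vpn password reset", top_k=2):
        print(doc_id, round(float(score), 3))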

View File

@ -0,0 +1,112 @@
"""
Context Service for generating rich contextual descriptions for messages.
This service uses LLMs to generate contextual descriptions for messages,
which improves retrieval by providing more context to the embedding process.
"""
import os
import time
from typing import Dict, List, Optional, Union
from openai import OpenAI
from app.config import Config
class ContextService:
"""Service for generating rich contextual descriptions for messages."""
# Initialize OpenAI client
client = OpenAI(api_key=Config.OPENAI_API_KEY)
# Cache for context generation to reduce API calls
_context_cache = {}
@staticmethod
def generate_context(content: str, metadata: Dict) -> str:
"""
Generate a rich contextual description for a message.
Args:
content (str): The original message content
metadata (Dict): Metadata about the message (channel, subject, sender, timestamp)
Returns:
str: A rich contextual description
"""
# Create a cache key from content and metadata
cache_key = f"{content[:100]}_{metadata.get('channel')}_{metadata.get('subject')}"
# Check if we have this context cached
if cache_key in ContextService._context_cache:
return ContextService._context_cache[cache_key]
try:
# Create messages for context generation
messages = [
{
"role": "system",
"content": "You are a context generation assistant. Generate a short, succinct context description for the given message. The context should situate this message within its domain and highlight key information that would be helpful for retrieval. Keep the context under 100 words."
},
{
"role": "user",
"content": f"""
Message details:
- Channel: {metadata.get('channel', 'Unknown')}
- Subject: {metadata.get('subject', 'Unknown')}
- Sender: {metadata.get('sender', 'Unknown')}
- Timestamp: {metadata.get('timestamp', 'Unknown')}
Message content:
{content}
"""
}
]
# Generate the context using OpenAI
response = ContextService.client.chat.completions.create(
model="gpt-4o",
messages=messages,
max_tokens=150,
temperature=0.3
)
# Extract the response text
context = response.choices[0].message.content.strip()
# If the context is too long, truncate it
if len(context) > 500:
context = context[:497] + "..."
# Cache the result
ContextService._context_cache[cache_key] = context
return context
except Exception as e:
print(f"Error generating context: {e}")
# Fallback to a simple context based on metadata
channel = metadata.get('channel', 'Unknown')
subject = metadata.get('subject', 'Unknown')
fallback_context = f"This message is from the {channel} channel and discusses {subject}."
# Cache the fallback
ContextService._context_cache[cache_key] = fallback_context
return fallback_context
@staticmethod
def contextualize_content(content: str, metadata: Dict) -> str:
"""
Add rich contextual description to a message.
Args:
content (str): The original message content
metadata (Dict): Metadata about the message
Returns:
str: The content with context prepended
"""
# Generate the context
context = ContextService.generate_context(content, metadata)
# Add the context to the content
return f"CONTEXT: {context}\n\nCONTENT: {content}"

View File

@ -0,0 +1,160 @@
"""
Hybrid Search Service that combines vector search and BM25 search.
This service implements hybrid search by combining results from vector-based
semantic search and BM25 lexical search using weighted score fusion.
"""
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.utils.contextual_retrieval.reranker_service import RerankerService
import logging
# Set up logging
logger = logging.getLogger("hybrid_search")
class HybridSearchService:
"""Service for hybrid search combining vector search and BM25."""
@staticmethod
def hybrid_search(query: str, n_results: int = 5, filter_criteria: Optional[Dict] = None,
rerank: bool = True, semantic_weight: float = 0.7) -> List[Dict]:
"""
Perform hybrid search using vector search and BM25.
Args:
query (str): Query text
n_results (int): Number of results to return
filter_criteria (Dict): Metadata filter criteria
rerank (bool): Whether to apply reranking
semantic_weight (float): Weight for semantic search (0-1)
Returns:
List[Dict]: Search results
"""
try:
# Get more results than requested for fusion
vector_n = n_results * 3
bm25_n = n_results * 3
# Perform vector search - use _internal_call=True to prevent circular imports
vector_results = ChromaDBService.search_similar(
query_text=query,
n_results=vector_n,
filter_criteria=filter_criteria,
_internal_call=True # This prevents circular calls
)
# Extract vector search results
vec_docs = []
if vector_results and 'documents' in vector_results and len(vector_results['documents']) > 0:
for i in range(len(vector_results['documents'][0])):
vec_docs.append({
'id': vector_results['ids'][0][i],
'content': vector_results['documents'][0][i],
'metadata': vector_results['metadatas'][0][i],
'vector_score': 1.0 - min(vector_results['distances'][0][i], 1.0),
'rank': i + 1 # 1-based rank
})
# Perform BM25 search
bm25_results = BM25Service.search(query, top_k=bm25_n)
# Extract BM25 search results and normalize scores
bm25_docs = []
if bm25_results:
# Get max score for normalization
max_score = max([score for _, score in bm25_results]) if bm25_results else 1.0
# Create a set of doc IDs already in vector results to avoid duplicate lookups
existing_doc_ids = {doc['id'] for doc in vec_docs}
for i, (doc_id, score) in enumerate(bm25_results):
# Skip duplicate lookups
if doc_id in existing_doc_ids:
continue
# Get document content from ChromaDB (if available)
try:
doc_data = ChromaDBService.get_message_by_id(doc_id)
if doc_data:
bm25_docs.append({
'id': doc_id,
'content': doc_data['content'],
'metadata': doc_data['metadata'],
'bm25_score': score / max_score if max_score > 0 else 0,
'rank': i + 1 # 1-based rank
})
except Exception as e:
logger.warning(f"Error retrieving document {doc_id}: {e}")
continue
# Combine results using weighted score fusion
fused_docs = HybridSearchService._fuse_results(vec_docs, bm25_docs, semantic_weight)
# Apply reranking if requested
if rerank and len(fused_docs) > 0:
try:
return RerankerService.rerank(query, fused_docs, top_k=n_results)
except Exception as e:
logger.warning(f"Reranking failed: {e}, returning non-reranked results")
return fused_docs[:n_results]
# Otherwise just return the top n fused results
return fused_docs[:n_results]
except Exception as e:
logger.error(f"Error in hybrid search: {e}")
# Return empty results on error
return []
@staticmethod
def _fuse_results(vec_docs: List[Dict], bm25_docs: List[Dict],
semantic_weight: float = 0.7) -> List[Dict]:
"""
Fuse results from vector search and BM25 search.
Args:
vec_docs (List[Dict]): Vector search results
bm25_docs (List[Dict]): BM25 search results
semantic_weight (float): Weight for semantic search (0-1)
Returns:
List[Dict]: Fused search results
"""
# Create a map of document IDs to documents
doc_map = {}
# Process vector search results
for doc in vec_docs:
doc_id = doc['id']
if doc_id not in doc_map:
doc_map[doc_id] = doc.copy()
doc_map[doc_id]['combined_score'] = doc.get('vector_score', 0) * semantic_weight
else:
# Update existing document
doc_map[doc_id]['vector_score'] = doc.get('vector_score', 0)
doc_map[doc_id]['combined_score'] = (
doc_map[doc_id].get('combined_score', 0) +
doc.get('vector_score', 0) * semantic_weight
)
# Process BM25 search results
for doc in bm25_docs:
doc_id = doc['id']
if doc_id not in doc_map:
doc_map[doc_id] = doc.copy()
doc_map[doc_id]['combined_score'] = doc.get('bm25_score', 0) * (1 - semantic_weight)
else:
# Update existing document
doc_map[doc_id]['bm25_score'] = doc.get('bm25_score', 0)
doc_map[doc_id]['combined_score'] = (
doc_map[doc_id].get('combined_score', 0) +
doc.get('bm25_score', 0) * (1 - semantic_weight)
)
# Convert map to list and sort by combined score
results = list(doc_map.values())
results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
return results
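
# --- Worked example of the score fusion (illustrative only) ---
# With the default semantic_weight of 0.7, a document found only by vector search with
# vector_score 0.8 ends up with combined_score 0.8 * 0.7 = 0.56, a document found only by
# BM25 with a normalized bm25_score of 0.9 gets 0.9 * 0.3 = 0.27, and a document found by
# both accumulates both terms. The documents below are made up.
if __name__ == "__main__":
    vec_docs = [{"id": "a", "content": "...", "metadata": {}, "vector_score": 0.8, "rank": 1}]
    bm25_docs = [
        {"id": "a", "content": "...", "metadata": {}, "bm25_score": 0.5, "rank": 2},
        {"id": "b", "content": "...", "metadata": {}, "bm25_score": 0.9, "rank": 1},
    ]
    for doc in HybridSearchService._fuse_results(vec_docs, bm25_docs, semantic_weight=0.7):
        print(doc["id"], round(doc["combined_score"], 3))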

View File

@ -0,0 +1,249 @@
"""
Reranker Service for improving search results by reranking candidate documents.
This service uses a custom reranking approach combining multiple signals
to improve the relevance of search results.
"""
import re
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
import logging
# Set up logging
logger = logging.getLogger("reranker_service")
class RerankerService:
"""Service for reranking search results using a custom approach."""
# Cache for reranked results
_rerank_cache = {}
@staticmethod
def rerank(query: str, documents: List[Dict], top_k: int = 20) -> List[Dict]:
"""
Rerank documents based on relevance to query using a multi-factor approach.
Args:
query (str): Query text
documents (List[Dict]): List of document dictionaries with 'id' and 'content'
top_k (int): Number of results to return
Returns:
List[Dict]: Reranked documents
"""
# Return all documents if there are fewer than top_k
if len(documents) <= top_k:
return documents
# Create cache key
cache_key = f"{query}_{sorted([doc.get('id', '') for doc in documents])}"
# Check if we have this reranking cached
if cache_key in RerankerService._rerank_cache:
return RerankerService._rerank_cache[cache_key][:top_k]
try:
# Prepare query
query_terms = RerankerService._tokenize(query)
query_lower = query.lower()
# Calculate multi-factor relevance score for each document
scored_docs = []
for doc in documents:
content = doc.get('content', '')
content_lower = content.lower()
# 1. Term frequency scoring (similar to BM25)
term_score = RerankerService._calculate_term_score(content_lower, query_terms)
# 2. Exact phrase matching
phrase_score = RerankerService._calculate_phrase_score(content_lower, query_lower)
# 3. Semantic similarity (use existing score if available)
semantic_score = RerankerService._get_semantic_score(doc)
# 4. Document position bonus
position_score = RerankerService._calculate_position_score(content_lower, query_terms)
# 5. Document length normalization
length_factor = RerankerService._calculate_length_factor(content)
# Calculate final combined score
# Weights can be adjusted based on performance
final_score = (
0.35 * term_score +
0.30 * phrase_score +
0.25 * semantic_score +
0.10 * position_score
) * length_factor
scored_doc = doc.copy()
scored_doc['score'] = final_score
scored_doc['_term_score'] = term_score
scored_doc['_phrase_score'] = phrase_score
scored_doc['_semantic_score'] = semantic_score
scored_doc['_position_score'] = position_score
scored_docs.append(scored_doc)
# Sort by final score (highest first)
scored_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
# Take the top_k
result = scored_docs[:top_k]
# Clean up diagnostic scores before returning
for doc in result:
doc.pop('_term_score', None)
doc.pop('_phrase_score', None)
doc.pop('_semantic_score', None)
doc.pop('_position_score', None)
# Cache the results
RerankerService._rerank_cache[cache_key] = result
return result
except Exception as e:
logger.error(f"Error reranking documents: {e}")
# Fallback: simple sorting based on combined_score if available
documents.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
return documents[:top_k]
@staticmethod
def _tokenize(text: str) -> List[str]:
"""
Tokenize a string into terms.
Args:
text (str): Text to tokenize
Returns:
List[str]: List of tokens
"""
# Simple tokenization by splitting on whitespace and removing punctuation
tokens = re.findall(r'\b\w+\b', text.lower())
return tokens
@staticmethod
def _calculate_term_score(content: str, query_terms: List[str]) -> float:
"""
Calculate term frequency score.
Args:
content (str): Document content
query_terms (List[str]): Query terms
Returns:
float: Term frequency score
"""
score = 0
content_tokens = RerankerService._tokenize(content)
# Simple term frequency calculation
for term in query_terms:
term_count = content_tokens.count(term)
score += term_count
# Normalize by document length
if len(content_tokens) > 0:
score = score / len(content_tokens)
return score
@staticmethod
def _calculate_phrase_score(content: str, query: str) -> float:
"""
Calculate exact phrase matching score.
Args:
content (str): Document content
query (str): Original query
Returns:
float: Phrase matching score
"""
# Count exact matches of the query in the content
exact_matches = content.count(query)
# Weight exact phrase matches heavily
score = exact_matches * 2.0 # Higher weight for exact matches
# Check for partial matches if no exact matches
if exact_matches == 0 and len(query) > 5:
# Generate query n-grams (only for longer queries)
query_parts = [query[i:i+4] for i in range(0, len(query)-3)]
for part in query_parts:
if len(part) >= 4: # Only consider meaningful parts
score += 0.2 * content.count(part)
return min(score, 10.0) # Cap to avoid extremely high scores
@staticmethod
def _get_semantic_score(doc: Dict) -> float:
"""
Extract semantic similarity score from document.
Args:
doc (Dict): Document
Returns:
float: Semantic similarity score
"""
# Use vector_score if available (from vector search)
if 'vector_score' in doc:
return doc['vector_score']
# Use combined_score as fallback
if 'combined_score' in doc:
return doc['combined_score']
return 0.5 # Default middle value if no scores available
@staticmethod
def _calculate_position_score(content: str, query_terms: List[str]) -> float:
"""
Calculate score based on position of match in document.
Earlier matches often indicate higher relevance.
Args:
content (str): Document content
query_terms (List[str]): Query terms
Returns:
float: Position score
"""
score = 0
# Check for terms in the first 20% of the document
first_section = content[:int(len(content) * 0.2)]
for term in query_terms:
if term in first_section:
score += 0.5
return min(score, 1.0) # Normalize to maximum of 1.0
@staticmethod
def _calculate_length_factor(content: str) -> float:
"""
Calculate length normalization factor.
Prevents extremely short documents from ranking too high.
Args:
content (str): Document content
Returns:
float: Length normalization factor
"""
token_count = len(RerankerService._tokenize(content))
# Penalize very short documents
if token_count < 10:
return 0.7
# Slightly favor mid-sized documents
if 20 <= token_count <= 300:
return 1.1
return 1.0 # Neutral factor for other documents
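
# --- Usage sketch (illustrative only) ---
# A minimal example of reranking a handful of candidates; the contents and IDs are made up.
# rerank() returns its input unchanged when there are no more than top_k candidates, so
# top_k is deliberately smaller than the candidate list here.
if __name__ == "__main__":
    candidates = [
        {"id": "a", "content": "Steps to reset a forgotten VPN password", "combined_score": 0.4},
        {"id": "b", "content": "Office printer maintenance schedule", "combined_score": 0.6},
        {"id": "c", "content": "The VPN password reset portal is back online", "combined_score": 0.5},
    ]
    for doc in RerankerService.rerank("vpn password reset", candidates, top_k=2):
        print(doc["id"], round(doc["score"], 3))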

111
app/utils/embeddings.py Normal file
View File

@ -0,0 +1,111 @@
"""
Embeddings utilities using Ollama and Nomic.
"""
import os
import requests
import numpy as np
from typing import List, Optional, Union
import ollama
from app.config import Config
class EmbeddingService:
"""Service for generating embeddings using Ollama and Nomic."""
@staticmethod
def get_ollama_embeddings(texts: List[str], model: Optional[str] = None) -> List[List[float]]:
"""
Generate embeddings using Ollama.
Args:
texts: List of texts to generate embeddings for
model: Ollama model to use for embeddings (default from config)
Returns:
List of embeddings as float arrays
"""
if model is None:
# Use model from config
model = Config.OLLAMA_MODEL
# Create an Ollama client pointed at the configured host
client = ollama.Client(host=Config.OLLAMA_HOST)
embeddings = []
for text in texts:
try:
# Call Ollama API for embeddings
response = client.embeddings(model=model, prompt=text)
embedding = response.get("embedding", [])
embeddings.append(embedding)
except Exception as e:
print(f"Error generating Ollama embedding: {e}")
# Return a zero embedding as fallback
embeddings.append([0.0] * 768) # typical dimension for text embeddings
return embeddings
@staticmethod
def get_nomic_embeddings(texts: List[str]) -> List[List[float]]:
"""
Generate embeddings using Nomic.
Args:
texts: List of texts to generate embeddings for
Returns:
List of embeddings as float arrays
"""
try:
# The new version of Nomic requires a Cohere API key, so we'll fall back to Ollama
# if we don't have one configured
cohere_api_key = Config.COHERE_API_KEY
if not cohere_api_key:
print("No Cohere API key found for Nomic embeddings, falling back to Ollama")
return EmbeddingService.get_ollama_embeddings(texts)
# Dynamically import nomic embedders to avoid startup errors if not available
from nomic.embedders import CohereEmbedder
# Create a Nomic embedding model using CohereEmbedder with API key
embedding_model = CohereEmbedder(cohere_api_key=cohere_api_key)
# Generate embeddings for the texts
embeddings = []
for text in texts:
embedding = embedding_model.embed(text)
embeddings.append(embedding)
return embeddings
except Exception as e:
print(f"Error generating Nomic embeddings: {e}")
# Fall back to Ollama embeddings
print("Falling back to Ollama embeddings")
return EmbeddingService.get_ollama_embeddings(texts)
@staticmethod
def get_embeddings(texts: Union[str, List[str]], use_nomic: Optional[bool] = None) -> List[List[float]]:
"""
Generate embeddings using either Nomic or Ollama.
Args:
texts: Text or list of texts to generate embeddings for
use_nomic: Whether to use Nomic (True) or Ollama (False), defaults to config setting
Returns:
List of embeddings as float arrays
"""
# Convert single text to list
if isinstance(texts, str):
texts = [texts]
# If use_nomic is not specified, use the config setting
if use_nomic is None:
use_nomic = Config.USE_NOMIC_EMBEDDINGS
# Generate embeddings using chosen method
if use_nomic:
return EmbeddingService.get_nomic_embeddings(texts)
else:
return EmbeddingService.get_ollama_embeddings(texts)
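
# --- Usage sketch (illustrative only) ---
# A minimal example of embedding two short texts. This assumes an Ollama server is reachable
# at the configured host with the embedding model pulled; if a call fails, the service falls
# back to zero vectors.
if __name__ == "__main__":
    vectors = EmbeddingService.get_embeddings(["VPN setup guide", "Printer troubleshooting"])
    for vector in vectors:
        print(len(vector), vector[:3])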

217
app/utils/sync_service.py Normal file
View File

@ -0,0 +1,217 @@
"""
Message synchronization service.
Handles periodic fetching of new messages from Zulip and adds them to ChromaDB.
"""
import os
import time
import logging
import threading
import pickle
from datetime import datetime, timedelta
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
# Configure logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_service")
class MessageSyncService:
"""Service for synchronizing messages from Zulip to ChromaDB."""
# File to store the last synced message ID
_SYNC_STATE_FILE = "sync_state.pickle"
def __init__(self, sync_interval=60, state_dir=None):
"""
Initialize the message sync service.
Args:
sync_interval (int): Sync interval in seconds (default: 60)
state_dir (str): Directory to store sync state file (default: current directory)
"""
self.sync_interval = sync_interval
self.is_running = False
self.sync_thread = None
self.state_dir = state_dir or os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
self.last_sync_time = None
self.last_message_id = None
self.batch_size = 50 # Default batch size
# Load the last synced state if available
self._load_sync_state()
def _set_batch_size(self, batch_size):
"""Set the batch size for syncing messages."""
if batch_size > 0:
self.batch_size = batch_size
logger.info(f"Set batch size to {batch_size}")
else:
logger.warning(f"Invalid batch size: {batch_size}, using default")
def _get_state_file_path(self):
"""Get the full path to the sync state file."""
return os.path.join(self.state_dir, self._SYNC_STATE_FILE)
def _load_sync_state(self):
"""Load the last sync state from disk."""
try:
state_file = self._get_state_file_path()
if os.path.exists(state_file):
with open(state_file, 'rb') as f:
state = pickle.load(f)
self.last_sync_time = state.get('last_sync_time')
self.last_message_id = state.get('last_message_id')
logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
else:
logger.info("No previous sync state found, starting fresh")
except Exception as e:
logger.error(f"Error loading sync state: {e}")
def _save_sync_state(self):
"""Save the current sync state to disk."""
try:
state = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id
}
state_file = self._get_state_file_path()
with open(state_file, 'wb') as f:
pickle.dump(state, f)
logger.info(f"Saved sync state: {state}")
except Exception as e:
logger.error(f"Error saving sync state: {e}")
def _sync_messages(self):
"""
Sync new messages from Zulip to ChromaDB.
This method fetches new messages from the Zulip database that haven't been
synchronized yet and adds them to ChromaDB.
"""
try:
# Set default sync time if not set yet
if not self.last_sync_time:
# Start with messages from the last 7 days if no previous sync
self.last_sync_time = datetime.now() - timedelta(days=7)
# Get messages newer than the last sync time
logger.info(f"Fetching messages since {self.last_sync_time} or ID > {self.last_message_id}")
# Get new messages
messages = []
if self.last_message_id:
# Get messages with ID greater than the last processed message ID
messages = ZulipDatabaseService.get_messages_newer_than_id(self.last_message_id, limit=self.batch_size)
else:
# Get messages from IT channels since the last sync time
messages = ZulipDatabaseService.get_messages_from_it_channels(
since=self.last_sync_time,
limit=self.batch_size
)
if not messages:
logger.info("No new messages found to sync")
return
logger.info(f"Found {len(messages)} new messages to sync")
# Add messages to ChromaDB
synced_count = 0
already_exists_count = 0
highest_message_id = self.last_message_id or 0
# Get a list of unique message IDs
unique_message_ids = set(message.id for message in messages)
logger.info(f"Found {len(unique_message_ids)} unique message IDs out of {len(messages)} messages")
for message in messages:
message_id = message.id
# Update highest message ID seen
if message_id > highest_message_id:
highest_message_id = message_id
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Check if this message already exists in ChromaDB to avoid duplicates
if ChromaDBService.message_exists(message_id):
already_exists_count += 1
logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
continue
# Add the message to ChromaDB
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
synced_count += 1
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
# Update the last sync time and message ID
self.last_sync_time = datetime.now()
if highest_message_id > (self.last_message_id or 0):
self.last_message_id = highest_message_id
# Save the sync state
self._save_sync_state()
logger.info(f"Sync completed. Added {synced_count} new messages to ChromaDB. Skipped {already_exists_count} existing messages. Last message ID: {self.last_message_id}")
except Exception as e:
logger.error(f"Error syncing messages: {e}")
def _sync_loop(self):
"""Main sync loop."""
while self.is_running:
try:
self._sync_messages()
# Sleep for the specified interval
for _ in range(self.sync_interval):
if not self.is_running:
break
time.sleep(1)
except Exception as e:
logger.error(f"Error in sync loop: {e}")
# Sleep a bit before retrying to avoid tight error loops
time.sleep(5)
def start(self):
"""Start the message sync service."""
if self.is_running:
logger.warning("Sync service is already running")
return
logger.info(f"Starting message sync service with interval {self.sync_interval} seconds")
self.is_running = True
self.sync_thread = threading.Thread(target=self._sync_loop)
self.sync_thread.daemon = True
self.sync_thread.start()
def stop(self):
"""Stop the message sync service."""
if not self.is_running:
logger.warning("Sync service is not running")
return
logger.info("Stopping message sync service")
self.is_running = False
if self.sync_thread:
self.sync_thread.join(timeout=10)
logger.info("Sync service stopped")
def sync_now(self):
"""Manually trigger a sync operation."""
logger.info("Manual sync triggered")
self._sync_messages()
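
# --- Usage sketch (illustrative only) ---
# A minimal sketch of running the sync service directly, assuming the Zulip database and
# ChromaDB are reachable with the configuration from the environment and the module is run
# with the project root on PYTHONPATH.
if __name__ == "__main__":
    sync_service = MessageSyncService(sync_interval=300)
    sync_service.sync_now()   # one immediate pass
    sync_service.start()      # then keep syncing every 5 minutes in the background
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        sync_service.stop()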

BIN
chromadb/bm25_index.pkl Normal file

Binary file not shown.

BIN
chromadb/chroma.sqlite3 Normal file

Binary file not shown.

141
compare_all_messages.py Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
"""
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
"""
import os
import sys
import logging
from collections import defaultdict
from datetime import datetime
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("compare_all_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message
def main():
"""Main function to compare Zulip messages with ChromaDB entries."""
logger.info("Starting simple comparison of ALL messages")
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
print("\n====================================================")
print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
print(f"Started at: {datetime.now()}")
print("====================================================\n")
try:
# Get Zulip DB session
session = get_db_session()
# Get ALL messages from Zulip
print("Fetching all messages from Zulip...")
zulip_messages = session.query(Message).all()
zulip_ids = set(str(msg.id) for msg in zulip_messages)
# Get channel counts
channel_counts = defaultdict(int)
for message in zulip_messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name is None:
channel_name = "Unknown Channel"
channel_counts[channel_name] += 1
# Print Zulip stats
print(f"\nZulip has {len(zulip_messages)} total messages across {len(channel_counts)} channels")
# Get ChromaDB collection
collection = get_chroma_collection()
if not collection:
print("ERROR: Failed to get ChromaDB collection")
return
# Get all entries from ChromaDB
print("Fetching all entries from ChromaDB...")
chroma_result = collection.get(include=['metadatas'])
if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
print("No entries found in ChromaDB")
return
# Get unique ChromaDB IDs
chroma_ids = set(chroma_result['ids'])
# Get channel counts for ChromaDB
chroma_channel_counts = defaultdict(int)
for i, _ in enumerate(chroma_result['ids']):
if chroma_result.get('metadatas') and len(chroma_result['metadatas']) > i:
metadata = chroma_result['metadatas'][i]
channel = metadata.get('channel', 'Unknown')
chroma_channel_counts[channel] += 1
# Print ChromaDB stats
print(f"ChromaDB has {len(chroma_result['ids'])} total entries")
print(f"ChromaDB has {len(chroma_ids)} unique entries")
# Calculate missing and extra
missing_from_chromadb = zulip_ids - chroma_ids
extra_in_chromadb = chroma_ids - zulip_ids
# Calculate overall sync percentage
sync_percentage = (len(chroma_ids) / len(zulip_ids) * 100) if zulip_ids else 0
# Print comparison results
print("\n====================================================")
print("COMPARISON RESULTS")
print("====================================================")
print(f"Zulip total messages: {len(zulip_messages)}")
print(f"ChromaDB total entries: {len(chroma_result['ids'])}")
print(f"ChromaDB unique entries: {len(chroma_ids)}")
print(f"Sync percentage: {sync_percentage:.2f}%")
print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")
# Print channel comparison
print("\nCHANNEL COMPARISON:")
print("-" * 70)
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
print("-" * 70)
all_channels = sorted(set(channel_counts.keys()) | set(chroma_channel_counts.keys()))
for channel in all_channels:
zulip_count = channel_counts.get(channel, 0)
chroma_count = chroma_channel_counts.get(channel, 0)
diff = zulip_count - chroma_count
percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0
print(f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} {diff:<10} {percentage:.2f}%")
# Print recommendations
print("\n====================================================")
print("RECOMMENDATIONS")
print("====================================================")
if sync_percentage < 100:
print("- Run ./sync_all_messages.py to sync missing messages")
else:
print("- All messages are synced!")
print(f"\nComparison completed at: {datetime.now()}")
except Exception as e:
print(f"Error during comparison: {e}")
logger.error(f"Error during comparison: {e}")
if __name__ == "__main__":
main()

333
compare_messages.py Executable file
View File

@ -0,0 +1,333 @@
#!/usr/bin/env python
"""
Script to compare the number of messages in Zulip channels to ChromaDB.
This script will gather statistics on message counts from both Zulip DB and ChromaDB,
then generate a report showing discrepancies between the two.
"""
import os
import sys
import logging
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import argparse
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("compare_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_chroma_collection, get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.models.zulip import Message, Stream, Recipient, UserProfile
from sqlalchemy import and_, not_, or_
from app.config import Config
def get_excluded_user_ids():
"""Get the user IDs of IT_Bot and ai_bot."""
session = get_db_session()
excluded_users = session.query(UserProfile).filter(
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
).all()
excluded_user_ids = [user.id for user in excluded_users]
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
return excluded_user_ids
def get_sandbox_recipient_id():
"""Get the recipient ID for the sandbox channel."""
session = get_db_session()
sandbox_stream = session.query(Stream).filter(
Stream.name == 'sandbox'
).first()
if sandbox_stream:
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
return sandbox_stream.recipient_id
else:
logger.warning("Sandbox channel not found")
return None
def get_zulip_message_counts(days=30):
"""
Get message counts from Zulip database for all channels except sandbox,
also excluding IT_Bot and ai_bot messages.
Args:
days: Number of days to look back
Returns:
dict: Channel name to message count mapping
"""
logger.info(f"Getting message counts from Zulip DB for the last {days} days")
try:
session = get_db_session()
# Get excluded user IDs (IT_Bot and ai_bot)
excluded_user_ids = get_excluded_user_ids()
# Get sandbox recipient ID to exclude
sandbox_recipient_id = get_sandbox_recipient_id()
# Build filters
since_date = datetime.now() - timedelta(days=days)
filters = [Message.date_sent >= since_date]
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if sandbox_recipient_id:
filters.append(Message.recipient_id != sandbox_recipient_id)
# Get all messages
messages = session.query(Message).filter(and_(*filters)).all()
# Get all channels except sandbox
streams = session.query(Stream).filter(
Stream.deactivated == False
).all()
# Filter out sandbox
included_streams = [stream for stream in streams
if stream.recipient_id != sandbox_recipient_id]
# Print the list of channels being analyzed
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
channels.sort(key=lambda x: x[0])
logger.info(f"Analyzing messages from {len(channels)} channels:")
for channel_name, recipient_id in channels:
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
# Count messages by channel
channel_counts = defaultdict(int)
message_ids = set()
for message in messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name and channel_name != "sandbox":
channel_counts[channel_name] += 1
message_ids.add(str(message.id)) # Convert to string for comparison with ChromaDB
# Print the message counts by channel
logger.info(f"Message counts by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
return {
'channel_counts': dict(channel_counts),
'total_count': len(messages),
'unique_count': len(message_ids),
'message_ids': message_ids
}
except Exception as e:
logger.error(f"Error getting Zulip message counts: {e}")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
def get_chromadb_message_counts():
"""
Get message counts from ChromaDB.
Returns:
dict: Statistics about ChromaDB messages
"""
logger.info("Getting message counts from ChromaDB")
try:
collection = get_chroma_collection()
if not collection:
logger.error("Failed to get ChromaDB collection")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
# Get all entries
result = collection.get(include=['metadatas'])
if not result or 'ids' not in result or not result['ids']:
logger.info("No entries found in ChromaDB")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
# Count messages by channel
channel_counts = defaultdict(int)
message_ids = set()
for i, message_id in enumerate(result['ids']):
# Extract channel from metadata
if result.get('metadatas') and len(result['metadatas']) > i:
metadata = result['metadatas'][i]
channel = metadata.get('channel', 'Unknown')
if channel != "sandbox":
channel_counts[channel] += 1
# Add to message_ids set
message_ids.add(message_id)
# Count duplicates
id_counts = Counter(result['ids'])
duplicates = {message_id: count for message_id, count in id_counts.items() if count > 1}
# Print the message counts by channel
logger.info(f"ChromaDB message counts by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
return {
'channel_counts': dict(channel_counts),
'total_count': len(result['ids']),
'unique_count': len(message_ids),
'message_ids': message_ids,
'duplicate_count': len(duplicates),
'duplicates': duplicates
}
except Exception as e:
logger.error(f"Error getting ChromaDB message counts: {e}")
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
def compare_counts(zulip_counts, chromadb_counts, days):
"""
Compare message counts between Zulip and ChromaDB.
Args:
zulip_counts: Counts from Zulip DB
chromadb_counts: Counts from ChromaDB
days: Number of days looked back
Returns:
dict: Comparison statistics
"""
logger.info("Comparing message counts")
# Get message IDs in Zulip but not in ChromaDB
zulip_ids = set(zulip_counts['message_ids'])
chroma_ids = set(chromadb_counts['message_ids'])
# Convert all IDs to strings for comparison
zulip_ids = {str(id) for id in zulip_ids}
chroma_ids = {str(id) for id in chroma_ids}
missing_from_chromadb = zulip_ids - chroma_ids
# Get message IDs in ChromaDB but not in Zulip (within the timeframe)
extra_in_chromadb = chroma_ids - zulip_ids
# Channel comparison
channel_comparison = {}
all_channels = set(zulip_counts['channel_counts'].keys()) | set(chromadb_counts['channel_counts'].keys())
for channel in all_channels:
zulip_count = zulip_counts['channel_counts'].get(channel, 0)
chromadb_count = chromadb_counts['channel_counts'].get(channel, 0)
difference = zulip_count - chromadb_count
channel_comparison[channel] = {
'zulip_count': zulip_count,
'chromadb_count': chromadb_count,
'difference': difference,
'percentage': (chromadb_count / zulip_count * 100) if zulip_count > 0 else 0
}
return {
'channel_comparison': channel_comparison,
'missing_from_chromadb': missing_from_chromadb,
'missing_count': len(missing_from_chromadb),
'extra_in_chromadb': extra_in_chromadb,
'extra_count': len(extra_in_chromadb),
'zulip_total': zulip_counts['total_count'],
'chromadb_total': chromadb_counts['total_count'],
'zulip_unique': zulip_counts['unique_count'],
'chromadb_unique': chromadb_counts['unique_count'],
'duplicate_count': chromadb_counts.get('duplicate_count', 0),
'days': days
}
def print_comparison_report(comparison):
"""
Print a report of the comparison.
Args:
comparison: Comparison statistics
"""
print("\n" + "=" * 80)
print(f"ZULIP TO CHROMADB COMPARISON REPORT (Last {comparison['days']} days)")
print("=" * 80)
print("\nSUMMARY:")
print(f"Zulip total messages: {comparison['zulip_total']}")
print(f"Zulip unique messages: {comparison['zulip_unique']}")
print(f"ChromaDB total entries: {comparison['chromadb_total']}")
print(f"ChromaDB unique entries: {comparison['chromadb_unique']}")
print(f"Duplicate entries in ChromaDB: {comparison['duplicate_count']}")
sync_percentage = (comparison['chromadb_unique'] / comparison['zulip_unique'] * 100) if comparison['zulip_unique'] > 0 else 0
print(f"Overall sync rate: {sync_percentage:.2f}%")
print(f"Messages in Zulip but missing from ChromaDB: {comparison['missing_count']}")
print(f"Entries in ChromaDB not found in recent Zulip data: {comparison['extra_count']}")
print("\nCHANNEL BREAKDOWN:")
print("-" * 80)
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'Sync %':<10}")
print("-" * 80)
for channel, stats in sorted(comparison['channel_comparison'].items()):
print(f"{channel:<25} {stats['zulip_count']:<10} {stats['chromadb_count']:<10} {stats['difference']:<10} {stats['percentage']:.2f}%")
if comparison['missing_count'] > 0:
print("\nMISSING MESSAGE IDS (Sample):")
print(", ".join(str(mid) for mid in list(comparison['missing_from_chromadb'])[:10]))
if comparison['duplicate_count'] > 0:
print("\nDUPLICATE ENTRIES DETECTED")
print(f"Total messages with duplicates: {comparison['duplicate_count']}")
print("\n" + "=" * 80)
print("RECOMMENDATIONS:")
if comparison['duplicate_count'] > 0:
print("- Run ./fix_duplicate_entries.py to remove duplicate entries")
if comparison['missing_count'] > 0:
print("- Run python sync_all_channels.py --force --days {0} to sync missing messages".format(comparison['days']))
if sync_percentage < 95:
print("- Investigate sync service settings and DB connection issues")
print("=" * 80 + "\n")
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(description="Compare Zulip channel messages to ChromaDB entries")
parser.add_argument("--days", type=int, default=30, help="Number of days to look back in Zulip history")
args = parser.parse_args()
logger.info("Starting message comparison")
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
# Get message counts
zulip_counts = get_zulip_message_counts(days=args.days)
chromadb_counts = get_chromadb_message_counts()
# Compare counts
comparison = compare_counts(zulip_counts, chromadb_counts, args.days)
# Print report
print_comparison_report(comparison)
logger.info("Comparison completed")
if __name__ == "__main__":
main()

23
ecosystem.config.js Normal file
View File

@ -0,0 +1,23 @@
module.exports = {
apps: [
{
name: 'zulip-bot',
script: './run_app.sh',
interpreter: '/bin/bash',
instances: 1,
autorestart: true,
watch: false,
max_memory_restart: '500M',
env: {
NODE_ENV: 'production',
FLASK_APP: 'app',
FLASK_RUN_PORT: 5100
},
log_date_format: 'YYYY-MM-DD HH:mm:ss',
error_file: 'logs/zulip-bot-error.log',
out_file: 'logs/zulip-bot-out.log',
merge_logs: true,
time: true
}
]
};

5618
logs/zulip-bot-error.log Normal file

File diff suppressed because it is too large

47
logs/zulip-bot-out.log Normal file
View File

@ -0,0 +1,47 @@
2025-05-14T17:36:16: Checking for processes on port 5100...
2025-05-14T17:36:16: No process found on port 5100
2025-05-14T17:36:16: Activating virtual environment...
2025-05-14T17:36:16: Starting Flask app on port 5100...
2025-05-14T17:36:17: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:36:17: * Serving Flask app 'app'
2025-05-14T17:36:17: * Debug mode: on
2025-05-14T17:38:41: Flask app stopped
2025-05-14T17:38:41: Checking for processes on port 5100...
2025-05-14T17:38:41: No process found on port 5100
2025-05-14T17:38:41: Activating virtual environment...
2025-05-14T17:38:41: Starting Flask app on port 5100...
2025-05-14T17:38:42: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:42: * Serving Flask app 'app'
2025-05-14T17:38:42: * Debug mode: on
2025-05-14T17:38:42: Flask app stopped
2025-05-14T17:38:42: Checking for processes on port 5100...
2025-05-14T17:38:42: Killing process 2093957 on port 5100
2025-05-14T17:38:42: Activating virtual environment...
2025-05-14T17:38:42: Starting Flask app on port 5100...
2025-05-14T17:38:43: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:43: * Serving Flask app 'app'
2025-05-14T17:38:43: * Debug mode: on
2025-05-14T17:38:51: Flask app stopped
2025-05-14T17:38:51: Checking for processes on port 5100...
2025-05-14T17:38:51: No process found on port 5100
2025-05-14T17:38:51: Activating virtual environment...
2025-05-14T17:38:51: Starting Flask app on port 5100...
2025-05-14T17:38:52: NumPy compatibility patch applied for ChromaDB
2025-05-14T17:38:52: * Serving Flask app 'app'
2025-05-14T17:38:52: * Debug mode: on
2025-05-15T09:29:44: Flask app stopped
2025-05-15T09:29:44: Checking for processes on port 5100...
2025-05-15T09:29:44: No process found on port 5100
2025-05-15T09:29:44: Activating virtual environment...
2025-05-15T09:29:44: Starting Flask app on port 5100...
2025-05-15T09:29:45: NumPy compatibility patch applied for ChromaDB
2025-05-15T09:29:45: * Serving Flask app 'app'
2025-05-15T09:29:45: * Debug mode: on
2025-05-15T09:29:46: Flask app stopped
2025-05-15T09:29:46: Checking for processes on port 5100...
2025-05-15T09:29:46: No process found on port 5100
2025-05-15T09:29:46: Activating virtual environment...
2025-05-15T09:29:46: Starting Flask app on port 5100...
2025-05-15T09:29:47: NumPy compatibility patch applied for ChromaDB
2025-05-15T09:29:47: * Serving Flask app 'app'
2025-05-15T09:29:47: * Debug mode: on

38
pm2_start.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
# Create logs directory if it doesn't exist
mkdir -p logs
# Make sure the run_app.sh script is executable
chmod +x run_app.sh
# Check if PM2 is installed
if ! command -v pm2 &> /dev/null; then
echo "PM2 is not installed. Installing..."
npm install -g pm2
fi
# Start the application with PM2
echo "Starting Zulip Bot service with PM2..."
pm2 start ecosystem.config.js
# Save the current PM2 configuration
echo "Saving PM2 configuration..."
pm2 save
# Configure PM2 to start on boot (may require sudo)
echo "Setting up PM2 to start on system boot..."
startup_output=$(sudo pm2 startup)
if echo "$startup_output" | grep -q "sudo"; then
    # If the output contains a sudo command, show it so the user can run it manually
    sudo_cmd=$(echo "$startup_output" | grep "sudo" | tail -n 1)
    echo "Run the following command with sudo privileges to enable PM2 on startup:"
    echo "$sudo_cmd"
else
    echo "PM2 startup configuration completed."
fi
echo "PM2 service setup complete. Zulip Bot is now running as a service."
echo "To check status: pm2 status"
echo "To view logs: pm2 logs zulip-bot"
echo "To restart: pm2 restart zulip-bot"
echo "To stop: pm2 stop zulip-bot"

81
project_config.md Normal file
View File

@ -0,0 +1,81 @@
# Project Configuration (LTM)
*This file contains the stable, long-term context for the project.*
*It should be updated infrequently, primarily when core goals, tech, or patterns change.*
---
## Core Goal
Develop a Python-based Flask application that integrates with Zulip to:
* Connect to a Zulip PostgreSQL database.
* Retrieve messages from the specified channels: **IT Discussions, IT Knowledge, IT Support**.
* Embed these messages into ChromaDB for efficient retrieval.
* Implement a Zulip bot named **IT\_Bot** that responds to user queries when mentioned using the format `@**IT_Bot**`.
* Generate context-based responses using the Gemini API.
---
## Tech Stack
* **Backend:** Python, Flask
* **Database:** PostgreSQL (Zulip DB), ChromaDB
* **AI Integration:** Gemini API
* **Bot Framework:** Zulip Bot API
* **Environment Management:** Virtualenv or Conda
* **Version Control:** Git
---
## Critical Patterns & Conventions
* **Database Access:**
* Store database credentials securely (e.g., environment variables or a secrets manager).
* Use SQLAlchemy ORM for structured queries.
* **Message Retrieval:**
* Implement periodic tasks to pull messages from the channels.
* Ensure idempotent operations to prevent duplicates in ChromaDB.
* **Embedding Strategy:**
* Embed messages with metadata (e.g., channel name, timestamp, user ID); see the sketch after this list.
* **Bot Activation:**
* The bot listens for `@**IT_Bot**` mentions.
* Upon activation, relevant context is fetched from ChromaDB, and a response is generated using the Gemini API.
* **Error Handling:**
* Implement structured logging.
* Gracefully handle API rate limits and database connection errors.
* **Security:**
* Store credentials and API keys in environment variables.
* Implement rate limiting to prevent abuse.
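
A minimal, illustrative sketch of the metadata embedding convention above (the `collection` handle and variable names are assumptions, not the project's actual API):

```
# Illustrative only: store a message together with its metadata in ChromaDB
metadata = {
    "channel": "IT Support",
    "subject": "VPN access",
    "sender": "Jane Doe",
    "timestamp": "2025-05-14T17:36:00",
}
collection.add(
    ids=[str(message_id)],
    documents=[message_content],
    metadatas=[metadata],
)
```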
---
## Key Constraints
* **Channels Monitored:** IT Discussions, IT Knowledge, IT Support
* **Response Trigger:** Mentions of `@**IT_Bot**`
* **Language Support:** English, Georgian
* **Message Volume:** Approximately 500 messages per day.
* **Deployment:** Local network server
* **Zulip Bot Config:**
```
[api]
email=IT_bot-bot@zulip.lci.ge
key=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
site=https://zulip.lci.ge
```
* **Database Connection:** `zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip`
* **Gemini API Key:** `AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s`

14
requirements.txt Normal file
View File

@ -0,0 +1,14 @@
Flask==2.2.3
Werkzeug==2.2.3
SQLAlchemy==2.0.9
psycopg2-binary==2.9.6
python-dotenv==1.0.0
chromadb==0.4.6
zulip==0.8.2
google-generativeai==0.3.1
ollama==0.1.5
nomic==2.0.3
cohere==5.15.0
rank-bm25==0.2.2
nltk==3.8.1
openai==1.30.4

87
reset_chromadb.py Executable file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Script to reset the ChromaDB completely and properly.
This fixes issues with the vector database that cause "Add of existing embedding ID" warnings.
"""
import os
import shutil
import logging
import chromadb
from chromadb.utils import embedding_functions
from app.utils.embeddings import EmbeddingService
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("reset_chromadb")
def main():
"""Main function to reset ChromaDB."""
try:
# Default ChromaDB path used in the application
chromadb_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb")
logger.info(f"Preparing to reset ChromaDB at {chromadb_path}")
# First check if the directory exists
if not os.path.exists(chromadb_path):
logger.info("ChromaDB directory doesn't exist yet. Creating a fresh one.")
os.makedirs(chromadb_path, exist_ok=True)
logger.info("ChromaDB directory created successfully.")
return
# Backup the existing ChromaDB directory
backup_path = f"{chromadb_path}_backup"
logger.info(f"Creating backup of ChromaDB at {backup_path}")
# Remove old backup if it exists
if os.path.exists(backup_path):
logger.info("Removing old backup")
shutil.rmtree(backup_path)
# Create backup
shutil.copytree(chromadb_path, backup_path)
logger.info("Backup created successfully")
# Delete the ChromaDB directory
logger.info("Removing existing ChromaDB directory")
shutil.rmtree(chromadb_path)
# Create fresh ChromaDB
logger.info("Creating fresh ChromaDB")
os.makedirs(chromadb_path, exist_ok=True)
# Initialize a fresh ChromaDB client and create a new collection
logger.info("Initializing fresh ChromaDB client")
client = chromadb.PersistentClient(
path=chromadb_path,
settings=chromadb.Settings(
allow_reset=True,
anonymized_telemetry=False
)
)
# Create a custom embedding function
class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
def __call__(self, texts):
return EmbeddingService.get_ollama_embeddings(texts)
# Create a fresh collection
logger.info("Creating fresh collection")
collection = client.create_collection(
name="zulip_messages",
metadata={
"hnsw:space": "cosine"
},
embedding_function=CustomEmbeddingFunction()
)
logger.info("ChromaDB reset completed successfully")
logger.info(f"To restore the backup if needed, delete {chromadb_path} and rename {backup_path} to {chromadb_path}")
except Exception as e:
logger.error(f"Error resetting ChromaDB: {e}")
logger.error("ChromaDB reset failed. Please check the error and try again.")
if __name__ == "__main__":
main()

26
run_app.sh Executable file
View File

@ -0,0 +1,26 @@
#!/bin/bash
# Kill any process using port 5100
echo "Checking for processes on port 5100..."
pid=$(lsof -ti:5100)
if [ -n "$pid" ]; then
echo "Killing process $pid on port 5100"
kill -9 $pid
else
echo "No process found on port 5100"
fi
# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate
# Set Flask environment variables
export FLASK_APP=app
export FLASK_RUN_PORT=5100
# Run the Flask app
echo "Starting Flask app on port 5100..."
flask run --port=5100 --no-reload
# This script won't reach here unless the flask app is interrupted
echo "Flask app stopped"

13
setup.sh Executable file
View File

@ -0,0 +1,13 @@
#!/bin/bash
# Create a virtual environment
python3.11 -m venv venv
# Activate the virtual environment
source venv/bin/activate
# Install the required packages
pip install -r requirements.txt
echo "Setup completed successfully!"
echo "To activate the virtual environment, run: source venv/bin/activate"

580
sync_all_channels.py Executable file
View File

@ -0,0 +1,580 @@
#!/usr/bin/env python
"""
Script to sync messages from all Zulip channels (except sandbox) to ChromaDB.
This script also excludes messages from IT_Bot and ai_bot users.
"""
import os
import sys
import argparse
import logging
import signal
import time
from datetime import datetime, timedelta
import pickle
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.models.zulip import Message, Stream, Recipient, UserProfile
from sqlalchemy import and_, not_, or_
from app.db import get_db_session
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_all_channels")
# Global flag for graceful shutdown
is_shutting_down = False
# Signal handler for CTRL+C
def signal_handler(sig, frame):
global is_shutting_down
logger.info("Received shutdown signal, completing current operation before exiting...")
is_shutting_down = True
# Register signal handler
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
class AllChannelSyncService:
"""Service for syncing messages from all channels except sandbox."""
# File to store the last synced message ID
_SYNC_STATE_FILE = "all_channels_sync_state.pickle"
def __init__(self, batch_size=200, include_direct_messages=False):
"""
Initialize the sync service.
Args:
batch_size (int): Number of messages to process in each batch
include_direct_messages (bool): Whether to include direct messages
"""
self.batch_size = batch_size
self.last_sync_time = None
self.last_message_id = None
self.state_dir = os.path.dirname(os.path.abspath(__file__))
self.channels_to_sync = []
self.include_direct_messages = include_direct_messages
# Load the last synced state if available
self._load_sync_state()
def _get_state_file_path(self):
"""Get the full path to the sync state file."""
return os.path.join(self.state_dir, self._SYNC_STATE_FILE)
def _load_sync_state(self):
"""Load the last sync state from disk."""
try:
state_file = self._get_state_file_path()
if os.path.exists(state_file):
with open(state_file, 'rb') as f:
state = pickle.load(f)
self.last_sync_time = state.get('last_sync_time')
self.last_message_id = state.get('last_message_id')
logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
else:
logger.info("No previous sync state found, starting fresh")
except Exception as e:
logger.error(f"Error loading sync state: {e}")
def _save_sync_state(self, channel_counts=None):
"""Save the current sync state to disk."""
try:
state = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id
}
if channel_counts:
state['channel_counts'] = channel_counts
state_file = self._get_state_file_path()
# Save to a temporary file first, then rename to avoid corruption if interrupted
temp_file = state_file + '.temp'
with open(temp_file, 'wb') as f:
pickle.dump(state, f)
f.flush()
os.fsync(f.fileno()) # Ensure data is written to disk
# Rename the temp file to the actual state file (atomic operation)
os.rename(temp_file, state_file)
logger.info(f"Saved sync state: {state}")
except Exception as e:
logger.error(f"Error saving sync state: {e}")
def get_excluded_user_ids(self):
"""Get the user IDs of IT_Bot and ai_bot."""
session = get_db_session()
excluded_users = session.query(UserProfile).filter(
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
).all()
excluded_user_ids = [user.id for user in excluded_users]
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
return excluded_user_ids
def get_sandbox_recipient_id(self):
"""Get the recipient ID for the sandbox channel."""
session = get_db_session()
sandbox_stream = session.query(Stream).filter(
Stream.name == 'sandbox'
).first()
if sandbox_stream:
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
return sandbox_stream.recipient_id
else:
logger.warning("Sandbox channel not found")
return None
def get_channels_to_sync(self):
"""Get all active channels except sandbox with their recipient IDs."""
session = get_db_session()
sandbox_recipient_id = self.get_sandbox_recipient_id()
# Get all active streams
streams = session.query(Stream).filter(
Stream.deactivated == False
).all()
# Filter out sandbox
included_streams = [stream for stream in streams
if stream.recipient_id != sandbox_recipient_id]
# Create a list of channels to sync with their recipient IDs
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
# Sort by channel name
channels.sort(key=lambda x: x[0])
# Print the list of channels
logger.info(f"Found {len(channels)} channels to sync:")
for channel_name, recipient_id in channels:
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
self.channels_to_sync = channels
# Return just the recipient IDs for filtering
recipient_ids = [recipient_id for _, recipient_id in channels]
return recipient_ids
def get_messages_newer_than_id(self, message_id, excluded_user_ids, excluded_recipient_id):
"""Get messages with ID greater than the specified ID."""
session = get_db_session()
# Build filters
filters = [Message.id > message_id]
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
messages = session.query(Message).filter(
and_(*filters)
).order_by(Message.id.asc()).limit(self.batch_size).all()
return messages
def get_messages_for_timeframe(self, since, excluded_user_ids, excluded_recipient_id, limit=1000, all_messages=False):
"""
Get messages from the specified timeframe.
Args:
since (datetime): Get messages after this datetime
excluded_user_ids (list): User IDs to exclude
excluded_recipient_id (int): Recipient ID to exclude
limit (int): Maximum number of messages to return
all_messages (bool): If True, ignore the since parameter and get all messages
Returns:
list: List of Message objects
"""
session = get_db_session()
# Build filters
filters = []
# Add date filter if specified and not getting all messages
if since and not all_messages:
filters.append(Message.date_sent >= since)
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
# Get results
query = session.query(Message)
if filters:
query = query.filter(and_(*filters))
messages = query.order_by(Message.id.asc()).limit(limit).all()  # ascending so ID-based batch continuation does not skip older messages
return messages
def get_channel_message_counts(self, since, excluded_user_ids, excluded_recipient_id, all_messages=False):
"""Get message counts by channel for the specified timeframe."""
session = get_db_session()
# Build filters
filters = []
# Add date filter if specified and not getting all messages
if since and not all_messages:
filters.append(Message.date_sent >= since)
# Add filter for excluded users
if excluded_user_ids:
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
# Add filter for excluded recipient (sandbox)
if excluded_recipient_id:
filters.append(Message.recipient_id != excluded_recipient_id)
# Get all messages
query = session.query(Message)
if filters:
query = query.filter(and_(*filters))
messages = query.all()
# Count messages by channel
channel_counts = {}
for message in messages:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
if channel_name:
if channel_name not in channel_counts:
channel_counts[channel_name] = 0
channel_counts[channel_name] += 1
# Sort by channel name
sorted_counts = {k: channel_counts[k] for k in sorted(channel_counts.keys())}
# Print the message counts by channel
logger.info(f"Message counts by channel:")
for channel, count in sorted_counts.items():
logger.info(f"- {channel}: {count} messages")
return sorted_counts
def sync_messages(self, days=None, force=False, max_messages=5000, all_messages=False):
"""
Sync messages from all Zulip channels to ChromaDB.
Args:
days (int): Number of days to look back for messages (default: use sync state)
force (bool): Whether to force sync all messages from the lookback period
max_messages (int): Maximum total number of messages to sync
all_messages (bool): If True, ignore date filtering and sync all messages
"""
global is_shutting_down
try:
# Get excluded user IDs (IT_Bot and ai_bot)
excluded_user_ids = self.get_excluded_user_ids()
# Get sandbox recipient ID to exclude
excluded_recipient_id = self.get_sandbox_recipient_id()
# Get all channels to sync and their recipient IDs
self.get_channels_to_sync()
# Reset sync state if forced
if force:
if all_messages:
self.last_sync_time = None
self.last_message_id = None
logger.info("Force syncing ALL messages regardless of date")
elif days:
self.last_sync_time = datetime.now() - timedelta(days=days)
self.last_message_id = None
logger.info(f"Force syncing messages from the last {days} days")
# Set default sync time if not set yet and not syncing all messages
if not self.last_sync_time and not all_messages and not force:
# Start with messages from the last 30 days if no previous sync
self.last_sync_time = datetime.now() - timedelta(days=30 if not days else days)
logger.info(f"No previous sync time, starting from {self.last_sync_time}")
# Count total messages to sync if forcing
total_messages = 0
if force:
since_date = None if all_messages else (datetime.now() - timedelta(days=days if days else 30))
all_messages_count = self.get_messages_for_timeframe(
since=since_date,
excluded_user_ids=excluded_user_ids,
excluded_recipient_id=excluded_recipient_id,
limit=max_messages,
all_messages=all_messages
)
total_messages = len(all_messages_count)
logger.info(f"Found a total of {total_messages} messages to sync")
# Get message counts by channel
self.get_channel_message_counts(since_date, excluded_user_ids, excluded_recipient_id, all_messages=all_messages)
# Run multiple batches of sync
total_synced = 0
already_exists_count = 0
highest_message_id = self.last_message_id or 0
batch_count = 0
# Track synced messages by channel
channel_sync_counts = {}
# Time to save state
last_save_time = time.time()
save_interval = 10 # Save state every 10 seconds
while not is_shutting_down:
batch_count += 1
logger.info(f"Running batch {batch_count}, synced {total_synced} messages so far")
# Get new messages
messages = []
if self.last_message_id:
# Get messages with ID greater than the last processed message ID
messages = self.get_messages_newer_than_id(
self.last_message_id,
excluded_user_ids,
excluded_recipient_id
)
else:
# Get messages since the last sync time or all messages
messages = self.get_messages_for_timeframe(
since=self.last_sync_time,
excluded_user_ids=excluded_user_ids,
excluded_recipient_id=excluded_recipient_id,
limit=self.batch_size,
all_messages=all_messages
)
if not messages:
logger.info("No new messages found to sync")
break
logger.info(f"Found {len(messages)} new messages to sync in batch {batch_count}")
# Process each message
synced_in_batch = 0
for message in messages:
# Check if we need to shutdown
if is_shutting_down:
logger.info("Shutdown requested, saving state and exiting...")
break
message_id = message.id
# Update highest message ID seen
if message_id > highest_message_id:
highest_message_id = message_id
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Skip excluded channels and users
if channel_name == "sandbox":
continue
if sender_name in ["IT_Bot", "ai_bot"]:
continue
# Skip direct messages unless explicitly included
if not self.include_direct_messages and channel_name in ["Direct Message", "Group Message"]:
logger.debug(f"Skipping {channel_name} message {message_id} (use --include-direct-messages to include)")
continue
# Check if this message already exists in ChromaDB to avoid duplicates
if ChromaDBService.message_exists(message_id):
already_exists_count += 1
logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
continue
# Handle None channel names
if channel_name is None:
channel_name = "Unknown Channel"
logger.warning(f"Found message {message_id} with None channel name, using '{channel_name}' instead")
# Add the message to ChromaDB
try:
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
synced_in_batch += 1
total_synced += 1
# Update channel counts
if channel_name not in channel_sync_counts:
channel_sync_counts[channel_name] = 0
channel_sync_counts[channel_name] += 1
# Update the last message ID after each successful addition
self.last_message_id = message_id
# Save state periodically
current_time = time.time()
if current_time - last_save_time > save_interval:
self.last_sync_time = datetime.now()
self._save_sync_state(channel_sync_counts)
last_save_time = current_time
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
except Exception as e:
logger.error(f"Error adding message {message_id} to ChromaDB: {e}")
# Continue with next message
# Update the last sync time and message ID at the end of the batch
self.last_sync_time = datetime.now()
if highest_message_id > (self.last_message_id or 0):
self.last_message_id = highest_message_id
# Save the sync state after each batch
self._save_sync_state(channel_sync_counts)
last_save_time = time.time()
logger.info(f"Batch {batch_count} completed. Added {synced_in_batch} new messages to ChromaDB. " +
f"Total synced: {total_synced}. Last message ID: {self.last_message_id}")
# Check if we've reached the max messages limit
if total_synced >= max_messages:
logger.info(f"Reached max messages limit of {max_messages}")
break
# If this batch had fewer messages than the batch size, we're done
if len(messages) < self.batch_size:
logger.info("Fetched fewer messages than batch size, assuming all messages have been processed")
break
# Final state save with channel statistics
if is_shutting_down:
logger.info("Shutdown signal received, saving final state...")
# Print synced messages by channel
if channel_sync_counts:
logger.info("Messages synced by channel:")
try:
# Use a safe sorting method that handles None keys
sorted_items = sorted(channel_sync_counts.items(),
key=lambda item: item[0] if item[0] is not None else "")
for channel, count in sorted_items:
channel_name = channel if channel is not None else "Unknown Channel"
logger.info(f"- {channel_name}: {count} messages")
except Exception as e:
logger.warning(f"Error displaying channel stats: {e}")
# Fallback display without sorting
for channel, count in channel_sync_counts.items():
channel_name = channel if channel is not None else "Unknown Channel"
logger.info(f"- {channel_name}: {count} messages")
# Return the final stats
stats = {
'last_sync_time': self.last_sync_time,
'last_message_id': self.last_message_id,
'total_synced': total_synced,
'batches': batch_count,
'already_exists': already_exists_count,
'channel_counts': channel_sync_counts
}
logger.info(f"Sync completed. Current state: {stats}")
return stats
except Exception as e:
logger.error(f"Error syncing messages: {e}")
# Save state on error
self._save_sync_state()
return None
def main():
"""Main entry point."""
# Parse command line arguments
parser = argparse.ArgumentParser(description="Sync messages from all Zulip channels to ChromaDB")
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
parser.add_argument("--max-messages", type=int, default=10000, help="Maximum total number of messages to sync")
parser.add_argument("--include-direct-messages", action="store_true", help="Include direct and group messages in sync")
parser.add_argument("--all-messages", action="store_true", help="Sync all messages regardless of date")
args = parser.parse_args()
# Create the Flask app
app = create_app()
with app.app_context():
try:
# Initialize sync service
sync_service = AllChannelSyncService(
batch_size=args.batch_size,
include_direct_messages=args.include_direct_messages
)
# Sync messages
stats = sync_service.sync_messages(
days=args.days,
force=args.force,
max_messages=args.max_messages,
all_messages=args.all_messages
)
if stats:
channel_counts = stats.get('channel_counts', {})
print(f"\nSync completed at {datetime.now()}")
print(f"Last sync time: {stats['last_sync_time']}")
print(f"Last message ID: {stats['last_message_id']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Number of batches: {stats['batches']}")
print(f"Messages already in DB: {stats['already_exists']}")
if channel_counts:
print("\nMessages synced by channel:")
try:
# Use a safe sorting method that handles None keys
sorted_items = sorted(channel_counts.items(),
key=lambda item: item[0] if item[0] is not None else "")
for channel, count in sorted_items:
channel_name = channel if channel is not None else "Unknown Channel"
print(f"- {channel_name}: {count} messages")
except Exception as e:
# Fallback display without sorting
for channel, count in channel_counts.items():
channel_name = channel if channel is not None else "Unknown Channel"
print(f"- {channel_name}: {count} messages")
except KeyboardInterrupt:
print("\nSync process interrupted by user. State has been saved.")
logger.info("Sync process interrupted by user. State has been saved.")
if __name__ == "__main__":
main()

157
sync_all_messages.py Executable file
View File

@ -0,0 +1,157 @@
#!/usr/bin/env python
"""
Script to sync ALL messages from Zulip to ChromaDB with NO restrictions.
This script will sync everything - all channels, all users, all time periods.
"""
import os
import sys
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_all_messages")
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.db import get_db_session
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.models.zulip import Message
def sync_all_messages():
"""
Sync ALL messages from Zulip to ChromaDB with no restrictions.
All messages are processed in a single pass.
Returns:
dict: Statistics about the sync
"""
logger.info("Starting unrestricted sync of ALL messages in one pass")
session = get_db_session()
total_synced = 0
already_exists = 0
channel_counts = {}
# Get all messages at once
logger.info("Fetching ALL messages from Zulip database")
messages = session.query(Message).order_by(Message.id).all()
total_messages = len(messages)
logger.info(f"Found {total_messages} total messages in Zulip database")
# Process all messages
logger.info("Processing all messages")
for i, message in enumerate(messages):
message_id = message.id
# Log progress at intervals
if i % 500 == 0 and i > 0:
logger.info(f"Progress: {i}/{total_messages} messages processed ({(i/total_messages)*100:.1f}%)")
# Get message details
try:
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
# Handle None channel names
if channel_name is None:
channel_name = "Unknown Channel"
logger.warning(f"Message {message_id} has None channel name, using '{channel_name}' instead")
# Check if message already exists in ChromaDB
if ChromaDBService.message_exists(message_id):
already_exists += 1
continue
# Add message to ChromaDB
success = ChromaDBService.add_message(
message_id=message_id,
content=message.content,
channel_name=channel_name,
subject=message.subject,
sender_name=sender_name,
date_sent=message.date_sent
)
if success:
total_synced += 1
# Update channel counts
if channel_name not in channel_counts:
channel_counts[channel_name] = 0
channel_counts[channel_name] += 1
else:
logger.warning(f"Failed to add message {message_id} to ChromaDB")
except Exception as e:
logger.error(f"Error processing message {message_id}: {e}")
# Print channel statistics
if channel_counts:
logger.info("Messages synced by channel:")
for channel, count in sorted(channel_counts.items()):
logger.info(f"- {channel}: {count} messages")
# Return statistics
return {
'total_messages': total_messages,
'total_synced': total_synced,
'already_exists': already_exists,
'channel_counts': channel_counts
}
def main():
"""Main entry point."""
try:
# Create the Flask app (needed for context)
app = create_app()
with app.app_context():
print("\n====================================================")
print("STARTING UNRESTRICTED SYNC OF ALL ZULIP MESSAGES")
print(f"Started at: {datetime.now()}")
print("====================================================\n")
# Sync all messages
start_time = datetime.now()
stats = sync_all_messages()
end_time = datetime.now()
duration = end_time - start_time
# Print summary
print("\n====================================================")
print("SYNC COMPLETE")
print(f"Started at: {start_time}")
print(f"Completed at: {end_time}")
print(f"Duration: {duration}")
print(f"Total messages in Zulip: {stats['total_messages']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Messages already in ChromaDB: {stats['already_exists']}")
# Print channel counts
if stats['channel_counts']:
print("\nMessages synced by channel:")
for channel, count in sorted(stats['channel_counts'].items()):
print(f"- {channel}: {count} messages")
print("====================================================\n")
except KeyboardInterrupt:
print("\nSync process interrupted by user")
logger.info("Sync process interrupted by user")
except Exception as e:
print(f"\nError during sync: {e}")
logger.error(f"Error during sync: {e}")
if __name__ == "__main__":
main()

132
sync_and_verify.sh Executable file
View File

@ -0,0 +1,132 @@
#!/bin/bash
# Script to sync all messages from all channels (except sandbox) and verify
# they're in ChromaDB
# Set up logging
LOG_FILE="logs/sync_and_verify_$(date +%Y%m%d_%H%M%S).log"
mkdir -p logs
# Make sure scripts are executable
chmod +x sync_all_channels.py
chmod +x compare_messages.py
chmod +x fix_unknown_channels.py
echo "======================================================"
echo " ZULIP CHANNEL SYNC AND VERIFY PROCESS"
echo " $(date)"
echo " Logging to: $LOG_FILE"
echo "======================================================"
echo ""
echo "=====================================================" | tee -a "$LOG_FILE"
echo "SYNC AND VERIFY PROCESS - $(date)" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
# Activate virtual environment if it exists
if [ -d "venv" ]; then
echo "Activating virtual environment..." | tee -a "$LOG_FILE"
source venv/bin/activate
fi
# Set parameters for the sync
DAYS_TO_SYNC=365 # Used for verification only
MAX_MESSAGES=250
FORCE_SYNC=true
INCLUDE_DIRECT_MESSAGES=true
ALL_MESSAGES=true # Sync all messages regardless of date
TOTAL_BATCHES=1000 # Number of batches to run
echo "Configuration:" | tee -a "$LOG_FILE"
echo "- Maximum messages per batch: $MAX_MESSAGES" | tee -a "$LOG_FILE"
echo "- Force sync: $FORCE_SYNC" | tee -a "$LOG_FILE"
echo "- Include direct messages: $INCLUDE_DIRECT_MESSAGES" | tee -a "$LOG_FILE"
echo "- Sync all messages: $ALL_MESSAGES" | tee -a "$LOG_FILE"
echo "- Number of batches: $TOTAL_BATCHES" | tee -a "$LOG_FILE"
echo "- Days for verification: $DAYS_TO_SYNC" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Step 1: Sync messages in multiple batches
echo "" | tee -a "$LOG_FILE"
echo "Step 1: Syncing messages from all channels (except sandbox)..." | tee -a "$LOG_FILE"
echo "This will exclude messages from IT_Bot and ai_bot" | tee -a "$LOG_FILE"
echo "Running $TOTAL_BATCHES batches of $MAX_MESSAGES messages each" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Build the base command
SYNC_CMD="python sync_all_channels.py --max-messages $MAX_MESSAGES"
if [ "$INCLUDE_DIRECT_MESSAGES" = true ]; then
SYNC_CMD="$SYNC_CMD --include-direct-messages"
fi
if [ "$ALL_MESSAGES" = true ]; then
SYNC_CMD="$SYNC_CMD --all-messages"
fi
# Run multiple batches
for ((i=1; i<=$TOTAL_BATCHES; i++))
do
echo "Running batch $i of $TOTAL_BATCHES..." | tee -a "$LOG_FILE"
BATCH_CMD="$SYNC_CMD"
# If ALL_MESSAGES is true, we should use --force for all batches to ensure we get historical data,
# provided that FORCE_SYNC is also enabled.
# If ALL_MESSAGES is false, then --force (if enabled by FORCE_SYNC) applies only to the first batch.
if [ "$FORCE_SYNC" = true ]; then
if [ "$ALL_MESSAGES" = true ] || [ $i -eq 1 ]; then
BATCH_CMD="$BATCH_CMD --force"
fi
fi
echo "Running: $BATCH_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run the sync command
$BATCH_CMD | tee -a "$LOG_FILE"
# Pause between batches
if [ $i -lt $TOTAL_BATCHES ]; then
echo "Pausing for 5 seconds between batches..." | tee -a "$LOG_FILE"
sleep 5
fi
done
# Step 2: Fix Unknown Channel entries
echo "" | tee -a "$LOG_FILE"
echo "Step 2: Fixing 'Unknown Channel' entries..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run the fix unknown channels script
FIX_CMD="python fix_unknown_channels.py"
echo "Running: $FIX_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
$FIX_CMD | tee -a "$LOG_FILE"
# Step 3: Verify all messages are in ChromaDB
echo "" | tee -a "$LOG_FILE"
echo "Step 3: Verifying all messages are in ChromaDB..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run comparison with the specified number of days for verification
COMPARE_CMD="python compare_messages.py --days $DAYS_TO_SYNC"
echo "Running: $COMPARE_CMD" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
$COMPARE_CMD | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
echo "Sync and verification process completed at $(date)" | tee -a "$LOG_FILE"
echo "See $LOG_FILE for complete log" | tee -a "$LOG_FILE"
echo "=====================================================" | tee -a "$LOG_FILE"
echo ""
echo "======================================================"
echo " SYNC AND VERIFICATION PROCESS COMPLETED"
echo " $(date)"
echo " Log file: $LOG_FILE"
echo "======================================================"
# If we activated a virtual environment, deactivate it
if [ -n "$VIRTUAL_ENV" ]; then
deactivate
fi

141
sync_messages.py Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python
"""
Script to manually sync messages from Zulip to ChromaDB.
This can be run standalone or as a scheduled cron job.
"""
import os
import sys
import argparse
import logging
from datetime import datetime, timedelta
# Add the current directory to the path so we can import the app module
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Apply NumPy compatibility patch for ChromaDB
from app.utils import patch_chromadb_numpy
patch_chromadb_numpy()
from app import create_app
from app.utils.sync_service import MessageSyncService
from app.db.zulip_service import ZulipDatabaseService
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("sync_messages")
def sync_messages(days=None, force=False, batch_size=200, max_messages=5000):
"""
Sync messages from Zulip to ChromaDB.
Args:
days (int): Number of days to look back for messages (default: use sync state)
force (bool): Whether to force sync all messages from the lookback period
batch_size (int): Number of messages to process in each batch
max_messages (int): Maximum total number of messages to sync
"""
# Create the Flask app
app = create_app()
with app.app_context():
sync_service = MessageSyncService()
if force and days:
# If force syncing for a specific number of days, reset the sync state
sync_service.last_sync_time = datetime.now() - timedelta(days=days)
sync_service.last_message_id = None
logger.info(f"Force syncing messages from the last {days} days")
# Count total messages to sync
if force:
# Query to get message count
since_date = datetime.now() - timedelta(days=days if days else 30)
all_messages = ZulipDatabaseService.get_messages_from_it_channels(
since=since_date if since_date else None,
limit=5000
)
total_messages = len(all_messages)
logger.info(f"Found a total of {total_messages} messages to sync")
# Run multiple batches of sync
total_synced = 0
batch_count = 0
# In force mode, we need to manually run multiple batches
if force:
while total_synced < min(total_messages, max_messages):
# Manual sync with our custom batch size
logger.info(f"Running batch {batch_count+1}, synced {total_synced} messages so far")
# For first batch, we already reset the sync state above
# For subsequent batches, we'll use the last_message_id that was set
# Run the sync
sync_service._set_batch_size(batch_size)
sync_service.sync_now()
# Update counters
batch_count += 1
# Check how many we've synced by looking at highest message ID
if sync_service.last_message_id:
# We've synced up to this message ID
synced_in_batch = ZulipDatabaseService.count_messages_up_to_id(
sync_service.last_message_id,
since=since_date if since_date else None
)
# Update total (use max to ensure we don't decrease if count is wrong)
total_synced = max(total_synced, synced_in_batch)
logger.info(f"Processed {synced_in_batch} messages out of {total_messages}")
# If we've synced all messages or reached our limit, break
if synced_in_batch >= total_messages or synced_in_batch >= max_messages:
break
else:
# If no message ID was set, something went wrong
logger.warning("No message ID set after sync, may not have found any messages")
break
else:
# Just run a single sync with default settings
sync_service.sync_now()
# Get the stats
stats = {
'last_sync_time': sync_service.last_sync_time,
'last_message_id': sync_service.last_message_id,
'total_synced': total_synced,
'batches': batch_count
}
logger.info(f"Sync completed. Current state: {stats}")
return stats
if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser(description="Sync messages from Zulip to ChromaDB")
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
parser.add_argument("--max-messages", type=int, default=5000, help="Maximum total number of messages to sync")
args = parser.parse_args()
# Sync messages
stats = sync_messages(
days=args.days,
force=args.force,
batch_size=args.batch_size,
max_messages=args.max_messages
)
print(f"\nSync completed at {datetime.now()}")
print(f"Last sync time: {stats['last_sync_time']}")
print(f"Last message ID: {stats['last_message_id']}")
print(f"Total messages synced: {stats['total_synced']}")
print(f"Number of batches: {stats['batches']}")

BIN
sync_state.pickle Normal file

Binary file not shown.

42
update_to_openai.sh Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
# Script to migrate from Google Gemini to OpenAI GPT-4o
echo "Migrating from Google Gemini to OpenAI GPT-4o..."
# 1. Activate the virtual environment
source venv/bin/activate
# 2. Install OpenAI package
echo "Installing OpenAI package..."
pip install openai==1.30.4
# 3. Prompt for OpenAI API key
read -p "Enter your OpenAI API key: " openai_api_key
# 4. Update the .env file
echo "Updating .env file..."
if grep -q "OPENAI_API_KEY" .env; then
# Replace existing OPENAI_API_KEY
sed -i "s/OPENAI_API_KEY=.*/OPENAI_API_KEY=$openai_api_key/" .env
else
# Add new OPENAI_API_KEY entry
sed -i "/GEMINI_API_KEY/i # OpenAI GPT-4o (new)\nOPENAI_API_KEY=$openai_api_key\n" .env
fi
# 5. Reset and rebuild the ChromaDB
echo "Do you want to reset and rebuild the ChromaDB? (y/n)"
read -p "> " rebuild_db
if [[ $rebuild_db == "y" || $rebuild_db == "Y" ]]; then
echo "Resetting ChromaDB..."
./reset_chromadb.py
echo "Rebuilding database (syncing past 7 days of messages)..."
python sync_messages.py --force --days 7
fi
echo "Migration completed successfully!"
echo "Please restart your application to apply the changes:"
echo " 1. Stop the current process"
echo " 2. Run ./run_app.sh to start with OpenAI integration"

247
venv/bin/Activate.ps1 Normal file
View File

@ -0,0 +1,247 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

63
venv/bin/activate Normal file
View File

@ -0,0 +1,63 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# Call hash to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
hash -r 2> /dev/null
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV=/home/adminuser/zulip_bots/venv
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/"bin":$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1='(venv) '"${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT='(venv) '
export VIRTUAL_ENV_PROMPT
fi
# Call hash to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
hash -r 2> /dev/null

26
venv/bin/activate.csh Normal file
View File

@ -0,0 +1,26 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV /home/adminuser/zulip_bots/venv
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = '(venv) '"$prompt"
setenv VIRTUAL_ENV_PROMPT '(venv) '
endif
alias pydoc python -m pydoc
rehash

69
venv/bin/activate.fish Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
set -e _OLD_FISH_PROMPT_OVERRIDE
# prevents error when using nested fish instances (Issue #93858)
if functions -q _old_fish_prompt
functions -e fish_prompt
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV /home/adminuser/zulip_bots/venv
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT '(venv) '
end

8
venv/bin/coloredlogs Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from coloredlogs.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/distro Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from distro.distro import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/dotenv Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from dotenv.__main__ import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/f2py Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from numpy.f2py.f2py2e import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/fastavro Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from fastavro.__main__ import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/flask Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from flask.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/httpx Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from httpx import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/huggingface-cli Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from huggingface_hub.commands.huggingface_cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/humanfriendly Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from humanfriendly.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/isympy Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from isympy import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/markdown-it Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from markdown_it.cli.parse import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/nltk Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from nltk.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/nomic Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from nomic.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

8
venv/bin/normalizer Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli.cli_detect())

8
venv/bin/numpy-config Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from numpy._configtool import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/onnxruntime_test Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from onnxruntime.tools.onnxruntime_test import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/openai Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from openai.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3.11 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pygmentize Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from pygments.cmdline import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pyrsa-decrypt Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import decrypt
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(decrypt())

8
venv/bin/pyrsa-encrypt Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import encrypt
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(encrypt())

8
venv/bin/pyrsa-keygen Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import keygen
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(keygen())

8
venv/bin/pyrsa-priv2pub Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.util import private_to_public
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(private_to_public())

8
venv/bin/pyrsa-sign Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import sign
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(sign())

8
venv/bin/pyrsa-verify Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from rsa.cli import verify
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(verify())

1
venv/bin/python Symbolic link
View File

@ -0,0 +1 @@
python3.11

1
venv/bin/python3 Symbolic link
View File

@ -0,0 +1 @@
python3.11

1
venv/bin/python3.11 Symbolic link
View File

@ -0,0 +1 @@
/usr/bin/python3.11

8
venv/bin/tqdm Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from tqdm.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/uvicorn Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from uvicorn.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/watchfiles Executable file
View File

@ -0,0 +1,8 @@
#!/home/adminuser/zulip_bots/venv/bin/python3.11
# -*- coding: utf-8 -*-
import re
import sys
from watchfiles.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

Some files were not shown because too many files have changed in this diff