first commit
commit 8484b0b882
1
.cursorignore
Normal file
@@ -0,0 +1 @@
venv/
26
.env
Normal file
@@ -0,0 +1,26 @@
# Flask configuration
FLASK_ENV=development
SECRET_KEY=dev-secret-key

# Zulip DB configuration
ZULIP_DB_URI=postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip

# ChromaDB configuration
CHROMA_DB_PATH=./chromadb

# Embedding model configuration
USE_NOMIC_EMBEDDINGS=true
OLLAMA_MODEL=nomic-embed-text
OLLAMA_HOST=http://localhost:11434

# AI model configuration
# OpenAI GPT-4o (new)
OPENAI_API_KEY=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

# Gemini API (legacy)
GEMINI_API_KEY=AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s

# Zulip Bot configuration
ZULIP_BOT_EMAIL=IT_bot-bot@zulip.lci.ge
ZULIP_BOT_API_KEY=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
ZULIP_SITE=https://zulip.lci.ge
79
README.md
Normal file
@@ -0,0 +1,79 @@
# Zulip Bot Service

This is a Zulip bot service that provides AI-powered responses to user queries.

## Running as a Service with PM2

The application can be set up to run as a background service using PM2, which ensures it starts automatically on system boot and restarts if it crashes.

### Prerequisites

1. Node.js and npm installed on your system
2. PM2 installed globally (`npm install -g pm2`)
3. Python 3.11+ and required dependencies

### Installation

1. Make sure all required environment variables are set in your `.env` file:
```
ZULIP_BOT_EMAIL=your-bot@example.com
ZULIP_BOT_API_KEY=your-api-key
ZULIP_SITE=https://your-zulip-instance.com
OPENAI_API_KEY=your-openai-api-key
```

2. Make the setup script executable:
```bash
chmod +x pm2_start.sh
```

3. Run the setup script:
```bash
./pm2_start.sh
```

4. The script will:
- Install PM2 if not already installed
- Start the bot as a background service
- Configure PM2 to run at system startup
- Provide instructions for any required sudo commands

### Managing the Service

- **Check status**: `pm2 status`
- **View logs**: `pm2 logs zulip-bot`
- **Restart**: `pm2 restart zulip-bot`
- **Stop**: `pm2 stop zulip-bot`
- **Start (if stopped)**: `pm2 start zulip-bot`

### Troubleshooting

If the service fails to start:

1. Check logs for errors: `pm2 logs zulip-bot`
2. Ensure all environment variables are properly set
3. Verify that the Flask app works by running it directly: `./run_app.sh`

## Manual Setup

If you prefer to run the bot without PM2:

1. Activate the virtual environment:
```bash
source venv/bin/activate
```

2. Run the Flask app:
```bash
./run_app.sh
```

## Development

For development purposes, you can run the Flask app in debug mode:

```bash
export FLASK_APP=app
export FLASK_DEBUG=1
flask run --port=5100
```
BIN
all_channels_sync_state.pickle
Normal file
Binary file not shown.
146
app/__init__.py
Normal file
@@ -0,0 +1,146 @@
"""
Main application entry point for the Zulip Bot application.
"""
import os
from flask import Flask, request, jsonify
from app.config import load_config

def create_app(config_name=None):
    """Create and configure the Flask application."""
    app = Flask(__name__)

    # Load configuration
    config = load_config(config_name)
    app.config.from_object(config)

    # Set DEBUG mode for the app
    app.config['DEBUG'] = True

    # Override any environment flags to disable safety filters
    os.environ['GEMINI_NO_SAFETY'] = 'true'

    # Apply NumPy compatibility patch for ChromaDB
    from app.utils import patch_chromadb_numpy
    patch_chromadb_numpy()

    # Initialize database connections
    from app.db import init_db
    init_db(app)

    # Check if we're in the main process or a Flask reloader worker
    # When Flask reloads in debug mode, it sets an environment variable
    # We only want to start services in the main process to avoid duplication
    is_flask_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true'
    is_main_process = not os.environ.get('WERKZEUG_RUN_MAIN')

    # Only start services in the main process or if --no-reload is used
    # This prevents duplicate services when using Flask's debug mode
    should_start_services = is_flask_reloader_process or is_main_process

    # Initialize message sync service and bot service regardless of process
    # but only start them in the appropriate process
    from app.utils.sync_service import MessageSyncService
    sync_service = MessageSyncService(sync_interval=60)  # Sync every 60 seconds

    # Store sync_service in app context so it can be accessed elsewhere
    app.sync_service = sync_service

    # Initialize Zulip bot service
    from app.utils.bot_service import ZulipBotService
    bot_service = ZulipBotService()

    # Store bot_service in app context so it can be accessed elsewhere
    app.bot_service = bot_service

    # Start the services in a better way (avoiding deprecated before_first_request)
    # But only if this is the main process or Flask reloader's main thread
    with app.app_context():
        # Add logging to help diagnose any issues
        app.logger.info(f"App initialization, should_start_services={should_start_services}, "
                        f"is_main_process={is_main_process}, is_flask_reloader_process={is_flask_reloader_process}")

        if should_start_services:
            # Start the sync service
            app.logger.info("Starting sync service...")
            sync_service.start()

            # Start the bot service and log the result
            app.logger.info("Starting Zulip bot service...")
            if bot_service.thread and bot_service.thread.is_alive():
                app.logger.info("Bot service is already running, not starting again")
            else:
                bot_service.start()
                app.logger.info("Bot service started successfully")
        else:
            app.logger.info("Skipping service startup in Flask reloader process")

    # Register a shutdown function to stop the services
    @app.teardown_appcontext
    def stop_services(exception=None):
        if hasattr(app, 'sync_service'):
            app.sync_service.stop()
        if hasattr(app, 'bot_service'):
            app.bot_service.stop()

    # Register blueprints
    # This will be implemented later

    @app.route('/health')
    def health_check():
        """Simple health check endpoint."""
        return jsonify({'status': 'ok'})

    @app.route('/sync/now')
    def trigger_sync():
        """Trigger an immediate sync."""
        if hasattr(app, 'sync_service'):
            app.sync_service.sync_now()
            return jsonify({'status': 'sync_triggered'})
        return jsonify({'status': 'error', 'message': 'Sync service not available'}), 500

    @app.route('/bot/status')
    def bot_status():
        """Get the status of the bot service."""
        if hasattr(app, 'bot_service') and app.bot_service.thread and app.bot_service.thread.is_alive():
            return jsonify({'status': 'running'})
        return jsonify({'status': 'stopped'})

    @app.route('/bot/start', methods=['POST'])
    def start_bot():
        """Start the bot service."""
        if hasattr(app, 'bot_service'):
            app.bot_service.start()
            return jsonify({'status': 'started'})
        return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500

    @app.route('/bot/stop', methods=['POST'])
    def stop_bot():
        """Stop the bot service."""
        if hasattr(app, 'bot_service'):
            app.bot_service.stop()
            return jsonify({'status': 'stopped'})
        return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500

    @app.route('/bot/test', methods=['POST'])
    def test_bot():
        """Send a test message to verify the bot is working."""
        if not hasattr(app, 'bot_service'):
            return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500

        data = request.get_json()
        if not data or 'recipient' not in data or 'content' not in data:
            return jsonify({'status': 'error', 'message': 'Missing required fields: recipient, content'}), 400

        result = app.bot_service.send_test_message(data['recipient'], data['content'])
        return jsonify({'status': 'sent', 'result': result})

    @app.route('/bot/reset-cache', methods=['POST'])
    def reset_bot_cache():
        """Reset the bot's message cache to fix issues with message processing."""
        if not hasattr(app, 'bot_service'):
            return jsonify({'status': 'error', 'message': 'Bot service not available'}), 500

        result = app.bot_service.reset_cache()
        return jsonify({'status': 'success', 'message': result})

    return app
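A quick way to exercise the routes registered above, as a sketch only: it assumes the `requests` package is installed and the app is already running locally on port 5100 (the port used in the README's development instructions); the recipient address is a placeholder.

```python
# Illustrative smoke test for the endpoints above; not part of this commit.
import requests

BASE = "http://localhost:5100"  # assumed local instance

print(requests.get(f"{BASE}/health").json())      # {'status': 'ok'}
print(requests.get(f"{BASE}/bot/status").json())  # {'status': 'running'} or {'status': 'stopped'}

# /bot/test requires both fields, otherwise the route returns HTTP 400.
resp = requests.post(f"{BASE}/bot/test",
                     json={"recipient": "user@example.com", "content": "ping"})
print(resp.status_code, resp.json())
```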
BIN
app/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
106
app/config/__init__.py
Normal file
@@ -0,0 +1,106 @@
"""
Configuration module for the application.

Loads environment variables and provides configuration values.
"""
import os
from dotenv import load_dotenv

# Load environment variables from .env file (if it exists)
load_dotenv()

class Config:
    """Configuration class for the application."""

    # Zulip API settings
    ZULIP_EMAIL = os.getenv("ZULIP_EMAIL", "IT_bot-bot@zulip.lci.ge")
    ZULIP_API_KEY = os.getenv("ZULIP_API_KEY", "ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z")
    ZULIP_SITE = os.getenv("ZULIP_SITE", "https://zulip.lci.ge")

    # Zulip database settings
    ZULIP_DB_HOST = os.getenv("ZULIP_DB_HOST", "zulip.lci.ge")
    ZULIP_DB_PORT = os.getenv("ZULIP_DB_PORT", "5432")
    ZULIP_DB_NAME = os.getenv("ZULIP_DB_NAME", "zulip")
    ZULIP_DB_USER = os.getenv("ZULIP_DB_USER", "zulip")
    ZULIP_DB_PASSWORD = os.getenv("ZULIP_DB_PASSWORD", "BlackMoonSky89")

    # Database URL
    SQLALCHEMY_DATABASE_URI = f"postgresql://{ZULIP_DB_USER}:{ZULIP_DB_PASSWORD}@{ZULIP_DB_HOST}:{ZULIP_DB_PORT}/{ZULIP_DB_NAME}"

    # ChromaDB settings
    CHROMADB_PATH = os.getenv("CHROMADB_PATH", "./chromadb")
    CHROMADB_COLLECTION = os.getenv("CHROMADB_COLLECTION", "zulip_messages")

    # Channels to monitor (IT Discussions, IT Knowledge, IT Support)
    CHANNELS_TO_MONITOR = [
        "IT Discussions",
        "IT Knowledge",
        "IT Support"
    ]

    # AI model settings
    # OpenAI settings (primary)
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
    OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")

    # Gemini API settings (legacy)
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s")

    # Embedding settings
    USE_NOMIC_EMBEDDINGS = os.getenv("USE_NOMIC_EMBEDDINGS", "False").lower() == "true"
    COHERE_API_KEY = os.getenv("COHERE_API_KEY", "4sCOTMgEg5rXeXU0XMmPeucSBMl5xd4FMhyV2UDW")

    # Ollama settings
    OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "nomic-embed-text")
    OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")

    # Flask settings
    SECRET_KEY = os.getenv("SECRET_KEY", "your_secret_key_here")
    DEBUG = os.getenv("DEBUG", "True").lower() == "true"

    # Bot settings
    BOT_NAME = "IT_Bot"
    BOT_TRIGGER = f"@**{BOT_NAME}**"

    # Rate limiting settings
    RATE_LIMIT_PERIOD = int(os.getenv("RATE_LIMIT_PERIOD", "60"))  # 60 seconds
    RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "10"))  # 10 requests per period

class DevelopmentConfig(Config):
    """Development configuration."""
    DEBUG = True

class ProductionConfig(Config):
    """Production configuration."""
    DEBUG = False

class TestingConfig(Config):
    """Testing configuration."""
    TESTING = True
    # Use a test database
    SQLALCHEMY_DATABASE_URI = os.getenv('TEST_SQLALCHEMY_DATABASE_URI', 'postgresql://zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip_test')
    # Use a test ChromaDB path
    CHROMADB_PATH = os.getenv('TEST_CHROMADB_PATH', './chromadb_test')

# Configuration dictionary
config_dict = {
    'development': DevelopmentConfig,
    'production': ProductionConfig,
    'testing': TestingConfig,
    'default': DevelopmentConfig
}

def load_config(config_name=None):
    """
    Load the appropriate configuration based on environment variables or the provided config_name.

    Args:
        config_name (str, optional): Name of the configuration to load. Defaults to None.

    Returns:
        Config: Configuration object
    """
    if not config_name:
        config_name = os.getenv('FLASK_ENV', 'default')

    return config_dict.get(config_name, config_dict['default'])
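For context, `load_config` feeds straight into `create_app` from `app/__init__.py`. A minimal entry point, purely hypothetical and not part of this commit, could look like the following (only `create_app` and the `.env` values come from the repository):

```python
# wsgi.py - hypothetical entry point; assumes the package is importable as `app`.
from app import create_app

# FLASK_ENV=development (set in .env) makes load_config pick DevelopmentConfig.
app = create_app()

if __name__ == "__main__":
    app.run(port=5100, debug=app.config["DEBUG"])
```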
BIN
app/config/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
105
app/db/__init__.py
Normal file
@@ -0,0 +1,105 @@
"""
Database module for the application.
Handles connections to PostgreSQL (Zulip DB) and ChromaDB.
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
import chromadb

# SQLAlchemy base class for models
Base = declarative_base()

# Global variables for SQLAlchemy
db_engine = None
db_session = None

# Global variable for ChromaDB
chroma_client = None
chroma_collection = None

def init_db(app):
    """
    Initialize database connections.

    Args:
        app: Flask application object
    """
    global db_engine, db_session, chroma_client, chroma_collection

    # Initialize SQLAlchemy engine and session
    db_engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
    db_session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=db_engine))

    # Set query property for models
    Base.query = db_session.query_property()

    # Initialize ChromaDB
    try:
        # Set allow_reset to True to prevent "Add of existing embedding ID" warnings
        chroma_client = chromadb.PersistentClient(
            path=app.config['CHROMADB_PATH'],
            settings=chromadb.Settings(
                allow_reset=True,
                anonymized_telemetry=False,
                is_persistent=True
            )
        )

        # Import here to avoid circular imports
        from app.db.chroma_service import CustomEmbeddingFunction

        # Create embedding function with setting from config
        try:
            # Always use Ollama since it's more reliable
            embedding_function = CustomEmbeddingFunction(use_nomic=False)

            # Get or create ChromaDB collection for Zulip messages with custom embedding function
            chroma_collection = chroma_client.get_or_create_collection(
                name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
                metadata={
                    "hnsw:space": "cosine",
                    "hnsw:allow_replace_deleted": True  # Allow replacing deleted vectors
                },
                embedding_function=embedding_function
            )
        except Exception as e:
            print(f"Error with embedding function: {e}")
            print("Creating collection without embedding function")
            # Create collection without embedding function
            chroma_collection = chroma_client.get_or_create_collection(
                name=app.config.get('CHROMADB_COLLECTION', 'zulip_messages'),
                metadata={
                    "hnsw:space": "cosine",
                    "hnsw:allow_replace_deleted": True  # Allow replacing deleted vectors
                }
            )
    except Exception as e:
        print(f"Critical error initializing ChromaDB: {e}")
        print("ChromaDB functionality will not be available")
        chroma_client = None
        chroma_collection = None

    # Register teardown function to remove database sessions
    @app.teardown_appcontext
    def shutdown_session(exception=None):
        """Remove the database session at the end of the request."""
        db_session.remove()

def get_db_session():
    """
    Get the current database session.

    Returns:
        SQLAlchemy session object
    """
    return db_session

def get_chroma_collection():
    """
    Get the ChromaDB collection for Zulip messages.

    Returns:
        ChromaDB collection object
    """
    return chroma_collection
BIN
app/db/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/db/__pycache__/chroma_service.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/db/__pycache__/zulip_service.cpython-311.pyc
Normal file
Binary file not shown.
418
app/db/chroma_service.py
Normal file
@@ -0,0 +1,418 @@
"""
Service for storing and retrieving embedded messages in ChromaDB.
"""
import json
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
import chromadb
from chromadb.utils import embedding_functions
from app.db import get_chroma_collection
from app.utils.embeddings import EmbeddingService
from app.utils.contextual_retrieval.context_service import ContextService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.config import Config
import logging

# Set up logging
logger = logging.getLogger("chroma_service")

class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """Custom embedding function using our EmbeddingService."""

    def __init__(self, use_nomic: bool = True):
        """
        Initialize the custom embedding function.

        Args:
            use_nomic: Whether to use Nomic (True) or Ollama (False) for embeddings
        """
        self.use_nomic = use_nomic

    def __call__(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of texts.

        Args:
            texts: List of texts to generate embeddings for

        Returns:
            List of embeddings as float arrays
        """
        return EmbeddingService.get_embeddings(texts, use_nomic=self.use_nomic)

class ChromaDBService:
    """Service for storing and retrieving embedded messages in ChromaDB."""

    # Use Ollama embeddings by default for reliability
    _embedding_function = CustomEmbeddingFunction(use_nomic=False)

    @staticmethod
    def format_message_content(content, channel_name, subject, sender_name, date_sent):
        """
        Format message content with metadata but without contextual enrichment.

        Args:
            content (str): Original message content
            channel_name (str): Name of the channel
            subject (str): Subject of the message
            sender_name (str): Name of the sender
            date_sent (datetime): Date the message was sent

        Returns:
            str: Formatted message content with basic metadata
        """
        # Format date in a readable format
        date_str = date_sent.strftime("%Y-%m-%d %H:%M:%S")

        # Replace None values with empty strings
        content = content or ""
        channel_name = channel_name or "Unknown Channel"
        subject = subject or "No Subject"
        sender_name = sender_name or "Unknown Sender"

        # Return plain content with minimal metadata prefix
        return f"Channel: {channel_name} | Subject: {subject} | Sent by: {sender_name} | Date: {date_str}\n\n{content}"

    @staticmethod
    def sanitize_metadata(metadata):
        """
        Sanitize metadata to ensure no None values.

        Args:
            metadata (dict): Metadata dictionary

        Returns:
            dict: Sanitized metadata with no None values
        """
        sanitized = {}
        for key, value in metadata.items():
            if value is None:
                if key == "channel":
                    sanitized[key] = "Unknown Channel"
                elif key == "subject":
                    sanitized[key] = "No Subject"
                elif key == "sender":
                    sanitized[key] = "Unknown Sender"
                elif key == "timestamp":
                    sanitized[key] = datetime.now().isoformat()
                else:
                    sanitized[key] = ""
            else:
                sanitized[key] = value
        return sanitized

    @staticmethod
    def add_message(message_id, content, channel_name, subject, sender_name, date_sent):
        """
        Add a message to the ChromaDB collection with contextual information.

        Args:
            message_id (str): ID of the message
            content (str): Content of the message
            channel_name (str): Name of the channel
            subject (str): Subject of the message
            sender_name (str): Name of the sender
            date_sent (datetime): Date the message was sent

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Check if message already exists to avoid duplicates
            if ChromaDBService.message_exists(message_id):
                logger.info(f"Message ID {message_id} already exists in ChromaDB, skipping")
                return True

            collection = get_chroma_collection()

            # Create metadata and sanitize to prevent None values
            metadata = {
                "channel": channel_name,
                "subject": subject,
                "sender": sender_name,
                "timestamp": date_sent.isoformat() if date_sent else datetime.now().isoformat(),
                "source": "zulip"
            }

            # Sanitize metadata to replace None values
            metadata = ChromaDBService.sanitize_metadata(metadata)

            # Format the content to include structured context information
            formatted_content = ChromaDBService.format_message_content(
                content, channel_name, subject, sender_name, date_sent
            )

            # Generate embeddings using our custom embedding function
            embeddings = ChromaDBService._embedding_function([formatted_content])

            # Add to ChromaDB
            collection.add(
                ids=[str(message_id)],
                documents=[formatted_content],
                metadatas=[metadata],
                embeddings=embeddings if embeddings else None
            )

            # Also add to BM25 index for hybrid search
            BM25Service.add_document(formatted_content, str(message_id))

            logger.info(f"Successfully added message ID {message_id} to ChromaDB")
            return True
        except Exception as e:
            logger.error(f"Error adding message to ChromaDB: {e}")
            return False

    @staticmethod
    def search_similar(query_text, n_results=5, filter_criteria=None, use_hybrid=True, _internal_call=False):
        """
        Search for similar messages in ChromaDB with improved contextual relevance.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_criteria (dict): Metadata filter criteria
            use_hybrid (bool): Whether to use hybrid search or just vector search
            _internal_call (bool): Internal parameter to prevent circular calls

        Returns:
            dict: Search results from ChromaDB
        """
        try:
            logger.info("Using temporary ChromaDB client to prevent duplicate embeddings")
            collection = get_chroma_collection()

            # If hybrid search is disabled or this is an internal call from HybridSearchService,
            # fall back to vector-only search to prevent circular references
            if not use_hybrid or _internal_call:
                try:
                    # Generate query embedding locally instead of using the collection's embedding function
                    query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]

                    # Perform search with embeddings using API directly to prevent collection modifications
                    # Create a temporary read-only client just for search to avoid modifying the main collection
                    temp_client = chromadb.PersistentClient(
                        path=Config.CHROMADB_PATH,
                        settings=chromadb.Settings(
                            anonymized_telemetry=False,
                            is_persistent=True,
                            allow_reset=False
                        )
                    )

                    # Get the existing collection without an embedding function
                    temp_collection = temp_client.get_collection(
                        name=Config.CHROMADB_COLLECTION or "zulip_messages"
                    )

                    # Perform search with embeddings
                    results = temp_collection.query(
                        query_embeddings=[query_embedding],
                        n_results=n_results,
                        where=filter_criteria,
                        include=["metadatas", "documents", "distances"]
                    )

                    # Close temporary client
                    del temp_client

                    return results

                except Exception as e:
                    logger.error(f"Error with vector search: {e}")
                    logger.info("Falling back to direct text query")

                    # Fallback to direct text query if embeddings fail
                    # But use a similar approach with a temporary client
                    try:
                        # Create temporary client just for search
                        temp_client = chromadb.PersistentClient(
                            path=Config.CHROMADB_PATH,
                            settings=chromadb.Settings(
                                anonymized_telemetry=False,
                                is_persistent=True,
                                allow_reset=False
                            )
                        )

                        # Get the existing collection without an embedding function
                        temp_collection = temp_client.get_collection(
                            name=Config.CHROMADB_COLLECTION or "zulip_messages"
                        )

                        # Use CustomEmbeddingFunction for just this query
                        from app.db.chroma_service import CustomEmbeddingFunction
                        embedding_func = CustomEmbeddingFunction(use_nomic=False)

                        # Get embedding for query
                        query_embedding = embedding_func([query_text])[0]

                        # Search using the embedding
                        results = temp_collection.query(
                            query_embeddings=[query_embedding],
                            n_results=n_results,
                            where=filter_criteria,
                            include=["metadatas", "documents", "distances"]
                        )

                        # Close temporary client
                        del temp_client

                        return results

                    except Exception as text_query_error:
                        logger.error(f"Error with text query: {text_query_error}")
                        # Last resort, just get all documents and do a simple text search
                        all_docs = collection.get(where=filter_criteria, include=["metadatas", "documents", "embeddings"])
                        # Return an empty result structure if no docs found
                        if not all_docs or not all_docs.get('ids'):
                            return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}
                        return {"ids": [all_docs['ids'][:n_results]],
                                "documents": [all_docs['documents'][:n_results]],
                                "metadatas": [all_docs['metadatas'][:n_results]],
                                "distances": [[1.0] * min(n_results, len(all_docs['ids']))]}

            # Use BM25 + vector search from hybrid search module
            # We're not calling it directly here to avoid circular imports
            try:
                from app.utils.contextual_retrieval.hybrid_search import HybridSearchService

                # Use hybrid search
                results = HybridSearchService.hybrid_search(
                    query=query_text,
                    n_results=n_results,
                    filter_criteria=filter_criteria,
                    rerank=True  # Enable reranking
                )

                # Convert to ChromaDB query result format
                formatted_results = {
                    'ids': [[doc['id'] for doc in results]],
                    'documents': [[doc['content'] for doc in results]],
                    'metadatas': [[doc.get('metadata', {}) for doc in results]],
                    'distances': [[1.0 - doc.get('combined_score', 0) for doc in results]]
                }

                return formatted_results
            except ImportError:
                logger.warning("Hybrid search module not available, falling back to vector search")
                # Fall back to vector search if hybrid search module not available

                # Create temporary client for search
                temp_client = chromadb.PersistentClient(
                    path=Config.CHROMADB_PATH,
                    settings=chromadb.Settings(
                        anonymized_telemetry=False,
                        is_persistent=True,
                        allow_reset=False
                    )
                )

                # Get the existing collection without an embedding function
                temp_collection = temp_client.get_collection(
                    name=Config.CHROMADB_COLLECTION or "zulip_messages"
                )

                # Generate embedding
                query_embedding = EmbeddingService.get_ollama_embeddings([query_text])[0]

                # Perform search
                results = temp_collection.query(
                    query_embeddings=[query_embedding],
                    n_results=n_results,
                    where=filter_criteria,
                    include=["metadatas", "documents", "distances"]
                )

                # Close temporary client
                del temp_client

                return results
        except Exception as e:
            logger.error(f"Error searching ChromaDB: {e}")
            # Return an empty result set rather than None
            return {"ids": [[]], "documents": [[]], "metadatas": [[]], "distances": [[]]}

    @staticmethod
    def delete_message(message_id):
        """
        Delete a message from ChromaDB.

        Args:
            message_id (str): ID of the message to delete

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            collection = get_chroma_collection()
            collection.delete(ids=[str(message_id)])

            # Also update BM25 index - for simplicity, we'll rebuild it from ChromaDB
            # In a production scenario, you might want a more efficient approach
            all_results = collection.get()
            if all_results and all_results['ids']:
                BM25Service.index_documents(all_results['documents'], all_results['ids'])

            return True
        except Exception as e:
            logger.error(f"Error deleting message from ChromaDB: {e}")
            return False

    @staticmethod
    def get_message_by_id(message_id):
        """
        Get a message from ChromaDB by ID.

        Args:
            message_id (str): ID of the message to retrieve

        Returns:
            dict: Message data or None if not found
        """
        try:
            collection = get_chroma_collection()
            result = collection.get(ids=[str(message_id)])

            if result['ids'] and len(result['ids']) > 0:
                return {
                    'id': result['ids'][0],
                    'content': result['documents'][0],
                    'metadata': result['metadatas'][0]
                }
            return None
        except RecursionError:
            logger.error(f"Recursion error when getting message ID {message_id} from ChromaDB")
            return None
        except Exception as e:
            logger.error(f"Error getting message from ChromaDB: {e}")
            return None

    @staticmethod
    def message_exists(message_id):
        """
        Check if a message exists in ChromaDB.

        Args:
            message_id (str): ID of the message to check

        Returns:
            bool: True if exists, False otherwise
        """
        try:
            collection = get_chroma_collection()
            result = collection.get(ids=[str(message_id)], include=[])

            return len(result['ids']) > 0
        except Exception as e:
            logger.error(f"Error checking if message exists in ChromaDB: {e}")
            return False

    @staticmethod
    def switch_embedding_method(use_nomic: bool):
        """
        Switch between Nomic and Ollama embedding methods.

        Args:
            use_nomic: Whether to use Nomic (True) or Ollama (False)
        """
        ChromaDBService._embedding_function = CustomEmbeddingFunction(use_nomic=use_nomic)
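A hedged usage sketch for the service above, assuming `init_db()` has already populated the global ChromaDB collection inside a Flask app; the message values and ID are invented for illustration.

```python
# Illustrative only: add one Zulip message and query it back with vector-only search.
from datetime import datetime
from app.db.chroma_service import ChromaDBService

ChromaDBService.add_message(
    message_id=12345,  # invented ID
    content="VPN drops every hour; restarting the client helps temporarily.",
    channel_name="IT Support",
    subject="VPN disconnects",
    sender_name="Example User",
    date_sent=datetime.now(),
)

results = ChromaDBService.search_similar("VPN keeps disconnecting", n_results=3, use_hybrid=False)
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, doc[:80])
```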
120
app/db/integration_service.py
Normal file
@@ -0,0 +1,120 @@
"""
Database integration service.
Combines functionality from both Zulip and ChromaDB services.
"""
from datetime import datetime
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService

class DatabaseIntegrationService:
    """
    Service for integrating between Zulip DB and ChromaDB.
    Handles the synchronization of messages from Zulip to ChromaDB.
    """

    @staticmethod
    def sync_messages_to_chromadb(days_ago=30, limit=1000):
        """
        Sync recent messages from Zulip to ChromaDB.

        Args:
            days_ago (int): Number of days to look back
            limit (int): Maximum number of messages to sync

        Returns:
            dict: Statistics about the sync operation
        """
        # Get messages from Zulip
        messages = ZulipDatabaseService.get_messages_from_it_channels(days_ago=days_ago, limit=limit)

        stats = {
            "total_messages": len(messages),
            "new_messages": 0,
            "already_existing": 0,
            "failed": 0
        }

        # Process each message
        for message in messages:
            # Check if message already exists in ChromaDB
            if ChromaDBService.message_exists(message.id):
                stats["already_existing"] += 1
                continue

            # Get channel name for the message
            channel_name = ZulipDatabaseService.get_channel_name_for_message(message)

            # Get sender name (we don't have that information readily available from the query)
            # In a real implementation, we would join with the UserProfile table
            sender_name = f"User ID: {message.sender_id}"

            # Add message to ChromaDB
            success = ChromaDBService.add_message(
                message_id=message.id,
                content=message.content,
                channel_name=channel_name,
                subject=message.subject,
                sender_name=sender_name,
                date_sent=message.date_sent
            )

            if success:
                stats["new_messages"] += 1
            else:
                stats["failed"] += 1

        return stats

    @staticmethod
    def search_knowledge_base(query_text, n_results=5, filter_channel=None, use_hybrid=True, use_reranking=True):
        """
        Search for messages in the knowledge base using hybrid search.

        Args:
            query_text (str): Text to search for
            n_results (int): Number of results to return
            filter_channel (str): Optional channel name to filter results
            use_hybrid (bool): Whether to use hybrid search or just vector search
            use_reranking (bool): Whether to apply reranking to the results

        Returns:
            list: List of search results
        """
        # Prepare filter criteria
        filter_criteria = None
        if filter_channel:
            filter_criteria = {"channel": filter_channel}

        # Decide which search method to use
        if use_hybrid:
            # Use the hybrid search service
            results = HybridSearchService.hybrid_search(
                query=query_text,
                n_results=n_results,
                filter_criteria=filter_criteria,
                rerank=use_reranking
            )

            return results
        else:
            # Use the standard ChromaDB search
            results = ChromaDBService.search_similar(
                query_text=query_text,
                n_results=n_results,
                filter_criteria=filter_criteria,
                use_hybrid=False
            )

            # Format results
            formatted_results = []
            if results and results['ids'] and len(results['ids'][0]) > 0:
                for i in range(len(results['ids'][0])):
                    formatted_results.append({
                        'id': results['ids'][0][i],
                        'content': results['documents'][0][i],
                        'metadata': results['metadatas'][0][i],
                        'score': results['distances'][0][i] if 'distances' in results else None
                    })

            return formatted_results
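And a sketch of how the integration service above might be driven, with hypothetical query values; it requires a working Zulip database connection and an initialized ChromaDB collection, and the exact shape of the hybrid-search results comes from a module not shown in this commit.

```python
# Illustrative only: pull the last week of IT-channel messages into ChromaDB,
# then query the knowledge base restricted to one channel.
from app.db.integration_service import DatabaseIntegrationService

stats = DatabaseIntegrationService.sync_messages_to_chromadb(days_ago=7, limit=200)
print(stats)  # e.g. {'total_messages': ..., 'new_messages': ..., 'already_existing': ..., 'failed': ...}

hits = DatabaseIntegrationService.search_knowledge_base(
    "printer shows offline after the last update",
    n_results=5,
    filter_channel="IT Support",
)
for hit in hits:
    print(hit)
```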
209
app/db/zulip_service.py
Normal file
@@ -0,0 +1,209 @@
"""
Service for querying messages from the Zulip database.
"""
from datetime import datetime, timedelta
from sqlalchemy import and_, or_
from app.db import get_db_session
from app.models.zulip import Message, Stream, Recipient, UserProfile, IT_RECIPIENT_IDS

class ZulipDatabaseService:
    """Service for querying messages from the Zulip database."""

    @staticmethod
    def get_messages_from_it_channels(days_ago=None, limit=1000, since=None):
        """
        Get recent messages from IT channels.

        Args:
            days_ago (int): Number of days to look back (optional)
            limit (int): Maximum number of messages to return
            since (datetime): Get messages after this datetime (optional)

        Returns:
            list: List of Message objects
        """
        session = get_db_session()

        # Build the query based on parameters
        query = session.query(Message).filter(
            Message.recipient_id.in_(IT_RECIPIENT_IDS)
        )

        # Add date filter if specified
        if since:
            query = query.filter(Message.date_sent >= since)
        elif days_ago:
            start_date = datetime.now() - timedelta(days=days_ago)
            query = query.filter(Message.date_sent >= start_date)

        # Get results
        messages = query.order_by(Message.id.desc()).limit(limit).all()

        return messages

    @staticmethod
    def get_messages_newer_than_id(message_id, limit=100):
        """
        Get messages with ID greater than the specified ID.

        Args:
            message_id (int): Get messages with ID greater than this
            limit (int): Maximum number of messages to return

        Returns:
            list: List of Message objects
        """
        session = get_db_session()

        messages = session.query(Message).filter(
            and_(
                Message.recipient_id.in_(IT_RECIPIENT_IDS),
                Message.id > message_id
            )
        ).order_by(Message.id.asc()).limit(limit).all()

        return messages

    @staticmethod
    def get_message_by_id(message_id):
        """
        Get a specific message by ID.

        Args:
            message_id (int): ID of the message to retrieve

        Returns:
            Message: Message object or None if not found
        """
        session = get_db_session()
        return session.query(Message).filter(Message.id == message_id).first()

    @staticmethod
    def search_messages(search_term, days_ago=365, limit=100):
        """
        Search for messages containing a specific term.

        Args:
            search_term (str): Term to search for
            days_ago (int): Number of days to look back
            limit (int): Maximum number of messages to return

        Returns:
            list: List of Message objects matching the search
        """
        session = get_db_session()
        start_date = datetime.now() - timedelta(days=days_ago)

        # Use the tsquery system if available, otherwise fall back to LIKE
        messages = session.query(Message).filter(
            and_(
                Message.recipient_id.in_(IT_RECIPIENT_IDS),
                Message.date_sent >= start_date,
                or_(
                    Message.content.ilike(f'%{search_term}%'),
                    Message.subject.ilike(f'%{search_term}%')
                )
            )
        ).order_by(Message.date_sent.desc()).limit(limit).all()

        return messages

    @staticmethod
    def get_channel_name_for_message(message):
        """
        Get the channel name for a message.

        Args:
            message (Message): Message object

        Returns:
            str: Channel name or "Unknown Channel" if not found
        """
        session = get_db_session()

        try:
            if not message or not message.recipient_id:
                return "Unknown Channel"

            # First, get the recipient to determine type
            recipient = session.query(Recipient).filter(
                Recipient.id == message.recipient_id
            ).first()

            if not recipient:
                return "Unknown Channel"

            # Check recipient type (1 = stream, 2 = user, 3 = huddle)
            if recipient.type != 1:
                # For direct messages or huddles
                return "Direct Message" if recipient.type == 2 else "Group Message"

            # For stream messages, get the stream name
            stream = session.query(Stream).filter(
                Stream.recipient_id == message.recipient_id
            ).first()

            # Return the name or a default value
            return stream.name if stream and stream.name else "Unknown Channel"
        except Exception as e:
            # Log the error but don't crash - return a default value
            print(f"Error getting channel name for message {message.id if message else 'unknown'}: {e}")
            return "Unknown Channel"

    @staticmethod
    def get_sender_name_for_message(message):
        """
        Get the sender name for a message.

        Args:
            message (Message): Message object

        Returns:
            str: Sender full name or 'Unknown User' if not found
        """
        session = get_db_session()

        try:
            if not message or not message.sender_id:
                return "Unknown User"

            user = session.query(UserProfile).filter(
                UserProfile.id == message.sender_id
            ).first()

            return user.full_name if user and user.full_name else "Unknown User"
        except Exception as e:
            # Log the error but don't crash - return a default value
            print(f"Error getting sender name for message {message.id if message else 'unknown'}: {e}")
            return "Unknown User"

    @staticmethod
    def count_messages_up_to_id(message_id, since=None):
        """
        Count messages with ID less than or equal to the specified ID.

        Args:
            message_id (int): Count messages with ID <= this
            since (datetime): Only count messages after this datetime (optional)

        Returns:
            int: Count of messages
        """
        session = get_db_session()

        # Build the query
        query = session.query(Message).filter(
            and_(
                Message.recipient_id.in_(IT_RECIPIENT_IDS),
                Message.id <= message_id
            )
        )

        # Add date filter if specified
        if since:
            query = query.filter(Message.date_sent >= since)

        # Count the messages
        count = query.count()

        return count
10
app/models/__init__.py
Normal file
@@ -0,0 +1,10 @@
"""
Models module for the application.
Contains SQLAlchemy model definitions for Zulip database tables.
"""
from app.db import Base

# Import models to make them available through the models module
from app.models.zulip import Recipient, Stream, Message, UserProfile, IT_CHANNELS, IT_RECIPIENT_IDS

# This will be populated as we define models in the next steps
BIN
app/models/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/models/__pycache__/zulip.cpython-311.pyc
Normal file
Binary file not shown.
96
app/models/zulip.py
Normal file
@@ -0,0 +1,96 @@
"""
SQLAlchemy models for the Zulip database tables.
"""
from sqlalchemy import Column, Integer, String, Text, Boolean, SmallInteger, DateTime, ForeignKey, BigInteger
from sqlalchemy.orm import relationship
from app.db import Base

class Recipient(Base):
    """
    Model for zerver_recipient table in Zulip DB.
    Recipients can be of different types (e.g., stream, user, huddle).
    """
    __tablename__ = 'zerver_recipient'
    __table_args__ = {'schema': 'zulip'}

    id = Column(Integer, primary_key=True)
    type_id = Column(Integer)
    type = Column(SmallInteger)  # 1 for stream, 2 for user, 3 for huddle

    # Relationships
    messages = relationship("Message", back_populates="recipient")
    stream = relationship("Stream", back_populates="recipient", uselist=False)

class Stream(Base):
    """
    Model for zerver_stream table in Zulip DB.
    Represents a Zulip channel (called stream in Zulip terminology).
    """
    __tablename__ = 'zerver_stream'
    __table_args__ = {'schema': 'zulip'}

    id = Column(BigInteger, primary_key=True)
    name = Column(String)
    date_created = Column(DateTime)
    deactivated = Column(Boolean)
    description = Column(String)
    rendered_description = Column(Text)
    invite_only = Column(Boolean)
    recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
    realm_id = Column(Integer)

    # Relationships
    recipient = relationship("Recipient", back_populates="stream")

class Message(Base):
    """
    Model for zerver_message table in Zulip DB.
    Represents a message sent in Zulip.
    """
    __tablename__ = 'zerver_message'
    __table_args__ = {'schema': 'zulip'}

    id = Column(Integer, primary_key=True)
    sender_id = Column(Integer, ForeignKey('zulip.zerver_userprofile.id'))
    recipient_id = Column(Integer, ForeignKey('zulip.zerver_recipient.id'))
    subject = Column(String)
    content = Column(Text)
    rendered_content = Column(Text)
    date_sent = Column(DateTime)
    type = Column(SmallInteger)  # 1 for stream message, 2 for private message
    has_attachment = Column(Boolean)
    has_image = Column(Boolean)
    has_link = Column(Boolean)
    is_channel_message = Column(Boolean)
    realm_id = Column(Integer)

    # Relationships
    sender = relationship("UserProfile", back_populates="messages")
    recipient = relationship("Recipient", back_populates="messages")

class UserProfile(Base):
    """
    Model for zerver_userprofile table in Zulip DB.
    Represents a Zulip user.
    """
    __tablename__ = 'zerver_userprofile'
    __table_args__ = {'schema': 'zulip'}

    id = Column(Integer, primary_key=True)
    email = Column(String)
    full_name = Column(String)
    is_active = Column(Boolean)
    realm_id = Column(Integer)

    # Relationships
    messages = relationship("Message", back_populates="sender")

# Constants for the channels we're monitoring
IT_CHANNELS = {
    "IT Discussions": 5,   # id = 5, recipient_id = 16
    "IT Knowledge": 17,    # id = 17, recipient_id = 47
    "IT Support": 16       # id = 16, recipient_id = 43
}

# Recipient IDs for the channels we're monitoring
IT_RECIPIENT_IDS = [16, 47, 43]
33
app/utils/__init__.py
Normal file
@@ -0,0 +1,33 @@
"""
Utilities module for the application.
Contains helper functions and utilities for the application.
"""
import importlib
import sys
import numpy as np

def patch_chromadb_numpy():
    """
    Patch ChromaDB to use np.nan instead of np.NaN for NumPy 2.0 compatibility.

    This function uses monkey patching to replace the old np.NaN reference in the
    brute_force_index.py file of ChromaDB with the new np.nan (lowercase).
    """
    try:
        # Get the module where the error occurs
        from chromadb.segment.impl.vector import brute_force_index

        # Patch the module to use np.nan instead of np.NaN
        if not hasattr(np, 'NaN'):
            np.NaN = np.nan

        print("NumPy compatibility patch applied for ChromaDB")
        return True
    except ImportError:
        print("Could not patch ChromaDB: module not found")
        return False
    except Exception as e:
        print(f"Error patching ChromaDB: {e}")
        return False

# This module will be populated with utility functions in later steps
BIN
app/utils/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/utils/__pycache__/ai_service.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/utils/__pycache__/bot_service.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/utils/__pycache__/embeddings.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/utils/__pycache__/sync_service.cpython-311.pyc
Normal file
Binary file not shown.
372
app/utils/ai_service.py
Normal file
@@ -0,0 +1,372 @@
"""
AI service for OpenAI API integration.

This module provides a class for generating responses using the OpenAI API.
It handles authentication, prompt engineering, error handling, and retries.
"""

import os
import time
import logging
import hashlib
import functools
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
from openai import OpenAI

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("ai_service")

# Simple in-memory cache for responses
RESPONSE_CACHE = {}
CACHE_TTL = 3600  # 1 hour in seconds

class OpenAIService:
    """Service for generating responses using the OpenAI API."""

    def __init__(self, api_key: Optional[str] = None,
                 model_name: str = "gpt-4o",
                 enable_cache: bool = True,
                 cache_ttl: int = CACHE_TTL,
                 rate_limit: int = 60):  # 60 requests per minute
        """
        Initialize the OpenAI service.

        Args:
            api_key: API key for OpenAI. If None, uses OPENAI_API_KEY environment variable.
            model_name: Name of the OpenAI model to use.
            enable_cache: Whether to enable response caching.
            cache_ttl: Time-to-live for cached responses in seconds.
            rate_limit: Maximum number of requests allowed per minute.
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key not provided. Set OPENAI_API_KEY environment variable or pass api_key parameter.")

        self.model_name = model_name
        self.enable_cache = enable_cache
        self.cache_ttl = cache_ttl
        self.rate_limit = rate_limit

        # Rate limiting state
        self.request_timestamps = []

        # Configure OpenAI API
        self.client = OpenAI(api_key=self.api_key)

        logger.info(f"Initialized OpenAIService with model: {model_name}")

    def _check_rate_limit(self):
        """
        Check if the rate limit has been reached.
        Waits if necessary to stay within the rate limit.
        """
        current_time = time.time()
        # Remove timestamps older than 60 seconds
        self.request_timestamps = [ts for ts in self.request_timestamps if current_time - ts < 60]

        # Check if we've reached the rate limit
        if len(self.request_timestamps) >= self.rate_limit:
            # Calculate how long to wait
            oldest_timestamp = min(self.request_timestamps)
            sleep_time = 60 - (current_time - oldest_timestamp)
            if sleep_time > 0:
                logger.warning(f"Rate limit reached. Waiting {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)

        # Add current timestamp to the list
        self.request_timestamps.append(time.time())

    def _detect_language(self, text: str) -> str:
        """
        Detect the language of a text string.

        Args:
            text: The text to detect the language of.

        Returns:
            A language code, e.g. 'en' for English, 'ka' for Georgian.
        """
        try:
            # Use a very small prompt to detect language
            if not text:
                return 'en'  # Default to English for empty text

            # Simple language detection using a dedicated small request
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a language detection service. Respond with only the ISO language code ('en' for English, 'ka' for Georgian, etc.)."},
                    {"role": "user", "content": f"Detect the language of this text: {text[:100]}"}
                ],
                max_tokens=10,
                temperature=0
            )

            language_code = response.choices[0].message.content.strip().lower()
            logger.info(f"Detected language: {language_code}")

            # Validate and default to English for any issues
            if language_code not in ['en', 'ka']:
                return 'en'

            return language_code
        except Exception as e:
            logger.error(f"Error detecting language: {e}")
            return 'en'  # Default to English on error

    def _generate_cache_key(self, query: str, context: List[Dict[str, Any]]) -> str:
        """
        Generate a cache key for the query and context.

        Args:
            query: The query string.
            context: The context documents.

        Returns:
            A string hash key for caching.
        """
        # Create a string representation of the context
        context_str = ""
        for doc in context:
            if 'content' in doc:
                context_str += doc['content'][:100]  # Use just the beginning for performance

        # Create a hash of the query and context
        key_str = query + context_str
        return hashlib.md5(key_str.encode('utf-8')).hexdigest()

    def _get_cached_response(self, cache_key: str) -> Optional[str]:
        """
        Get a cached response if available and not expired.

        Args:
            cache_key: The cache key.

        Returns:
            The cached response, or None if not found or expired.
        """
        if not self.enable_cache:
            return None

        if cache_key in RESPONSE_CACHE:
            timestamp, response = RESPONSE_CACHE[cache_key]

            # Check if the cache entry has expired
            if time.time() - timestamp < self.cache_ttl:
                logger.info("Using cached response")
                return response

            # Remove expired cache entry
            del RESPONSE_CACHE[cache_key]

        return None

    def _cache_response(self, cache_key: str, response: str):
        """
        Cache a response.

        Args:
            cache_key: The cache key.
            response: The response to cache.
        """
        if not self.enable_cache:
            return

        RESPONSE_CACHE[cache_key] = (time.time(), response)

        # Clean up expired cache entries if cache is getting large
        if len(RESPONSE_CACHE) > 1000:  # Arbitrary limit
            self._cleanup_cache()

    def _cleanup_cache(self):
        """Clean up expired cache entries."""
        current_time = time.time()
        keys_to_delete = []

        for key, (timestamp, _) in RESPONSE_CACHE.items():
            if current_time - timestamp >= self.cache_ttl:
                keys_to_delete.append(key)

        for key in keys_to_delete:
            del RESPONSE_CACHE[key]

        logger.info(f"Cleaned up {len(keys_to_delete)} expired cache entries")

    def generate_response(self, query: str, context: List[Dict[str, Any]],
                          max_retries: int = 3, temperature: float = 0.7) -> str:
        """
        Generate a response using the OpenAI API.

        Args:
            query: The user's query.
            context: A list of relevant context documents from ChromaDB.
                Each document should be a dict with 'content' and 'metadata' keys.
            max_retries: Maximum number of retry attempts for API failures.
            temperature: Controls randomness in the response. Lower is more deterministic.

        Returns:
            The generated response text.
        """
        # Check rate limit
        self._check_rate_limit()

        # Detect language
        language = self._detect_language(query)

        # Check cache
        cache_key = self._generate_cache_key(query, context)
        cached_response = self._get_cached_response(cache_key)
        if cached_response:
            return cached_response

        # Construct the messages using the context
        messages = self._construct_messages(query, context, language)

        # Try to generate response with retries
        retry_count = 0
        while retry_count <= max_retries:
            try:
                logger.info(f"Attempting to generate response (attempt {retry_count+1}/{max_retries+1})")

                # Generate with OpenAI API
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=4096,
                    top_p=0.8
                )

                # Extract the response text
                response_text = response.choices[0].message.content

                # Cache the response
                self._cache_response(cache_key, response_text)

                # Return the response text
                return response_text

            except Exception as e:
                retry_count += 1
                wait_time = 2 ** retry_count  # Exponential backoff

                # Log more details about the error
                logger.error(f"API call error: {type(e).__name__}: {str(e)}")

                if retry_count <= max_retries:
                    logger.warning(f"API call failed: {str(e)}. Retrying in {wait_time} seconds. (Attempt {retry_count}/{max_retries})")
                    time.sleep(wait_time)
                else:
                    logger.error(f"API call failed after {max_retries} retries: {str(e)}")
                    # Return a simple response when all retries fail
                    return f"I'm here to help with IT questions. However, I'm currently experiencing technical difficulties. Please try again later or contact IT support directly."

    def _construct_messages(self, query: str, context: List[Dict[str, Any]], language: str = 'en') -> List[Dict[str, str]]:
        """
        Construct message list with the query and context.

        Args:
            query: The user's query.
            context: A list of relevant context documents from ChromaDB.
            language: The language code (e.g., 'en', 'ka').

        Returns:
            List of message dictionaries for the OpenAI API.
        """
        # System instruction based on language
        if language == 'ka':
|
||||
system_instruction = """როგორც IT_Bot, თქვენი როლია ორგანიზაციაში IT ტექნიკური დახმარების გაწევა:
|
||||
|
||||
## როლი და მიზანი:
|
||||
- თქვენ ხართ ორგანიზაციის IT დახმარების ბოტი, რომელიც ეხმარება თანამშრომლებს ტექნიკური საკითხების გადაჭრაში.
|
||||
- გამოიყენეთ მოცემული ისტორიული კონტექსტი ზუსტი და სასარგებლო პასუხების გასაცემად.
|
||||
- როდესაც კონტექსტი ამბობს რომ რაიმე პრობლემა შეიძლება არსებობდეს, ჩათვალეთ რომ ეს მართლაც პრობლემაა.
|
||||
|
||||
## პასუხების მიდგომა:
|
||||
1. გამოიყენეთ მოცემული კონტექსტი პასუხების შესაქმნელად. თუ კონტექსტში მოცემულია კონკრეტული IT საკითხები და მათი გადაწყვეტა, გამოიყენეთ ეს ინფორმაცია.
|
||||
2. თუ კონტექსტი შეიცავს ინფორმაციას მსგავსი პრობლემის შესახებ, გააანალიზეთ, როგორ გადაიჭრა ეს პრობლემა წარსულში.
|
||||
3. მითითებები და ცოდნა მოცემული კონტექსტიდან პრიორიტეტული უნდა იყოს ზოგად ცოდნასთან შედარებით.
|
||||
4. თუ კითხვა არ უკავშირდება IT თემებს, მიუთითეთ მომხმარებელს, რომ დაუკავშირდეს IT მხარდაჭერას.
|
||||
5. დეტალური, ნაბიჯ-ნაბიჯ ინსტრუქციები მიაწოდეთ, როცა სთხოვენ ტექნიკური პრობლემის გადაჭრას.
|
||||
|
||||
## პასუხის ფორმატი:
|
||||
- მკაფიო, ზუსტი და კონკრეტული პასუხები გაეცით.
|
||||
- პასუხები დააფორმატეთ ადვილად წასაკითხად, გამოიყენეთ პუნქტები და ქვესათაურები, როცა საჭიროა.
|
||||
- მიაწოდეთ კონკრეტული ბრძანებები, კოდის მაგალითები ან ინსტრუქციები, როცა საჭიროა.
|
||||
- არ გამოიყენოთ [Reference X] ფორმატი პასუხებში - ინფორმაცია პირდაპირ ჩასვით პასუხში წყაროზე მითითების გარეშე."""
|
||||
else: # Default to English
|
||||
system_instruction = """As IT_Bot, your role is to provide technical IT support within the organization:
|
||||
|
||||
## Role and Purpose:
|
||||
- You are an IT support bot for the organization, helping employees resolve technical issues.
|
||||
- Use the provided historical context to give accurate and helpful responses.
|
||||
- When context mentions that there may be an issue with something, assume there is an issue.
|
||||
|
||||
## Response Approach:
|
||||
1. Use the provided context to craft your answers. If the context contains specific IT issues and resolutions, use that information.
|
||||
2. If the context contains information about similar problems, analyze how the problem was resolved in the past.
|
||||
3. Guidance and knowledge from the provided context should take precedence over general knowledge.
|
||||
4. If a question is unrelated to IT topics, direct the user to contact IT support.
|
||||
5. Provide detailed, step-by-step instructions when asked about resolving a technical issue.
|
||||
|
||||
## Response Format:
|
||||
- Respond with clear, precise, and specific answers.
|
||||
- Format answers for easy reading, using bullet points and subheadings when appropriate.
|
||||
- Provide specific commands, code examples, or instructions when relevant.
|
||||
- IMPORTANT: DO NOT use reference numbers like [Reference X] in your responses. Instead, directly incorporate the relevant information into your answer without citing sources."""
|
||||
|
||||
# Process the context data
|
||||
context_text = ""
|
||||
if context:
|
||||
# Sort context by relevance (assuming they're already in relevance order)
|
||||
context_text = "Reference information from IT knowledge base:\n\n"
|
||||
|
||||
for i, doc in enumerate(context):
|
||||
if 'content' in doc:
|
||||
# Create a more structured reference entry
|
||||
content = doc['content']
|
||||
|
||||
# Build a descriptive reference header with metadata
|
||||
ref_details = []
|
||||
if 'metadata' in doc and doc['metadata']:
|
||||
metadata = doc['metadata']
|
||||
if 'subject' in metadata and metadata['subject']:
|
||||
ref_details.append(f"Topic: {metadata['subject']}")
|
||||
if 'channel' in metadata and metadata['channel']:
|
||||
ref_details.append(f"Channel: {metadata['channel']}")
|
||||
if 'sender' in metadata and metadata['sender']:
|
||||
ref_details.append(f"From: {metadata['sender']}")
|
||||
if 'timestamp' in metadata and metadata['timestamp']:
|
||||
try:
|
||||
# Try to format the timestamp in a more readable way
|
||||
date_str = metadata['timestamp'][:10] # Just use the date part
|
||||
ref_details.append(f"Date: {date_str}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Create a detailed reference header with all the metadata
|
||||
ref_header = f"Context {i+1}"
|
||||
if ref_details:
|
||||
ref_header += f": {' | '.join(ref_details)}"
|
||||
|
||||
# Format each reference entry
|
||||
context_text += f"[{ref_header}]\n{content}\n\n"
|
||||
|
||||
# Create messages array for the chat completions API
|
||||
messages = [
|
||||
{"role": "system", "content": system_instruction}
|
||||
]
|
||||
|
||||
# Add context as a separate message from the system if available
|
||||
if context_text:
|
||||
messages.append({"role": "system", "content": context_text})
|
||||
|
||||
# Add the user query
|
||||
messages.append({"role": "user", "content": query})
|
||||
|
||||
return messages
|
||||
|
||||
# For backwards compatibility, provide GeminiService as an alias for OpenAIService
|
||||
GeminiService = OpenAIService
|
||||
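For reference, a minimal usage sketch of the service above (it is consumed the same way by `ZulipBotService` further down). This assumes the class can be constructed without arguments, mirroring how `GeminiService()` is instantiated below, and that `OPENAI_API_KEY` is available in the environment; the `context` entries follow the `content`/`metadata` shape documented in `generate_response`.

```python
# Sketch only: assumes OPENAI_API_KEY is set and OpenAIService() needs no arguments,
# matching how ZulipBotService constructs it below. The example data is made up.
from app.utils.ai_service import OpenAIService

service = OpenAIService()

context = [
    {
        "content": "VPN drops were traced to an expired client certificate; reissuing it fixed them.",
        "metadata": {"subject": "VPN disconnects", "channel": "it-support", "sender": "ops"},
    }
]

answer = service.generate_response("Why does the VPN keep disconnecting?", context)
print(answer)
```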
402
app/utils/bot_service.py
Normal file
@ -0,0 +1,402 @@
|
||||
"""
|
||||
Zulip bot service for handling interactions with Zulip.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import hashlib
|
||||
import tempfile
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
import zulip
|
||||
from app.db.chroma_service import ChromaDBService
|
||||
from app.utils.ai_service import GeminiService
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("bot_service")
|
||||
|
||||
class ZulipBotService:
|
||||
"""Service for handling Zulip bot interactions."""
|
||||
|
||||
# Singleton instance
|
||||
_instance = None
|
||||
_lock = threading.Lock()
|
||||
_process_id = os.getpid() # Store the process ID when this module is loaded
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
with cls._lock:
|
||||
current_pid = os.getpid()
|
||||
if cls._instance is None or cls._process_id != current_pid:
|
||||
logger.info(f"Creating new ZulipBotService singleton instance for process {current_pid}")
|
||||
cls._instance = super(ZulipBotService, cls).__new__(cls)
|
||||
cls._instance._initialized = False
|
||||
cls._process_id = current_pid # Update the stored process ID
|
||||
return cls._instance
|
||||
|
||||
def __init__(self,
|
||||
email: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
site: Optional[str] = None,
|
||||
chroma_service: Optional[ChromaDBService] = None,
|
||||
ai_service: Optional[GeminiService] = None):
|
||||
"""Initialize the Zulip bot service."""
|
||||
with self._lock:
|
||||
# Skip initialization if already initialized (singleton pattern)
|
||||
if self._initialized:
|
||||
return
|
||||
|
||||
# Load config from environment variables if not provided
|
||||
self.email = email or os.getenv("ZULIP_BOT_EMAIL")
|
||||
self.api_key = api_key or os.getenv("ZULIP_BOT_API_KEY")
|
||||
self.site = site or os.getenv("ZULIP_SITE")
|
||||
|
||||
if not all([self.email, self.api_key, self.site]):
|
||||
raise ValueError("Missing Zulip configuration. Set ZULIP_BOT_EMAIL, ZULIP_BOT_API_KEY, and ZULIP_SITE env variables.")
|
||||
|
||||
# Initialize Zulip client
|
||||
self.client = zulip.Client(
|
||||
email=self.email,
|
||||
api_key=self.api_key,
|
||||
site=self.site
|
||||
)
|
||||
|
||||
# Initialize services
|
||||
self.chroma_service = chroma_service or ChromaDBService()
|
||||
self.ai_service = ai_service or GeminiService()
|
||||
|
||||
# Thread for message handling
|
||||
self.thread = None
|
||||
self.running = False
|
||||
|
||||
# Simple set to track processed message IDs
|
||||
self.processed_message_ids = set()
|
||||
|
||||
# Bot identification pattern - exact match for IT_Bot mention in Zulip format
|
||||
self.bot_mention_pattern = re.compile(r'@\*\*IT_Bot\*\*')
|
||||
|
||||
# Default response for empty queries
|
||||
self.default_response = "Hello. If you have a technical question, please ask. If you require assistance with non-technical matters, please contact IT support."
|
||||
|
||||
# Track backoff state for rate limiting
|
||||
self._backoff_time = 1 # Start with 1 second backoff
|
||||
self._consecutive_rate_limit_errors = 0
|
||||
self._max_backoff_time = 60 # Maximum backoff of 60 seconds
|
||||
|
||||
# Mark as initialized
|
||||
self._initialized = True
|
||||
|
||||
logger.info("Initialized ZulipBotService")
|
||||
|
||||
def start(self):
|
||||
"""Start the bot service in a separate thread."""
|
||||
with self._lock:
|
||||
if self.thread and self.thread.is_alive():
|
||||
logger.warning("Bot service is already running")
|
||||
return
|
||||
|
||||
self.running = True
|
||||
self.thread = threading.Thread(target=self._message_loop)
|
||||
self.thread.daemon = True
|
||||
self.thread.start()
|
||||
logger.info("Started ZulipBotService")
|
||||
|
||||
def stop(self):
|
||||
"""Stop the bot service."""
|
||||
with self._lock:
|
||||
if not self.thread or not self.thread.is_alive():
|
||||
logger.warning("Bot service is not running")
|
||||
return
|
||||
|
||||
self.running = False
|
||||
self.thread.join(timeout=5.0)
|
||||
logger.info("Stopped ZulipBotService")
|
||||
|
||||
def _message_loop(self):
|
||||
"""Main message handling loop."""
|
||||
# How far back to check for mentions (in seconds)
|
||||
# Default to 60 seconds, but can be adjusted
|
||||
lookback_period = 60
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
# Get messages that mention the bot
|
||||
new_messages = self._check_for_mentions(lookback_period)
|
||||
|
||||
# Process new messages
|
||||
for message in new_messages:
|
||||
self._process_message(message)
|
||||
# Add a small delay between processing messages
|
||||
time.sleep(0.5)
|
||||
|
||||
# Clean up old processed message IDs periodically
|
||||
if len(self.processed_message_ids) > 1000:
|
||||
self.processed_message_ids = set(list(self.processed_message_ids)[-1000:])
|
||||
|
||||
# Wait before checking again (reduces API usage)
|
||||
time.sleep(5.0)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in message loop: {str(e)}")
|
||||
# Apply backoff on errors to avoid hammering the API
|
||||
if "API usage exceeded rate limit" in str(e):
|
||||
self._consecutive_rate_limit_errors += 1
|
||||
backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
|
||||
logger.info(f"Rate limit hit, backing off for {backoff_time} seconds")
|
||||
time.sleep(backoff_time)
|
||||
self._backoff_time = backoff_time
|
||||
else:
|
||||
# For other errors, just wait a bit
|
||||
time.sleep(3)
|
||||
|
||||
def _check_for_mentions(self, lookback_period):
|
||||
"""
|
||||
Check for new messages that mention the bot.
|
||||
|
||||
Args:
|
||||
lookback_period: How far back to check for mentions (in seconds)
|
||||
|
||||
Returns:
|
||||
List of messages that mention the bot
|
||||
"""
|
||||
# Calculate the timestamp for the lookback period
|
||||
lookback_timestamp = int(time.time() - lookback_period)
|
||||
|
||||
try:
|
||||
# If we've had rate limit errors, apply backoff
|
||||
if self._consecutive_rate_limit_errors > 0:
|
||||
backoff_delay = min(self._backoff_time, self._max_backoff_time)
|
||||
logger.info(f"Rate limit backoff: waiting {backoff_delay} seconds before API call")
|
||||
time.sleep(backoff_delay)
|
||||
|
||||
# Get all messages that mention the bot
|
||||
# Use the request endpoint for more control
|
||||
request = {
|
||||
"anchor": "newest",
|
||||
"num_before": 100,
|
||||
"num_after": 0,
|
||||
"narrow": [
|
||||
{"operator": "is", "operand": "mentioned"},
|
||||
{"operator": "streams", "operand": "public"}
|
||||
],
|
||||
"client_gravatar": False,
|
||||
"apply_markdown": False
|
||||
}
|
||||
|
||||
result = self.client.get_messages(request)
|
||||
|
||||
# Reset backoff if request was successful
|
||||
if result.get("result") == "success":
|
||||
if self._consecutive_rate_limit_errors > 0:
|
||||
logger.info("Successful API call, resetting rate limit backoff")
|
||||
self._consecutive_rate_limit_errors = 0
|
||||
self._backoff_time = 1
|
||||
else:
|
||||
logger.error(f"Failed to get messages: {result.get('msg', 'Unknown error')}")
|
||||
return []
|
||||
|
||||
# Filter messages
|
||||
new_messages = []
|
||||
for message in result.get("messages", []):
|
||||
# Skip if we've already processed this message
|
||||
if message["id"] in self.processed_message_ids:
|
||||
continue
|
||||
|
||||
# Skip messages older than the lookback window
|
||||
if message.get("timestamp", 0) < lookback_timestamp:
|
||||
continue
|
||||
|
||||
# Skip messages from the bot itself
|
||||
if message.get("sender_email") == self.email:
|
||||
continue
|
||||
|
||||
# Check if the bot is actually mentioned in the content
|
||||
if self.bot_mention_pattern.search(message.get("content", "")):
|
||||
# Add to processed set and new message list
|
||||
self.processed_message_ids.add(message["id"])
|
||||
new_messages.append(message)
|
||||
|
||||
if new_messages:
|
||||
logger.info(f"Found {len(new_messages)} new mention(s) of the bot")
|
||||
|
||||
return new_messages
|
||||
|
||||
except Exception as e:
|
||||
if "API usage exceeded rate limit" in str(e):
|
||||
self._consecutive_rate_limit_errors += 1
|
||||
self._backoff_time = min(self._backoff_time * 2, self._max_backoff_time)
|
||||
logger.error(f"Error checking for mentions: {str(e)} (backoff: {self._backoff_time}s)")
|
||||
else:
|
||||
logger.error(f"Error checking for mentions: {str(e)}")
|
||||
return []
|
||||
|
||||
def _process_message(self, message):
|
||||
"""
|
||||
Process a message and send a response.
|
||||
|
||||
Args:
|
||||
message: The message to process.
|
||||
"""
|
||||
try:
|
||||
# Extract content
|
||||
content = message.get("content", "")
|
||||
|
||||
# Log detailed information
|
||||
logger.info(f"Processing message ID: {message.get('id')}")
|
||||
|
||||
# Extract user query (remove the bot mention)
|
||||
query = self.bot_mention_pattern.sub("", content).strip()
|
||||
|
||||
# Log the incoming message
|
||||
logger.info(f"Extracted query: {query[:50]}...")
|
||||
|
||||
# If query is empty, provide the default response
|
||||
if not query:
|
||||
logger.info(f"Empty query received, sending default response")
|
||||
self._send_response(message, self.default_response)
|
||||
return
|
||||
|
||||
# Retrieve relevant context from ChromaDB
|
||||
context = self._retrieve_context(query)
|
||||
|
||||
# Generate response using the AI service
|
||||
response_text = self.ai_service.generate_response(query, context)
|
||||
|
||||
# Send the response
|
||||
self._send_response(message, response_text)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message: {str(e)}")
|
||||
self._send_response(message,
|
||||
"I apologize, but I encountered an error while processing your request. "
|
||||
"Please try again or contact the IT support team if the issue persists.")
|
||||
|
||||
def _retrieve_context(self, query, n_results=40):
|
||||
"""
|
||||
Retrieve relevant context from ChromaDB with enhanced relevance.
|
||||
|
||||
Args:
|
||||
query: The user's query.
|
||||
n_results: Number of results to retrieve.
|
||||
|
||||
Returns:
|
||||
A list of relevant context documents.
|
||||
"""
|
||||
try:
|
||||
# Search for similar documents in ChromaDB
|
||||
search_results = self.chroma_service.search_similar(query, n_results=n_results)
|
||||
|
||||
if not search_results:
|
||||
logger.warning(f"No context found for query: {query[:50]}...")
|
||||
return []
|
||||
|
||||
# Extract documents and metadata
|
||||
documents = []
|
||||
|
||||
# Check if there are documents in the results
|
||||
if search_results.get("documents") and len(search_results.get("documents", [])) > 0:
|
||||
# Get the documents and their metadata
|
||||
docs = search_results.get("documents", [[]])[0]
|
||||
metas = search_results.get("metadatas", [[]])[0]
|
||||
|
||||
# Results are assumed to arrive in relevance order; record each document's position in its metadata
|
||||
for i, (doc, metadata) in enumerate(zip(docs, metas)):
|
||||
# Create a document with its metadata
|
||||
if isinstance(doc, list) and len(doc) > 0:
|
||||
doc = doc[0] # Handle nested lists
|
||||
|
||||
# Include relevance position in metadata
|
||||
if metadata:
|
||||
metadata["relevance_position"] = i + 1
|
||||
|
||||
# Store document with enhanced metadata
|
||||
documents.append({
|
||||
"content": doc,
|
||||
"metadata": metadata,
|
||||
})
|
||||
|
||||
logger.info(f"Retrieved {len(documents)} context documents for query: {query[:30]}...")
|
||||
return documents
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error retrieving context: {str(e)}")
|
||||
return []
|
||||
|
||||
def _send_response(self, original_message, response_text):
|
||||
"""
|
||||
Send a response to a message.
|
||||
|
||||
Args:
|
||||
original_message: The original message being responded to.
|
||||
response_text: The text of the response to send.
|
||||
"""
|
||||
try:
|
||||
message_type = original_message.get("type")
|
||||
if message_type == "stream":
|
||||
# For stream messages, respond in the same stream and topic
|
||||
response = {
|
||||
"type": "stream",
|
||||
"to": original_message.get("display_recipient"),
|
||||
"subject": original_message.get("subject"),
|
||||
"content": response_text
|
||||
}
|
||||
else:
|
||||
# For private messages, respond to the sender
|
||||
response = {
|
||||
"type": "private",
|
||||
"to": [original_message.get("sender_email")],
|
||||
"content": response_text
|
||||
}
|
||||
|
||||
result = self.client.send_message(response)
|
||||
|
||||
if result.get("result") != "success":
|
||||
error_msg = result.get("msg", "Unknown error")
|
||||
logger.error(f"Failed to send response: {error_msg}")
|
||||
else:
|
||||
logger.info(f"Sent response to message: {original_message.get('id')}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending response: {str(e)}")
|
||||
|
||||
def send_test_message(self, recipient, content):
|
||||
"""
|
||||
Send a test message to verify the bot is working.
|
||||
|
||||
Args:
|
||||
recipient: The recipient of the message (email for private, channel name for stream).
|
||||
content: The content of the message.
|
||||
|
||||
Returns:
|
||||
The result of the API call.
|
||||
"""
|
||||
if "@" in recipient:
|
||||
# Private message
|
||||
message = {
|
||||
"type": "private",
|
||||
"to": [recipient],
|
||||
"content": content
|
||||
}
|
||||
else:
|
||||
# Stream message
|
||||
message = {
|
||||
"type": "stream",
|
||||
"to": recipient,
|
||||
"subject": "Bot Test",
|
||||
"content": content
|
||||
}
|
||||
|
||||
result = self.client.send_message(message)
|
||||
logger.info(f"Sent test message to {recipient}, result: {result.get('result')}")
|
||||
return result
|
||||
|
||||
def reset_cache(self):
|
||||
"""Reset message cache."""
|
||||
with self._lock:
|
||||
logger.info("Resetting message caches")
|
||||
self.processed_message_ids = set()
|
||||
return "Message cache reset successfully"
|
||||
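A short usage sketch for the bot service above, assuming `ZULIP_BOT_EMAIL`, `ZULIP_BOT_API_KEY` and `ZULIP_SITE` are set as required by `__init__`; `send_test_message` is a quick connectivity check before starting the polling loop. The stream name is a placeholder.

```python
# Sketch only: requires the Zulip credentials listed above in the environment.
import time

from app.utils.bot_service import ZulipBotService

bot = ZulipBotService()  # singleton; repeated calls return the same per-process instance
bot.send_test_message("it-support", "IT_Bot is online")  # placeholder stream name, or an email for a private message

bot.start()              # polls for @**IT_Bot** mentions in a daemon thread
try:
    time.sleep(60)       # let it run briefly for this demo
finally:
    bot.stop()
```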
8
app/utils/contextual_retrieval/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
"""
|
||||
Contextual Retrieval package for enhancing RAG systems.
|
||||
|
||||
This package implements advanced retrieval techniques based on Anthropic's Contextual Retrieval:
|
||||
- Contextual Embeddings: Adding rich context to chunks before embedding
|
||||
- Contextual BM25: Using BM25 for exact matching with context-enhanced chunks
|
||||
- Reranking: Further improving results by reranking retrieved chunks
|
||||
"""
|
||||
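A rough sketch of how these pieces are meant to compose, using the services defined in the modules below (the vector side of the index is maintained separately through `ChromaDBService`); the message, metadata, and document ID are placeholders.

```python
# Rough sketch of the intended flow across the modules in this package.
from app.utils.contextual_retrieval.context_service import ContextService
from app.utils.contextual_retrieval.bm25_service import BM25Service
from app.utils.contextual_retrieval.hybrid_search import HybridSearchService

message = "Printer on floor 3 shows error 49; a firmware update resolved it."
metadata = {"channel": "it-support", "subject": "printer errors",
            "sender": "helpdesk", "timestamp": "2024-05-01"}

# 1. Contextual Embeddings: prepend an LLM-generated description before indexing.
enriched = ContextService.contextualize_content(message, metadata)

# 2. Contextual BM25: keep a lexical index alongside the vector store.
BM25Service.add_document(enriched, "msg-1")

# 3. Hybrid search + reranking over both indexes.
results = HybridSearchService.hybrid_search("printer error 49", n_results=5, rerank=True)
```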
181
app/utils/contextual_retrieval/bm25_service.py
Normal file
@ -0,0 +1,181 @@
|
||||
"""
|
||||
BM25 Service for exact keyword matching in retrieval.
|
||||
|
||||
This service implements the BM25 algorithm for better lexical search,
|
||||
complementing the semantic search provided by vector embeddings.
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from rank_bm25 import BM25Okapi
|
||||
import re
|
||||
import nltk
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
# Download NLTK resources
|
||||
try:
|
||||
nltk.data.find('tokenizers/punkt')
|
||||
except LookupError:
|
||||
nltk.download('punkt', quiet=True)
|
||||
|
||||
try:
|
||||
nltk.data.find('corpora/stopwords')
|
||||
except LookupError:
|
||||
nltk.download('stopwords', quiet=True)
|
||||
|
||||
class BM25Service:
|
||||
"""Service for BM25-based search."""
|
||||
|
||||
# BM25 index and corpus
|
||||
_bm25 = None
|
||||
_corpus = []
|
||||
_doc_ids = []
|
||||
_index_path = os.path.join("chromadb", "bm25_index.pkl")
|
||||
|
||||
@staticmethod
|
||||
def preprocess_text(text: str) -> List[str]:
|
||||
"""
|
||||
Preprocess text for BM25 indexing.
|
||||
|
||||
Args:
|
||||
text (str): Text to preprocess
|
||||
|
||||
Returns:
|
||||
List[str]: List of preprocessed tokens
|
||||
"""
|
||||
# Convert to lowercase
|
||||
text = text.lower()
|
||||
|
||||
# Remove special characters and digits
|
||||
text = re.sub(r'[^\w\s]', ' ', text)
|
||||
text = re.sub(r'\d+', ' ', text)
|
||||
|
||||
# Tokenize
|
||||
tokens = word_tokenize(text)
|
||||
|
||||
# Remove stopwords
|
||||
stop_words = set(stopwords.words('english'))
|
||||
tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
|
||||
|
||||
return tokens
|
||||
|
||||
@staticmethod
|
||||
def index_documents(documents: List[str], doc_ids: List[str]) -> None:
|
||||
"""
|
||||
Create a BM25 index for a list of documents.
|
||||
|
||||
Args:
|
||||
documents (List[str]): List of document contents
|
||||
doc_ids (List[str]): List of document IDs
|
||||
"""
|
||||
# Preprocess documents
|
||||
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in documents]
|
||||
|
||||
# Create BM25 index
|
||||
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
|
||||
BM25Service._corpus = documents
|
||||
BM25Service._doc_ids = doc_ids
|
||||
|
||||
# Save index to disk
|
||||
BM25Service.save_index()
|
||||
|
||||
@staticmethod
|
||||
def add_document(document: str, doc_id: str) -> None:
|
||||
"""
|
||||
Add a single document to the BM25 index.
|
||||
|
||||
Args:
|
||||
document (str): Document content
|
||||
doc_id (str): Document ID
|
||||
"""
|
||||
# Create index if it doesn't exist
|
||||
if BM25Service._bm25 is None:
|
||||
BM25Service.load_index()
|
||||
if BM25Service._bm25 is None:
|
||||
BM25Service.index_documents([document], [doc_id])
|
||||
return
|
||||
|
||||
# Add document to corpus
|
||||
BM25Service._corpus.append(document)
|
||||
BM25Service._doc_ids.append(doc_id)
|
||||
|
||||
# Note: BM25Okapi has no incremental update, so the full corpus (including this
# document) is re-tokenized below when the index is rebuilt
|
||||
|
||||
# Rebuild index
|
||||
tokenized_corpus = [BM25Service.preprocess_text(doc) for doc in BM25Service._corpus]
|
||||
BM25Service._bm25 = BM25Okapi(tokenized_corpus)
|
||||
|
||||
# Save index to disk
|
||||
BM25Service.save_index()
|
||||
|
||||
@staticmethod
|
||||
def search(query: str, top_k: int = 5) -> List[Tuple[str, float]]:
|
||||
"""
|
||||
Search for documents using BM25.
|
||||
|
||||
Args:
|
||||
query (str): Query text
|
||||
top_k (int): Number of results to return
|
||||
|
||||
Returns:
|
||||
List[Tuple[str, float]]: List of (doc_id, score) tuples
|
||||
"""
|
||||
# Load index if it doesn't exist
|
||||
if BM25Service._bm25 is None:
|
||||
BM25Service.load_index()
|
||||
if BM25Service._bm25 is None:
|
||||
return []
|
||||
|
||||
# Preprocess query
|
||||
tokenized_query = BM25Service.preprocess_text(query)
|
||||
|
||||
# Get scores
|
||||
scores = BM25Service._bm25.get_scores(tokenized_query)
|
||||
|
||||
# Get top-k documents
|
||||
top_indices = np.argsort(scores)[::-1][:top_k]
|
||||
|
||||
# Return (doc_id, score) pairs
|
||||
results = []
|
||||
for idx in top_indices:
|
||||
if idx < len(BM25Service._doc_ids):
|
||||
results.append((BM25Service._doc_ids[idx], scores[idx]))
|
||||
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def save_index() -> None:
|
||||
"""Save BM25 index to disk."""
|
||||
try:
|
||||
# Create directory if it doesn't exist
|
||||
os.makedirs(os.path.dirname(BM25Service._index_path), exist_ok=True)
|
||||
|
||||
# Save index
|
||||
with open(BM25Service._index_path, 'wb') as f:
|
||||
pickle.dump({
|
||||
'bm25': BM25Service._bm25,
|
||||
'corpus': BM25Service._corpus,
|
||||
'doc_ids': BM25Service._doc_ids
|
||||
}, f)
|
||||
except Exception as e:
|
||||
print(f"Error saving BM25 index: {e}")
|
||||
|
||||
@staticmethod
|
||||
def load_index() -> None:
|
||||
"""Load BM25 index from disk."""
|
||||
try:
|
||||
if os.path.exists(BM25Service._index_path):
|
||||
with open(BM25Service._index_path, 'rb') as f:
|
||||
data = pickle.load(f)
|
||||
BM25Service._bm25 = data.get('bm25')
|
||||
BM25Service._corpus = data.get('corpus', [])
|
||||
BM25Service._doc_ids = data.get('doc_ids', [])
|
||||
except Exception as e:
|
||||
print(f"Error loading BM25 index: {e}")
|
||||
# Initialize with empty index
|
||||
BM25Service._bm25 = None
|
||||
BM25Service._corpus = []
|
||||
BM25Service._doc_ids = []
|
||||
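A small, self-contained example of the service above; note that `save_index` writes `chromadb/bm25_index.pkl` relative to the working directory, per `_index_path`. The documents and IDs are made up.

```python
from app.utils.contextual_retrieval.bm25_service import BM25Service

docs = [
    "Reset the domain password from the self-service portal",
    "VPN client fails to connect after the certificate expired",
    "Outlook keeps asking for credentials after a password change",
]
BM25Service.index_documents(docs, ["m1", "m2", "m3"])

# search() returns (doc_id, score) pairs, best match first; stopwords and digits
# are stripped by preprocess_text() before scoring.
for doc_id, score in BM25Service.search("vpn certificate", top_k=2):
    print(doc_id, round(score, 3))
```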
112
app/utils/contextual_retrieval/context_service.py
Normal file
@ -0,0 +1,112 @@
|
||||
"""
|
||||
Context Service for generating rich contextual descriptions for messages.
|
||||
|
||||
This service uses LLMs to generate contextual descriptions for messages,
|
||||
which improves retrieval by providing more context to the embedding process.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, List, Optional, Union
|
||||
from openai import OpenAI
|
||||
from app.config import Config
|
||||
|
||||
class ContextService:
|
||||
"""Service for generating rich contextual descriptions for messages."""
|
||||
|
||||
# Initialize OpenAI client
|
||||
client = OpenAI(api_key=Config.OPENAI_API_KEY)
|
||||
|
||||
# Cache for context generation to reduce API calls
|
||||
_context_cache = {}
|
||||
|
||||
@staticmethod
|
||||
def generate_context(content: str, metadata: Dict) -> str:
|
||||
"""
|
||||
Generate a rich contextual description for a message.
|
||||
|
||||
Args:
|
||||
content (str): The original message content
|
||||
metadata (Dict): Metadata about the message (channel, subject, sender, timestamp)
|
||||
|
||||
Returns:
|
||||
str: A rich contextual description
|
||||
"""
|
||||
# Create a cache key from content and metadata
|
||||
cache_key = f"{content[:100]}_{metadata.get('channel')}_{metadata.get('subject')}"
|
||||
|
||||
# Check if we have this context cached
|
||||
if cache_key in ContextService._context_cache:
|
||||
return ContextService._context_cache[cache_key]
|
||||
|
||||
try:
|
||||
# Create messages for context generation
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a context generation assistant. Generate a short, succinct context description for the given message. The context should situate this message within its domain and highlight key information that would be helpful for retrieval. Keep the context under 100 words."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"""
|
||||
Message details:
|
||||
- Channel: {metadata.get('channel', 'Unknown')}
|
||||
- Subject: {metadata.get('subject', 'Unknown')}
|
||||
- Sender: {metadata.get('sender', 'Unknown')}
|
||||
- Timestamp: {metadata.get('timestamp', 'Unknown')}
|
||||
|
||||
Message content:
|
||||
{content}
|
||||
"""
|
||||
}
|
||||
]
|
||||
|
||||
# Generate the context using OpenAI
|
||||
response = ContextService.client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=messages,
|
||||
max_tokens=150,
|
||||
temperature=0.3
|
||||
)
|
||||
|
||||
# Extract the response text
|
||||
context = response.choices[0].message.content.strip()
|
||||
|
||||
# If the context is too long, truncate it
|
||||
if len(context) > 500:
|
||||
context = context[:497] + "..."
|
||||
|
||||
# Cache the result
|
||||
ContextService._context_cache[cache_key] = context
|
||||
|
||||
return context
|
||||
except Exception as e:
|
||||
print(f"Error generating context: {e}")
|
||||
|
||||
# Fallback to a simple context based on metadata
|
||||
channel = metadata.get('channel', 'Unknown')
|
||||
subject = metadata.get('subject', 'Unknown')
|
||||
|
||||
fallback_context = f"This message is from the {channel} channel and discusses {subject}."
|
||||
|
||||
# Cache the fallback
|
||||
ContextService._context_cache[cache_key] = fallback_context
|
||||
|
||||
return fallback_context
|
||||
|
||||
@staticmethod
|
||||
def contextualize_content(content: str, metadata: Dict) -> str:
|
||||
"""
|
||||
Add rich contextual description to a message.
|
||||
|
||||
Args:
|
||||
content (str): The original message content
|
||||
metadata (Dict): Metadata about the message
|
||||
|
||||
Returns:
|
||||
str: The content with context prepended
|
||||
"""
|
||||
# Generate the context
|
||||
context = ContextService.generate_context(content, metadata)
|
||||
|
||||
# Add the context to the content
|
||||
return f"CONTEXT: {context}\n\nCONTENT: {content}"
|
||||
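For illustration, the string layout produced by `contextualize_content`; `generate_context` normally calls GPT-4o, so the example below just reproduces the metadata-only fallback built in the `except` branch, with placeholder values.

```python
# Illustration only: mirrors the fallback context string built above.
metadata = {"channel": "it-support", "subject": "wifi outage"}
content = "Access points on floor 2 rebooted twice this morning."

context = f"This message is from the {metadata['channel']} channel and discusses {metadata['subject']}."
enriched = f"CONTEXT: {context}\n\nCONTENT: {content}"

print(enriched)
# CONTEXT: This message is from the it-support channel and discusses wifi outage.
#
# CONTENT: Access points on floor 2 rebooted twice this morning.
```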
160
app/utils/contextual_retrieval/hybrid_search.py
Normal file
@ -0,0 +1,160 @@
|
||||
"""
|
||||
Hybrid Search Service that combines vector search and BM25 search.
|
||||
|
||||
This service implements hybrid search by combining results from vector-based
|
||||
semantic search and BM25 lexical search using weighted score fusion.
|
||||
"""
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from app.db.chroma_service import ChromaDBService
|
||||
from app.utils.contextual_retrieval.bm25_service import BM25Service
|
||||
from app.utils.contextual_retrieval.reranker_service import RerankerService
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logger = logging.getLogger("hybrid_search")
|
||||
|
||||
class HybridSearchService:
|
||||
"""Service for hybrid search combining vector search and BM25."""
|
||||
|
||||
@staticmethod
|
||||
def hybrid_search(query: str, n_results: int = 5, filter_criteria: Optional[Dict] = None,
|
||||
rerank: bool = True, semantic_weight: float = 0.7) -> List[Dict]:
|
||||
"""
|
||||
Perform hybrid search using vector search and BM25.
|
||||
|
||||
Args:
|
||||
query (str): Query text
|
||||
n_results (int): Number of results to return
|
||||
filter_criteria (Dict): Metadata filter criteria
|
||||
rerank (bool): Whether to apply reranking
|
||||
semantic_weight (float): Weight for semantic search (0-1)
|
||||
|
||||
Returns:
|
||||
List[Dict]: Search results
|
||||
"""
|
||||
try:
|
||||
# Get more results than requested for fusion
|
||||
vector_n = n_results * 3
|
||||
bm25_n = n_results * 3
|
||||
|
||||
# Perform vector search - use _internal_call=True to prevent circular imports
|
||||
vector_results = ChromaDBService.search_similar(
|
||||
query_text=query,
|
||||
n_results=vector_n,
|
||||
filter_criteria=filter_criteria,
|
||||
_internal_call=True # This prevents circular calls
|
||||
)
|
||||
|
||||
# Extract vector search results
|
||||
vec_docs = []
|
||||
if vector_results and 'documents' in vector_results and len(vector_results['documents']) > 0:
|
||||
for i in range(len(vector_results['documents'][0])):
|
||||
vec_docs.append({
|
||||
'id': vector_results['ids'][0][i],
|
||||
'content': vector_results['documents'][0][i],
|
||||
'metadata': vector_results['metadatas'][0][i],
|
||||
'vector_score': 1.0 - min(vector_results['distances'][0][i], 1.0),
|
||||
'rank': i + 1 # 1-based rank
|
||||
})
|
||||
|
||||
# Perform BM25 search
|
||||
bm25_results = BM25Service.search(query, top_k=bm25_n)
|
||||
|
||||
# Extract BM25 search results and normalize scores
|
||||
bm25_docs = []
|
||||
if bm25_results:
|
||||
# Get max score for normalization
|
||||
max_score = max([score for _, score in bm25_results]) if bm25_results else 1.0
|
||||
|
||||
# Create a set of doc IDs already in vector results to avoid duplicate lookups
|
||||
existing_doc_ids = {doc['id'] for doc in vec_docs}
|
||||
|
||||
for i, (doc_id, score) in enumerate(bm25_results):
|
||||
# Skip duplicate lookups
|
||||
if doc_id in existing_doc_ids:
|
||||
continue
|
||||
|
||||
# Get document content from ChromaDB (if available)
|
||||
try:
|
||||
doc_data = ChromaDBService.get_message_by_id(doc_id)
|
||||
if doc_data:
|
||||
bm25_docs.append({
|
||||
'id': doc_id,
|
||||
'content': doc_data['content'],
|
||||
'metadata': doc_data['metadata'],
|
||||
'bm25_score': score / max_score if max_score > 0 else 0,
|
||||
'rank': i + 1 # 1-based rank
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Error retrieving document {doc_id}: {e}")
|
||||
continue
|
||||
|
||||
# Combine results by weighted score fusion of the vector and BM25 scores
|
||||
fused_docs = HybridSearchService._fuse_results(vec_docs, bm25_docs, semantic_weight)
|
||||
|
||||
# Apply reranking if requested
|
||||
if rerank and len(fused_docs) > 0:
|
||||
try:
|
||||
return RerankerService.rerank(query, fused_docs, top_k=n_results)
|
||||
except Exception as e:
|
||||
logger.warning(f"Reranking failed: {e}, returning non-reranked results")
|
||||
return fused_docs[:n_results]
|
||||
|
||||
# Otherwise just return the top n fused results
|
||||
return fused_docs[:n_results]
|
||||
except Exception as e:
|
||||
logger.error(f"Error in hybrid search: {e}")
|
||||
# Return empty results on error
|
||||
return []
|
||||
|
||||
@staticmethod
|
||||
def _fuse_results(vec_docs: List[Dict], bm25_docs: List[Dict],
|
||||
semantic_weight: float = 0.7) -> List[Dict]:
|
||||
"""
|
||||
Fuse results from vector search and BM25 search.
|
||||
|
||||
Args:
|
||||
vec_docs (List[Dict]): Vector search results
|
||||
bm25_docs (List[Dict]): BM25 search results
|
||||
semantic_weight (float): Weight for semantic search (0-1)
|
||||
|
||||
Returns:
|
||||
List[Dict]: Fused search results
|
||||
"""
|
||||
# Create a map of document IDs to documents
|
||||
doc_map = {}
|
||||
|
||||
# Process vector search results
|
||||
for doc in vec_docs:
|
||||
doc_id = doc['id']
|
||||
if doc_id not in doc_map:
|
||||
doc_map[doc_id] = doc.copy()
|
||||
doc_map[doc_id]['combined_score'] = doc.get('vector_score', 0) * semantic_weight
|
||||
else:
|
||||
# Update existing document
|
||||
doc_map[doc_id]['vector_score'] = doc.get('vector_score', 0)
|
||||
doc_map[doc_id]['combined_score'] = (
|
||||
doc_map[doc_id].get('combined_score', 0) +
|
||||
doc.get('vector_score', 0) * semantic_weight
|
||||
)
|
||||
|
||||
# Process BM25 search results
|
||||
for doc in bm25_docs:
|
||||
doc_id = doc['id']
|
||||
if doc_id not in doc_map:
|
||||
doc_map[doc_id] = doc.copy()
|
||||
doc_map[doc_id]['combined_score'] = doc.get('bm25_score', 0) * (1 - semantic_weight)
|
||||
else:
|
||||
# Update existing document
|
||||
doc_map[doc_id]['bm25_score'] = doc.get('bm25_score', 0)
|
||||
doc_map[doc_id]['combined_score'] = (
|
||||
doc_map[doc_id].get('combined_score', 0) +
|
||||
doc.get('bm25_score', 0) * (1 - semantic_weight)
|
||||
)
|
||||
|
||||
# Convert map to list and sort by combined score
|
||||
results = list(doc_map.values())
|
||||
results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
|
||||
|
||||
return results
|
||||
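To make the weighting concrete, a small numeric sketch of how `_fuse_results` combines the two scores under the default `semantic_weight=0.7` (the example scores are made up):

```python
# Worked example of the weighted fusion above; a document found by both searches
# receives both contributions.
semantic_weight = 0.7

vector_score = 0.82  # 1 - distance, from the vector search
bm25_score = 0.60    # BM25 score normalised by the best BM25 hit

combined = vector_score * semantic_weight + bm25_score * (1 - semantic_weight)
print(round(combined, 3))  # 0.754

# A BM25-only hit with the same score would get 0.60 * 0.3 = 0.18 and rank lower.
```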
249
app/utils/contextual_retrieval/reranker_service.py
Normal file
@ -0,0 +1,249 @@
|
||||
"""
|
||||
Reranker Service for improving search results by reranking candidate documents.
|
||||
|
||||
This service uses a custom reranking approach combining multiple signals
|
||||
to improve the relevance of search results.
|
||||
"""
|
||||
import re
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logger = logging.getLogger("reranker_service")
|
||||
|
||||
class RerankerService:
|
||||
"""Service for reranking search results using a custom approach."""
|
||||
|
||||
# Cache for reranked results
|
||||
_rerank_cache = {}
|
||||
|
||||
@staticmethod
|
||||
def rerank(query: str, documents: List[Dict], top_k: int = 20) -> List[Dict]:
|
||||
"""
|
||||
Rerank documents based on relevance to query using a multi-factor approach.
|
||||
|
||||
Args:
|
||||
query (str): Query text
|
||||
documents (List[Dict]): List of document dictionaries with 'id' and 'content'
|
||||
top_k (int): Number of results to return
|
||||
|
||||
Returns:
|
||||
List[Dict]: Reranked documents
|
||||
"""
|
||||
# Return all documents if there are fewer than top_k
|
||||
if len(documents) <= top_k:
|
||||
return documents
|
||||
|
||||
# Create cache key
|
||||
cache_key = f"{query}_{sorted([doc.get('id', '') for doc in documents])}"
|
||||
|
||||
# Check if we have this reranking cached
|
||||
if cache_key in RerankerService._rerank_cache:
|
||||
return RerankerService._rerank_cache[cache_key][:top_k]
|
||||
|
||||
try:
|
||||
# Prepare query
|
||||
query_terms = RerankerService._tokenize(query)
|
||||
query_lower = query.lower()
|
||||
|
||||
# Calculate multi-factor relevance score for each document
|
||||
scored_docs = []
|
||||
for doc in documents:
|
||||
content = doc.get('content', '')
|
||||
content_lower = content.lower()
|
||||
|
||||
# 1. Term frequency scoring (similar to BM25)
|
||||
term_score = RerankerService._calculate_term_score(content_lower, query_terms)
|
||||
|
||||
# 2. Exact phrase matching
|
||||
phrase_score = RerankerService._calculate_phrase_score(content_lower, query_lower)
|
||||
|
||||
# 3. Semantic similarity (use existing score if available)
|
||||
semantic_score = RerankerService._get_semantic_score(doc)
|
||||
|
||||
# 4. Document position bonus
|
||||
position_score = RerankerService._calculate_position_score(content_lower, query_terms)
|
||||
|
||||
# 5. Document length normalization
|
||||
length_factor = RerankerService._calculate_length_factor(content)
|
||||
|
||||
# Calculate final combined score
|
||||
# Weights can be adjusted based on performance
|
||||
final_score = (
|
||||
0.35 * term_score +
|
||||
0.30 * phrase_score +
|
||||
0.25 * semantic_score +
|
||||
0.10 * position_score
|
||||
) * length_factor
|
||||
|
||||
scored_doc = doc.copy()
|
||||
scored_doc['score'] = final_score
|
||||
scored_doc['_term_score'] = term_score
|
||||
scored_doc['_phrase_score'] = phrase_score
|
||||
scored_doc['_semantic_score'] = semantic_score
|
||||
scored_doc['_position_score'] = position_score
|
||||
|
||||
scored_docs.append(scored_doc)
|
||||
|
||||
# Sort by final score (highest first)
|
||||
scored_docs.sort(key=lambda x: x.get('score', 0), reverse=True)
|
||||
|
||||
# Take the top_k
|
||||
result = scored_docs[:top_k]
|
||||
|
||||
# Clean up diagnostic scores before returning
|
||||
for doc in result:
|
||||
doc.pop('_term_score', None)
|
||||
doc.pop('_phrase_score', None)
|
||||
doc.pop('_semantic_score', None)
|
||||
doc.pop('_position_score', None)
|
||||
|
||||
# Cache the results
|
||||
RerankerService._rerank_cache[cache_key] = result
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error reranking documents: {e}")
|
||||
|
||||
# Fallback: simple sorting based on combined_score if available
|
||||
documents.sort(key=lambda x: x.get('combined_score', 0), reverse=True)
|
||||
return documents[:top_k]
|
||||
|
||||
@staticmethod
|
||||
def _tokenize(text: str) -> List[str]:
|
||||
"""
|
||||
Tokenize a string into terms.
|
||||
|
||||
Args:
|
||||
text (str): Text to tokenize
|
||||
|
||||
Returns:
|
||||
List[str]: List of tokens
|
||||
"""
|
||||
# Simple tokenization by splitting on whitespace and removing punctuation
|
||||
tokens = re.findall(r'\b\w+\b', text.lower())
|
||||
return tokens
|
||||
|
||||
@staticmethod
|
||||
def _calculate_term_score(content: str, query_terms: List[str]) -> float:
|
||||
"""
|
||||
Calculate term frequency score.
|
||||
|
||||
Args:
|
||||
content (str): Document content
|
||||
query_terms (List[str]): Query terms
|
||||
|
||||
Returns:
|
||||
float: Term frequency score
|
||||
"""
|
||||
score = 0
|
||||
content_tokens = RerankerService._tokenize(content)
|
||||
|
||||
# Simple term frequency calculation
|
||||
for term in query_terms:
|
||||
term_count = content_tokens.count(term)
|
||||
score += term_count
|
||||
|
||||
# Normalize by document length
|
||||
if len(content_tokens) > 0:
|
||||
score = score / len(content_tokens)
|
||||
|
||||
return score
|
||||
|
||||
@staticmethod
|
||||
def _calculate_phrase_score(content: str, query: str) -> float:
|
||||
"""
|
||||
Calculate exact phrase matching score.
|
||||
|
||||
Args:
|
||||
content (str): Document content
|
||||
query (str): Original query
|
||||
|
||||
Returns:
|
||||
float: Phrase matching score
|
||||
"""
|
||||
# Count exact matches of the query in the content
|
||||
exact_matches = content.count(query)
|
||||
|
||||
# Base score from exact phrase matches
|
||||
score = exact_matches * 2.0 # Higher weight for exact matches
|
||||
|
||||
# Check for partial matches if no exact matches
|
||||
if exact_matches == 0 and len(query) > 5:
|
||||
# Generate query n-grams (only for longer queries)
|
||||
query_parts = [query[i:i+4] for i in range(0, len(query)-3)]
|
||||
for part in query_parts:
|
||||
if len(part) >= 4: # Only consider meaningful parts
|
||||
score += 0.2 * content.count(part)
|
||||
|
||||
return min(score, 10.0) # Cap to avoid extremely high scores
|
||||
|
||||
@staticmethod
|
||||
def _get_semantic_score(doc: Dict) -> float:
|
||||
"""
|
||||
Extract semantic similarity score from document.
|
||||
|
||||
Args:
|
||||
doc (Dict): Document
|
||||
|
||||
Returns:
|
||||
float: Semantic similarity score
|
||||
"""
|
||||
# Use vector_score if available (from vector search)
|
||||
if 'vector_score' in doc:
|
||||
return doc['vector_score']
|
||||
|
||||
# Use combined_score as fallback
|
||||
if 'combined_score' in doc:
|
||||
return doc['combined_score']
|
||||
|
||||
return 0.5 # Default middle value if no scores available
|
||||
|
||||
@staticmethod
|
||||
def _calculate_position_score(content: str, query_terms: List[str]) -> float:
|
||||
"""
|
||||
Calculate score based on position of match in document.
|
||||
Earlier matches often indicate higher relevance.
|
||||
|
||||
Args:
|
||||
content (str): Document content
|
||||
query_terms (List[str]): Query terms
|
||||
|
||||
Returns:
|
||||
float: Position score
|
||||
"""
|
||||
score = 0
|
||||
# Check for terms in the first 20% of the document
|
||||
first_section = content[:int(len(content) * 0.2)]
|
||||
|
||||
for term in query_terms:
|
||||
if term in first_section:
|
||||
score += 0.5
|
||||
|
||||
return min(score, 1.0) # Normalize to maximum of 1.0
|
||||
|
||||
@staticmethod
|
||||
def _calculate_length_factor(content: str) -> float:
|
||||
"""
|
||||
Calculate length normalization factor.
|
||||
Prevents extremely short documents from ranking too high.
|
||||
|
||||
Args:
|
||||
content (str): Document content
|
||||
|
||||
Returns:
|
||||
float: Length normalization factor
|
||||
"""
|
||||
token_count = len(RerankerService._tokenize(content))
|
||||
|
||||
# Penalize very short documents
|
||||
if token_count < 10:
|
||||
return 0.7
|
||||
|
||||
# Slightly favor mid-sized documents
|
||||
if 20 <= token_count <= 300:
|
||||
return 1.1
|
||||
|
||||
return 1.0 # Neutral factor for other documents
|
||||
111
app/utils/embeddings.py
Normal file
@ -0,0 +1,111 @@
|
||||
"""
|
||||
Embeddings utilities using Ollama and Nomic.
|
||||
"""
|
||||
import os
|
||||
import requests
|
||||
import numpy as np
|
||||
from typing import List, Optional, Union
|
||||
import ollama
|
||||
from app.config import Config
|
||||
|
||||
class EmbeddingService:
|
||||
"""Service for generating embeddings using Ollama and Nomic."""
|
||||
|
||||
@staticmethod
|
||||
def get_ollama_embeddings(texts: List[str], model: Optional[str] = None) -> List[List[float]]:
|
||||
"""
|
||||
Generate embeddings using Ollama.
|
||||
|
||||
Args:
|
||||
texts: List of texts to generate embeddings for
|
||||
model: Ollama model to use for embeddings (default from config)
|
||||
|
||||
Returns:
|
||||
List of embeddings as float arrays
|
||||
"""
|
||||
if model is None:
|
||||
# Use model from config
|
||||
model = Config.OLLAMA_MODEL
|
||||
|
||||
# Use a client pointed at the configured Ollama host (assigning ollama.host has no
# effect with the ollama Python client, so an explicit Client instance is used)
client = ollama.Client(host=Config.OLLAMA_HOST)
|
||||
|
||||
embeddings = []
|
||||
|
||||
for text in texts:
|
||||
try:
|
||||
# Call Ollama API for embeddings
|
||||
response = client.embeddings(model=model, prompt=text)
|
||||
embedding = response.get("embedding", [])
|
||||
embeddings.append(embedding)
|
||||
except Exception as e:
|
||||
print(f"Error generating Ollama embedding: {e}")
|
||||
# Return a zero embedding as fallback
|
||||
embeddings.append([0.0] * 768) # typical dimension for text embeddings
|
||||
|
||||
return embeddings
|
||||
|
||||
@staticmethod
|
||||
def get_nomic_embeddings(texts: List[str]) -> List[List[float]]:
|
||||
"""
|
||||
Generate embeddings using Nomic.
|
||||
|
||||
Args:
|
||||
texts: List of texts to generate embeddings for
|
||||
|
||||
Returns:
|
||||
List of embeddings as float arrays
|
||||
"""
|
||||
try:
|
||||
# The new version of Nomic requires a Cohere API key, so we'll fall back to Ollama
|
||||
# if we don't have one configured
|
||||
cohere_api_key = Config.COHERE_API_KEY
|
||||
|
||||
if not cohere_api_key:
|
||||
print("No Cohere API key found for Nomic embeddings, falling back to Ollama")
|
||||
return EmbeddingService.get_ollama_embeddings(texts)
|
||||
|
||||
# Dynamically import nomic embedders to avoid startup errors if not available
|
||||
from nomic.embedders import CohereEmbedder
|
||||
|
||||
# Create a Nomic embedding model using CohereEmbedder with API key
|
||||
embedding_model = CohereEmbedder(cohere_api_key=cohere_api_key)
|
||||
|
||||
# Generate embeddings for the texts
|
||||
embeddings = []
|
||||
for text in texts:
|
||||
embedding = embedding_model.embed(text)
|
||||
embeddings.append(embedding)
|
||||
|
||||
return embeddings
|
||||
except Exception as e:
|
||||
print(f"Error generating Nomic embeddings: {e}")
|
||||
# Fall back to Ollama embeddings
|
||||
print("Falling back to Ollama embeddings")
|
||||
return EmbeddingService.get_ollama_embeddings(texts)
|
||||
|
||||
@staticmethod
|
||||
def get_embeddings(texts: Union[str, List[str]], use_nomic: Optional[bool] = None) -> List[List[float]]:
|
||||
"""
|
||||
Generate embeddings using either Nomic or Ollama.
|
||||
|
||||
Args:
|
||||
texts: Text or list of texts to generate embeddings for
|
||||
use_nomic: Whether to use Nomic (True) or Ollama (False), defaults to config setting
|
||||
|
||||
Returns:
|
||||
List of embeddings as float arrays
|
||||
"""
|
||||
# Convert single text to list
|
||||
if isinstance(texts, str):
|
||||
texts = [texts]
|
||||
|
||||
# If use_nomic is not specified, use the config setting
|
||||
if use_nomic is None:
|
||||
use_nomic = Config.USE_NOMIC_EMBEDDINGS
|
||||
|
||||
# Generate embeddings using chosen method
|
||||
if use_nomic:
|
||||
return EmbeddingService.get_nomic_embeddings(texts)
|
||||
else:
|
||||
return EmbeddingService.get_ollama_embeddings(texts)
|
||||
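A minimal usage sketch for the embedding helpers above, assuming an Ollama server is reachable at the configured host and has the configured embedding model pulled (e.g. `ollama pull nomic-embed-text`); otherwise the zero-vector fallback is returned.

```python
# Sketch only: requires a running Ollama server with the configured embedding model.
from app.utils.embeddings import EmbeddingService

vectors = EmbeddingService.get_embeddings(
    ["How do I reset my VPN certificate?", "Printer error 49 on floor 3"],
    use_nomic=False,  # force the Ollama path for this example
)
print(len(vectors), len(vectors[0]))  # 2 texts, one embedding vector each
```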
217
app/utils/sync_service.py
Normal file
@ -0,0 +1,217 @@
|
||||
"""
|
||||
Message synchronization service.
|
||||
Handles periodic fetching of new messages from Zulip and adds them to ChromaDB.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
import pickle
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
from app.db.chroma_service import ChromaDBService
|
||||
|
||||
# Configure logger
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("sync_service")
|
||||
|
||||
class MessageSyncService:
|
||||
"""Service for synchronizing messages from Zulip to ChromaDB."""
|
||||
|
||||
# File to store the last synced message ID
|
||||
_SYNC_STATE_FILE = "sync_state.pickle"
|
||||
|
||||
def __init__(self, sync_interval=60, state_dir=None):
|
||||
"""
|
||||
Initialize the message sync service.
|
||||
|
||||
Args:
|
||||
sync_interval (int): Sync interval in seconds (default: 60)
|
||||
state_dir (str): Directory to store sync state file (default: current directory)
|
||||
"""
|
||||
self.sync_interval = sync_interval
|
||||
self.is_running = False
|
||||
self.sync_thread = None
|
||||
self.state_dir = state_dir or os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
self.last_sync_time = None
|
||||
self.last_message_id = None
|
||||
self.batch_size = 50 # Default batch size
|
||||
|
||||
# Load the last synced state if available
|
||||
self._load_sync_state()
|
||||
|
||||
def _set_batch_size(self, batch_size):
|
||||
"""Set the batch size for syncing messages."""
|
||||
if batch_size > 0:
|
||||
self.batch_size = batch_size
|
||||
logger.info(f"Set batch size to {batch_size}")
|
||||
        else:
            logger.warning(f"Invalid batch size: {batch_size}, using default")

    def _get_state_file_path(self):
        """Get the full path to the sync state file."""
        return os.path.join(self.state_dir, self._SYNC_STATE_FILE)

    def _load_sync_state(self):
        """Load the last sync state from disk."""
        try:
            state_file = self._get_state_file_path()
            if os.path.exists(state_file):
                with open(state_file, 'rb') as f:
                    state = pickle.load(f)
                    self.last_sync_time = state.get('last_sync_time')
                    self.last_message_id = state.get('last_message_id')
                logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
            else:
                logger.info("No previous sync state found, starting fresh")
        except Exception as e:
            logger.error(f"Error loading sync state: {e}")

    def _save_sync_state(self):
        """Save the current sync state to disk."""
        try:
            state = {
                'last_sync_time': self.last_sync_time,
                'last_message_id': self.last_message_id
            }
            state_file = self._get_state_file_path()
            with open(state_file, 'wb') as f:
                pickle.dump(state, f)
            logger.info(f"Saved sync state: {state}")
        except Exception as e:
            logger.error(f"Error saving sync state: {e}")

    def _sync_messages(self):
        """
        Sync new messages from Zulip to ChromaDB.

        This method fetches new messages from the Zulip database that haven't been
        synchronized yet and adds them to ChromaDB.
        """
        try:
            # Set default sync time if not set yet
            if not self.last_sync_time:
                # Start with messages from the last 7 days if no previous sync
                self.last_sync_time = datetime.now() - timedelta(days=7)

            # Get messages newer than the last sync time
            logger.info(f"Fetching messages since {self.last_sync_time} or ID > {self.last_message_id}")

            # Get new messages
            messages = []
            if self.last_message_id:
                # Get messages with ID greater than the last processed message ID
                messages = ZulipDatabaseService.get_messages_newer_than_id(self.last_message_id, limit=self.batch_size)
            else:
                # Get messages from IT channels since the last sync time
                messages = ZulipDatabaseService.get_messages_from_it_channels(
                    since=self.last_sync_time,
                    limit=self.batch_size
                )

            if not messages:
                logger.info("No new messages found to sync")
                return

            logger.info(f"Found {len(messages)} new messages to sync")

            # Add messages to ChromaDB
            synced_count = 0
            already_exists_count = 0
            highest_message_id = self.last_message_id or 0

            # Get a list of unique message IDs
            unique_message_ids = set(message.id for message in messages)
            logger.info(f"Found {len(unique_message_ids)} unique message IDs out of {len(messages)} messages")

            for message in messages:
                message_id = message.id

                # Update highest message ID seen
                if message_id > highest_message_id:
                    highest_message_id = message_id

                channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
                sender_name = ZulipDatabaseService.get_sender_name_for_message(message)

                # Check if this message already exists in ChromaDB to avoid duplicates
                if ChromaDBService.message_exists(message_id):
                    already_exists_count += 1
                    logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
                    continue

                # Add the message to ChromaDB
                success = ChromaDBService.add_message(
                    message_id=message_id,
                    content=message.content,
                    channel_name=channel_name,
                    subject=message.subject,
                    sender_name=sender_name,
                    date_sent=message.date_sent
                )

                if success:
                    synced_count += 1
                else:
                    logger.warning(f"Failed to add message {message_id} to ChromaDB")

            # Update the last sync time and message ID
            self.last_sync_time = datetime.now()
            if highest_message_id > (self.last_message_id or 0):
                self.last_message_id = highest_message_id

            # Save the sync state
            self._save_sync_state()

            logger.info(f"Sync completed. Added {synced_count} new messages to ChromaDB. Skipped {already_exists_count} existing messages. Last message ID: {self.last_message_id}")

        except Exception as e:
            logger.error(f"Error syncing messages: {e}")

    def _sync_loop(self):
        """Main sync loop."""
        while self.is_running:
            try:
                self._sync_messages()
                # Sleep for the specified interval
                for _ in range(self.sync_interval):
                    if not self.is_running:
                        break
                    time.sleep(1)
            except Exception as e:
                logger.error(f"Error in sync loop: {e}")
                # Sleep a bit before retrying to avoid tight error loops
                time.sleep(5)

    def start(self):
        """Start the message sync service."""
        if self.is_running:
            logger.warning("Sync service is already running")
            return

        logger.info(f"Starting message sync service with interval {self.sync_interval} seconds")
        self.is_running = True
        self.sync_thread = threading.Thread(target=self._sync_loop)
        self.sync_thread.daemon = True
        self.sync_thread.start()

    def stop(self):
        """Stop the message sync service."""
        if not self.is_running:
            logger.warning("Sync service is not running")
            return

        logger.info("Stopping message sync service")
        self.is_running = False
        if self.sync_thread:
            self.sync_thread.join(timeout=10)
        logger.info("Sync service stopped")

    def sync_now(self):
        """Manually trigger a sync operation."""
        logger.info("Manual sync triggered")
        self._sync_messages()
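# --- Illustrative usage sketch (not part of the original file) ---
# How a background sync service like the one above is typically wired into the
# Flask app. The class name `MessageSyncService` and the import path below are
# assumptions; only start(), stop() and sync_now() are taken from the code itself.
#
#     from app import create_app
#     from app.services.sync_service import MessageSyncService  # hypothetical path
#
#     app = create_app()
#     with app.app_context():
#         service = MessageSyncService()
#         service.start()     # spawns the daemon thread running _sync_loop()
#         service.sync_now()  # force an immediate sync between intervals
#         service.stop()      # sets is_running = False and joins the thread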
BIN
chromadb/bm25_index.pkl
Normal file
BIN
chromadb/bm25_index.pkl
Normal file
Binary file not shown.
BIN
chromadb/chroma.sqlite3
Normal file
BIN
chromadb/chroma.sqlite3
Normal file
Binary file not shown.
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/data_level0.bin
Normal file
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/data_level0.bin
Normal file
Binary file not shown.
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/header.bin
Normal file
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/header.bin
Normal file
Binary file not shown.
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/length.bin
Normal file
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/length.bin
Normal file
Binary file not shown.
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/link_lists.bin
Normal file
BIN
chromadb/dfdf3cf3-41c2-4f8b-8e2f-0e411dccc78d/link_lists.bin
Normal file
Binary file not shown.
141
compare_all_messages.py
Executable file
141
compare_all_messages.py
Executable file
@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Simple script to compare ALL messages in Zulip to ChromaDB with no restrictions.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("compare_all_messages")
|
||||
|
||||
# Add the current directory to the path so we can import the app module
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Apply NumPy compatibility patch for ChromaDB
|
||||
from app.utils import patch_chromadb_numpy
|
||||
patch_chromadb_numpy()
|
||||
|
||||
from app import create_app
|
||||
from app.db import get_chroma_collection, get_db_session
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
from app.models.zulip import Message
|
||||
|
||||
def main():
|
||||
"""Main function to compare Zulip messages with ChromaDB entries."""
|
||||
logger.info("Starting simple comparison of ALL messages")
|
||||
|
||||
# Create the Flask app (needed for context)
|
||||
app = create_app()
|
||||
|
||||
with app.app_context():
|
||||
print("\n====================================================")
|
||||
print("COMPARING ALL ZULIP MESSAGES WITH CHROMADB")
|
||||
print(f"Started at: {datetime.now()}")
|
||||
print("====================================================\n")
|
||||
|
||||
try:
|
||||
# Get Zulip DB session
|
||||
session = get_db_session()
|
||||
|
||||
# Get ALL messages from Zulip
|
||||
print("Fetching all messages from Zulip...")
|
||||
zulip_messages = session.query(Message).all()
|
||||
zulip_ids = set(str(msg.id) for msg in zulip_messages)
|
||||
|
||||
# Get channel counts
|
||||
channel_counts = defaultdict(int)
|
||||
for message in zulip_messages:
|
||||
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
|
||||
if channel_name is None:
|
||||
channel_name = "Unknown Channel"
|
||||
channel_counts[channel_name] += 1
|
||||
|
||||
# Print Zulip stats
|
||||
print(f"\nZulip has {len(zulip_messages)} total messages across {len(channel_counts)} channels")
|
||||
|
||||
# Get ChromaDB collection
|
||||
collection = get_chroma_collection()
|
||||
|
||||
if not collection:
|
||||
print("ERROR: Failed to get ChromaDB collection")
|
||||
return
|
||||
|
||||
# Get all entries from ChromaDB
|
||||
print("Fetching all entries from ChromaDB...")
|
||||
chroma_result = collection.get(include=['metadatas'])
|
||||
|
||||
if not chroma_result or 'ids' not in chroma_result or not chroma_result['ids']:
|
||||
print("No entries found in ChromaDB")
|
||||
return
|
||||
|
||||
# Get unique ChromaDB IDs
|
||||
chroma_ids = set(chroma_result['ids'])
|
||||
|
||||
# Get channel counts for ChromaDB
|
||||
chroma_channel_counts = defaultdict(int)
|
||||
for i, _ in enumerate(chroma_result['ids']):
|
||||
if chroma_result.get('metadatas') and len(chroma_result['metadatas']) > i:
|
||||
metadata = chroma_result['metadatas'][i]
|
||||
channel = metadata.get('channel', 'Unknown')
|
||||
chroma_channel_counts[channel] += 1
|
||||
|
||||
# Print ChromaDB stats
|
||||
print(f"ChromaDB has {len(chroma_result['ids'])} total entries")
|
||||
print(f"ChromaDB has {len(chroma_ids)} unique entries")
|
||||
|
||||
# Calculate missing and extra
|
||||
missing_from_chromadb = zulip_ids - chroma_ids
|
||||
extra_in_chromadb = chroma_ids - zulip_ids
|
||||
|
||||
# Calculate overall sync percentage
|
||||
sync_percentage = (len(chroma_ids) / len(zulip_ids) * 100) if zulip_ids else 0
|
||||
|
||||
# Print comparison results
|
||||
print("\n====================================================")
|
||||
print("COMPARISON RESULTS")
|
||||
print("====================================================")
|
||||
print(f"Zulip total messages: {len(zulip_messages)}")
|
||||
print(f"ChromaDB total entries: {len(chroma_result['ids'])}")
|
||||
print(f"ChromaDB unique entries: {len(chroma_ids)}")
|
||||
print(f"Sync percentage: {sync_percentage:.2f}%")
|
||||
print(f"Messages in Zulip but not in ChromaDB: {len(missing_from_chromadb)}")
|
||||
print(f"Entries in ChromaDB not in Zulip: {len(extra_in_chromadb)}")
|
||||
|
||||
# Print channel comparison
|
||||
print("\nCHANNEL COMPARISON:")
|
||||
print("-" * 70)
|
||||
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'%':<10}")
|
||||
print("-" * 70)
|
||||
|
||||
all_channels = sorted(set(channel_counts.keys()) | set(chroma_channel_counts.keys()))
|
||||
for channel in all_channels:
|
||||
zulip_count = channel_counts.get(channel, 0)
|
||||
chroma_count = chroma_channel_counts.get(channel, 0)
|
||||
diff = zulip_count - chroma_count
|
||||
percentage = (chroma_count / zulip_count * 100) if zulip_count > 0 else 0
|
||||
print(f"{channel[:25]:<25} {zulip_count:<10} {chroma_count:<10} {diff:<10} {percentage:.2f}%")
|
||||
|
||||
# Print recommendations
|
||||
print("\n====================================================")
|
||||
print("RECOMMENDATIONS")
|
||||
print("====================================================")
|
||||
if sync_percentage < 100:
|
||||
print("- Run ./sync_all_messages.py to sync missing messages")
|
||||
else:
|
||||
print("- All messages are synced!")
|
||||
|
||||
print(f"\nComparison completed at: {datetime.now()}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during comparison: {e}")
|
||||
logger.error(f"Error during comparison: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
333
compare_messages.py
Executable file
333
compare_messages.py
Executable file
@ -0,0 +1,333 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to compare the number of messages in Zulip channels to ChromaDB.
|
||||
|
||||
This script will gather statistics on message counts from both Zulip DB and ChromaDB,
|
||||
then generate a report showing discrepancies between the two.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from collections import defaultdict, Counter
|
||||
from datetime import datetime, timedelta
|
||||
import argparse
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("compare_messages")
|
||||
|
||||
# Add the current directory to the path so we can import the app module
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Apply NumPy compatibility patch for ChromaDB
|
||||
from app.utils import patch_chromadb_numpy
|
||||
patch_chromadb_numpy()
|
||||
|
||||
from app import create_app
|
||||
from app.db import get_chroma_collection, get_db_session
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
from app.models.zulip import Message, Stream, Recipient, UserProfile
|
||||
from sqlalchemy import and_, not_, or_
|
||||
from app.config import Config
|
||||
|
||||
def get_excluded_user_ids():
|
||||
"""Get the user IDs of IT_Bot and ai_bot."""
|
||||
session = get_db_session()
|
||||
excluded_users = session.query(UserProfile).filter(
|
||||
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
|
||||
).all()
|
||||
|
||||
excluded_user_ids = [user.id for user in excluded_users]
|
||||
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
|
||||
return excluded_user_ids
|
||||
|
||||
def get_sandbox_recipient_id():
|
||||
"""Get the recipient ID for the sandbox channel."""
|
||||
session = get_db_session()
|
||||
sandbox_stream = session.query(Stream).filter(
|
||||
Stream.name == 'sandbox'
|
||||
).first()
|
||||
|
||||
if sandbox_stream:
|
||||
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
|
||||
return sandbox_stream.recipient_id
|
||||
else:
|
||||
logger.warning("Sandbox channel not found")
|
||||
return None
|
||||
|
||||
def get_zulip_message_counts(days=30):
|
||||
"""
|
||||
Get message counts from Zulip database for all channels except sandbox,
|
||||
also excluding IT_Bot and ai_bot messages.
|
||||
|
||||
Args:
|
||||
days: Number of days to look back
|
||||
|
||||
Returns:
|
||||
dict: Channel name to message count mapping
|
||||
"""
|
||||
logger.info(f"Getting message counts from Zulip DB for the last {days} days")
|
||||
|
||||
try:
|
||||
session = get_db_session()
|
||||
# Get excluded user IDs (IT_Bot and ai_bot)
|
||||
excluded_user_ids = get_excluded_user_ids()
|
||||
|
||||
# Get sandbox recipient ID to exclude
|
||||
sandbox_recipient_id = get_sandbox_recipient_id()
|
||||
|
||||
# Build filters
|
||||
since_date = datetime.now() - timedelta(days=days)
|
||||
filters = [Message.date_sent >= since_date]
|
||||
|
||||
# Add filter for excluded users
|
||||
if excluded_user_ids:
|
||||
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
|
||||
|
||||
# Add filter for excluded recipient (sandbox)
|
||||
if sandbox_recipient_id:
|
||||
filters.append(Message.recipient_id != sandbox_recipient_id)
|
||||
|
||||
# Get all messages
|
||||
messages = session.query(Message).filter(and_(*filters)).all()
|
||||
|
||||
# Get all channels except sandbox
|
||||
streams = session.query(Stream).filter(
|
||||
Stream.deactivated == False
|
||||
).all()
|
||||
|
||||
# Filter out sandbox
|
||||
included_streams = [stream for stream in streams
|
||||
if stream.recipient_id != sandbox_recipient_id]
|
||||
|
||||
# Print the list of channels being analyzed
|
||||
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
|
||||
channels.sort(key=lambda x: x[0])
|
||||
logger.info(f"Analyzing messages from {len(channels)} channels:")
|
||||
for channel_name, recipient_id in channels:
|
||||
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
|
||||
|
||||
# Count messages by channel
|
||||
channel_counts = defaultdict(int)
|
||||
message_ids = set()
|
||||
|
||||
for message in messages:
|
||||
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
|
||||
if channel_name and channel_name != "sandbox":
|
||||
channel_counts[channel_name] += 1
|
||||
message_ids.add(str(message.id)) # Convert to string for comparison with ChromaDB
|
||||
|
||||
# Print the message counts by channel
|
||||
logger.info(f"Message counts by channel:")
|
||||
for channel, count in sorted(channel_counts.items()):
|
||||
logger.info(f"- {channel}: {count} messages")
|
||||
|
||||
return {
|
||||
'channel_counts': dict(channel_counts),
|
||||
'total_count': len(messages),
|
||||
'unique_count': len(message_ids),
|
||||
'message_ids': message_ids
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting Zulip message counts: {e}")
|
||||
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
|
||||
|
||||
def get_chromadb_message_counts():
|
||||
"""
|
||||
Get message counts from ChromaDB.
|
||||
|
||||
Returns:
|
||||
dict: Statistics about ChromaDB messages
|
||||
"""
|
||||
logger.info("Getting message counts from ChromaDB")
|
||||
|
||||
try:
|
||||
collection = get_chroma_collection()
|
||||
|
||||
if not collection:
|
||||
logger.error("Failed to get ChromaDB collection")
|
||||
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
|
||||
|
||||
# Get all entries
|
||||
result = collection.get(include=['metadatas'])
|
||||
|
||||
if not result or 'ids' not in result or not result['ids']:
|
||||
logger.info("No entries found in ChromaDB")
|
||||
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
|
||||
|
||||
# Count messages by channel
|
||||
channel_counts = defaultdict(int)
|
||||
message_ids = set()
|
||||
|
||||
for i, message_id in enumerate(result['ids']):
|
||||
# Extract channel from metadata
|
||||
if result.get('metadatas') and len(result['metadatas']) > i:
|
||||
metadata = result['metadatas'][i]
|
||||
channel = metadata.get('channel', 'Unknown')
|
||||
if channel != "sandbox":
|
||||
channel_counts[channel] += 1
|
||||
|
||||
# Add to message_ids set
|
||||
message_ids.add(message_id)
|
||||
|
||||
# Count duplicates
|
||||
id_counts = Counter(result['ids'])
|
||||
duplicates = {message_id: count for message_id, count in id_counts.items() if count > 1}
|
||||
|
||||
# Print the message counts by channel
|
||||
logger.info(f"ChromaDB message counts by channel:")
|
||||
for channel, count in sorted(channel_counts.items()):
|
||||
logger.info(f"- {channel}: {count} messages")
|
||||
|
||||
return {
|
||||
'channel_counts': dict(channel_counts),
|
||||
'total_count': len(result['ids']),
|
||||
'unique_count': len(message_ids),
|
||||
'message_ids': message_ids,
|
||||
'duplicate_count': len(duplicates),
|
||||
'duplicates': duplicates
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting ChromaDB message counts: {e}")
|
||||
return {'channel_counts': {}, 'total_count': 0, 'unique_count': 0, 'message_ids': set()}
|
||||
|
||||
def compare_counts(zulip_counts, chromadb_counts, days):
|
||||
"""
|
||||
Compare message counts between Zulip and ChromaDB.
|
||||
|
||||
Args:
|
||||
zulip_counts: Counts from Zulip DB
|
||||
chromadb_counts: Counts from ChromaDB
|
||||
days: Number of days looked back
|
||||
|
||||
Returns:
|
||||
dict: Comparison statistics
|
||||
"""
|
||||
logger.info("Comparing message counts")
|
||||
|
||||
# Get message IDs in Zulip but not in ChromaDB
|
||||
zulip_ids = set(zulip_counts['message_ids'])
|
||||
chroma_ids = set(chromadb_counts['message_ids'])
|
||||
|
||||
# Convert all IDs to strings for comparison
|
||||
zulip_ids = {str(id) for id in zulip_ids}
|
||||
chroma_ids = {str(id) for id in chroma_ids}
|
||||
|
||||
missing_from_chromadb = zulip_ids - chroma_ids
|
||||
|
||||
# Get message IDs in ChromaDB but not in Zulip (within the timeframe)
|
||||
extra_in_chromadb = chroma_ids - zulip_ids
|
||||
|
||||
# Channel comparison
|
||||
channel_comparison = {}
|
||||
all_channels = set(zulip_counts['channel_counts'].keys()) | set(chromadb_counts['channel_counts'].keys())
|
||||
|
||||
for channel in all_channels:
|
||||
zulip_count = zulip_counts['channel_counts'].get(channel, 0)
|
||||
chromadb_count = chromadb_counts['channel_counts'].get(channel, 0)
|
||||
difference = zulip_count - chromadb_count
|
||||
|
||||
channel_comparison[channel] = {
|
||||
'zulip_count': zulip_count,
|
||||
'chromadb_count': chromadb_count,
|
||||
'difference': difference,
|
||||
'percentage': (chromadb_count / zulip_count * 100) if zulip_count > 0 else 0
|
||||
}
|
||||
|
||||
return {
|
||||
'channel_comparison': channel_comparison,
|
||||
'missing_from_chromadb': missing_from_chromadb,
|
||||
'missing_count': len(missing_from_chromadb),
|
||||
'extra_in_chromadb': extra_in_chromadb,
|
||||
'extra_count': len(extra_in_chromadb),
|
||||
'zulip_total': zulip_counts['total_count'],
|
||||
'chromadb_total': chromadb_counts['total_count'],
|
||||
'zulip_unique': zulip_counts['unique_count'],
|
||||
'chromadb_unique': chromadb_counts['unique_count'],
|
||||
'duplicate_count': chromadb_counts.get('duplicate_count', 0),
|
||||
'days': days
|
||||
}
|
||||
|
||||
def print_comparison_report(comparison):
|
||||
"""
|
||||
Print a report of the comparison.
|
||||
|
||||
Args:
|
||||
comparison: Comparison statistics
|
||||
"""
|
||||
print("\n" + "=" * 80)
|
||||
print(f"ZULIP TO CHROMADB COMPARISON REPORT (Last {comparison['days']} days)")
|
||||
print("=" * 80)
|
||||
|
||||
print("\nSUMMARY:")
|
||||
print(f"Zulip total messages: {comparison['zulip_total']}")
|
||||
print(f"Zulip unique messages: {comparison['zulip_unique']}")
|
||||
print(f"ChromaDB total entries: {comparison['chromadb_total']}")
|
||||
print(f"ChromaDB unique entries: {comparison['chromadb_unique']}")
|
||||
print(f"Duplicate entries in ChromaDB: {comparison['duplicate_count']}")
|
||||
|
||||
sync_percentage = (comparison['chromadb_unique'] / comparison['zulip_unique'] * 100) if comparison['zulip_unique'] > 0 else 0
|
||||
print(f"Overall sync rate: {sync_percentage:.2f}%")
|
||||
|
||||
print(f"Messages in Zulip but missing from ChromaDB: {comparison['missing_count']}")
|
||||
print(f"Entries in ChromaDB not found in recent Zulip data: {comparison['extra_count']}")
|
||||
|
||||
print("\nCHANNEL BREAKDOWN:")
|
||||
print("-" * 80)
|
||||
print(f"{'Channel':<25} {'Zulip':<10} {'ChromaDB':<10} {'Diff':<10} {'Sync %':<10}")
|
||||
print("-" * 80)
|
||||
|
||||
for channel, stats in sorted(comparison['channel_comparison'].items()):
|
||||
print(f"{channel:<25} {stats['zulip_count']:<10} {stats['chromadb_count']:<10} {stats['difference']:<10} {stats['percentage']:.2f}%")
|
||||
|
||||
if comparison['missing_count'] > 0:
|
||||
print("\nMISSING MESSAGE IDS (Sample):")
|
||||
print(", ".join(str(mid) for mid in list(comparison['missing_from_chromadb'])[:10]))
|
||||
|
||||
if comparison['duplicate_count'] > 0:
|
||||
print("\nDUPLICATE ENTRIES DETECTED")
|
||||
print(f"Total messages with duplicates: {comparison['duplicate_count']}")
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("RECOMMENDATIONS:")
|
||||
|
||||
if comparison['duplicate_count'] > 0:
|
||||
print("- Run ./fix_duplicate_entries.py to remove duplicate entries")
|
||||
|
||||
if comparison['missing_count'] > 0:
|
||||
print("- Run python sync_all_channels.py --force --days {0} to sync missing messages".format(comparison['days']))
|
||||
|
||||
if sync_percentage < 95:
|
||||
print("- Investigate sync service settings and DB connection issues")
|
||||
|
||||
print("=" * 80 + "\n")
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(description="Compare Zulip channel messages to ChromaDB entries")
|
||||
parser.add_argument("--days", type=int, default=30, help="Number of days to look back in Zulip history")
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.info("Starting message comparison")
|
||||
|
||||
# Create the Flask app (needed for context)
|
||||
app = create_app()
|
||||
|
||||
with app.app_context():
|
||||
# Get message counts
|
||||
zulip_counts = get_zulip_message_counts(days=args.days)
|
||||
chromadb_counts = get_chromadb_message_counts()
|
||||
|
||||
# Compare counts
|
||||
comparison = compare_counts(zulip_counts, chromadb_counts, args.days)
|
||||
|
||||
# Print report
|
||||
print_comparison_report(comparison)
|
||||
|
||||
logger.info("Comparison completed")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
23
ecosystem.config.js
Normal file
23
ecosystem.config.js
Normal file
@ -0,0 +1,23 @@
module.exports = {
  apps: [
    {
      name: 'zulip-bot',
      script: './run_app.sh',
      interpreter: '/bin/bash',
      instances: 1,
      autorestart: true,
      watch: false,
      max_memory_restart: '500M',
      env: {
        NODE_ENV: 'production',
        FLASK_APP: 'app',
        FLASK_RUN_PORT: 5100
      },
      log_date_format: 'YYYY-MM-DD HH:mm:ss',
      error_file: 'logs/zulip-bot-error.log',
      out_file: 'logs/zulip-bot-out.log',
      merge_logs: true,
      time: true
    }
  ]
};
5618
logs/zulip-bot-error.log
Normal file
5618
logs/zulip-bot-error.log
Normal file
File diff suppressed because it is too large
47
logs/zulip-bot-out.log
Normal file
47
logs/zulip-bot-out.log
Normal file
@ -0,0 +1,47 @@
|
||||
2025-05-14T17:36:16: Checking for processes on port 5100...
|
||||
2025-05-14T17:36:16: No process found on port 5100
|
||||
2025-05-14T17:36:16: Activating virtual environment...
|
||||
2025-05-14T17:36:16: Starting Flask app on port 5100...
|
||||
2025-05-14T17:36:17: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-14T17:36:17: * Serving Flask app 'app'
|
||||
2025-05-14T17:36:17: * Debug mode: on
|
||||
2025-05-14T17:38:41: Flask app stopped
|
||||
2025-05-14T17:38:41: Checking for processes on port 5100...
|
||||
2025-05-14T17:38:41: No process found on port 5100
|
||||
2025-05-14T17:38:41: Activating virtual environment...
|
||||
2025-05-14T17:38:41: Starting Flask app on port 5100...
|
||||
2025-05-14T17:38:42: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-14T17:38:42: * Serving Flask app 'app'
|
||||
2025-05-14T17:38:42: * Debug mode: on
|
||||
2025-05-14T17:38:42: Flask app stopped
|
||||
2025-05-14T17:38:42: Checking for processes on port 5100...
|
||||
2025-05-14T17:38:42: Killing process 2093957 on port 5100
|
||||
2025-05-14T17:38:42: Activating virtual environment...
|
||||
2025-05-14T17:38:42: Starting Flask app on port 5100...
|
||||
2025-05-14T17:38:43: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-14T17:38:43: * Serving Flask app 'app'
|
||||
2025-05-14T17:38:43: * Debug mode: on
|
||||
2025-05-14T17:38:51: Flask app stopped
|
||||
2025-05-14T17:38:51: Checking for processes on port 5100...
|
||||
2025-05-14T17:38:51: No process found on port 5100
|
||||
2025-05-14T17:38:51: Activating virtual environment...
|
||||
2025-05-14T17:38:51: Starting Flask app on port 5100...
|
||||
2025-05-14T17:38:52: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-14T17:38:52: * Serving Flask app 'app'
|
||||
2025-05-14T17:38:52: * Debug mode: on
|
||||
2025-05-15T09:29:44: Flask app stopped
|
||||
2025-05-15T09:29:44: Checking for processes on port 5100...
|
||||
2025-05-15T09:29:44: No process found on port 5100
|
||||
2025-05-15T09:29:44: Activating virtual environment...
|
||||
2025-05-15T09:29:44: Starting Flask app on port 5100...
|
||||
2025-05-15T09:29:45: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-15T09:29:45: * Serving Flask app 'app'
|
||||
2025-05-15T09:29:45: * Debug mode: on
|
||||
2025-05-15T09:29:46: Flask app stopped
|
||||
2025-05-15T09:29:46: Checking for processes on port 5100...
|
||||
2025-05-15T09:29:46: No process found on port 5100
|
||||
2025-05-15T09:29:46: Activating virtual environment...
|
||||
2025-05-15T09:29:46: Starting Flask app on port 5100...
|
||||
2025-05-15T09:29:47: NumPy compatibility patch applied for ChromaDB
|
||||
2025-05-15T09:29:47: * Serving Flask app 'app'
|
||||
2025-05-15T09:29:47: * Debug mode: on
|
||||
38
pm2_start.sh
Executable file
38
pm2_start.sh
Executable file
@ -0,0 +1,38 @@
#!/bin/bash

# Create logs directory if it doesn't exist
mkdir -p logs

# Make sure the run_app.sh script is executable
chmod +x run_app.sh

# Check if PM2 is installed
if ! command -v pm2 &> /dev/null; then
    echo "PM2 is not installed. Installing..."
    npm install -g pm2
fi

# Start the application with PM2
echo "Starting Zulip Bot service with PM2..."
pm2 start ecosystem.config.js

# Save the current PM2 configuration
echo "Saving PM2 configuration..."
pm2 save

# Configure PM2 to start on boot (may require sudo)
echo "Setting up PM2 to start on system boot..."
if sudo pm2 startup | grep -q "sudo"; then
    # If the output contains a sudo command, extract it and show it so the user can run it
    sudo_cmd=$(sudo pm2 startup | grep "sudo" | tail -n 1)
    echo "Run the following command with sudo privileges to enable PM2 on startup:"
    echo "$sudo_cmd"
else
    echo "PM2 startup configuration completed."
fi

echo "PM2 service setup complete. Zulip Bot is now running as a service."
echo "To check status: pm2 status"
echo "To view logs: pm2 logs zulip-bot"
echo "To restart: pm2 restart zulip-bot"
echo "To stop: pm2 stop zulip-bot"
81
project_config.md
Normal file
81
project_config.md
Normal file
@ -0,0 +1,81 @@
# Project Configuration (LTM)

*This file contains the stable, long-term context for the project.*
*It should be updated infrequently, primarily when core goals, tech, or patterns change.*

---

## Core Goal

Develop a Python-based Flask application that integrates with Zulip to:

* Connect to a Zulip PostgreSQL database.
* Retrieve messages from the specified channels: **IT Discussions, IT Knowledge, IT Support**.
* Embed these messages into ChromaDB for efficient retrieval.
* Implement a Zulip bot named **IT\_Bot** that responds to user queries when mentioned using the format `@**IT_Bot**`.
* Generate context-based responses using the Gemini API.

---

## Tech Stack

* **Backend:** Python, Flask
* **Database:** PostgreSQL (Zulip DB), ChromaDB
* **AI Integration:** Gemini API
* **Bot Framework:** Zulip Bot API
* **Environment Management:** Virtualenv or Conda
* **Version Control:** Git

---

## Critical Patterns & Conventions

* **Database Access:**
  * Store database credentials securely (e.g., environment variables or a secrets manager).
  * Use SQLAlchemy ORM for structured queries.

* **Message Retrieval:**
  * Implement periodic tasks to pull messages from the channels.
  * Ensure idempotent operations to prevent duplicates in ChromaDB (see the sketch after this list).

* **Embedding Strategy:**
  * Embed messages with metadata (e.g., channel name, timestamp, user ID).

* **Bot Activation:**
  * The bot listens for `@**IT_Bot**` mentions.
  * Upon activation, relevant context is fetched from ChromaDB, and a response is generated using the Gemini API.

* **Error Handling:**
  * Implement structured logging.
  * Gracefully handle API rate limits and database connection errors.

* **Security:**
  * Store credentials and API keys in environment variables.
  * Implement rate limiting to prevent abuse.
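A minimal sketch of the idempotent sync pattern described above, using the `ZulipDatabaseService` and `ChromaDBService` helpers introduced in this commit. The helper name `embed_new_messages` is illustrative only, and the snippet assumes it runs inside an active Flask app context; scheduling and error handling live in the sync service itself.

```
from app.db.zulip_service import ZulipDatabaseService
from app.db.chroma_service import ChromaDBService

def embed_new_messages(last_message_id, batch_size=200):
    """Add unseen messages to ChromaDB; skipping existing IDs keeps re-runs idempotent."""
    messages = ZulipDatabaseService.get_messages_newer_than_id(last_message_id, limit=batch_size)
    for message in messages:
        if ChromaDBService.message_exists(message.id):
            continue  # already embedded, safe to re-run
        ChromaDBService.add_message(
            message_id=message.id,
            content=message.content,
            channel_name=ZulipDatabaseService.get_channel_name_for_message(message),
            subject=message.subject,
            sender_name=ZulipDatabaseService.get_sender_name_for_message(message),
            date_sent=message.date_sent,
        )
        last_message_id = max(last_message_id, message.id)
    return last_message_id
```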
---

## Key Constraints

* **Channels Monitored:** IT Discussions, IT Knowledge, IT Support
* **Response Trigger:** Mentions of `@**IT_Bot**`
* **Language Support:** English, Georgian
* **Message Volume:** Approximately 500 messages per day.
* **Deployment:** Local network server
* **Zulip Bot Config:**

  ```
  [api]
  email=IT_bot-bot@zulip.lci.ge
  key=ta8x0Rwlf5yLlZutETiTZbHFtQMVOv1z
  site=https://zulip.lci.ge
  ```

* **Database Connection:** `zulip:BlackMoonSky89@zulip.lci.ge:5432/zulip`
* **Gemini API Key:** `AIzaSyD_VYKUcleCUkAxZj1sX3pWLHvGk0HDe9s`
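A minimal sketch of initializing the bot client from the constraints above, following the "store credentials and API keys in environment variables" convention. The environment variable names are illustrative; `zulip.Client` comes from the `zulip` package listed in `requirements.txt`.

```
import os
import zulip

client = zulip.Client(
    email=os.environ["ZULIP_BOT_EMAIL"],      # e.g. the bot email from the config above
    api_key=os.environ["ZULIP_BOT_API_KEY"],  # never hard-code the key in source
    site=os.environ["ZULIP_SITE"],
)
```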
14
requirements.txt
Normal file
14
requirements.txt
Normal file
@ -0,0 +1,14 @@
Flask==2.2.3
Werkzeug==2.2.3
SQLAlchemy==2.0.9
psycopg2-binary==2.9.6
python-dotenv==1.0.0
chromadb==0.4.6
zulip==0.8.2
google-generativeai==0.3.1
ollama==0.1.5
nomic==2.0.3
cohere==5.15.0
rank-bm25==0.2.2
nltk==3.8.1
openai==1.30.4
87
reset_chromadb.py
Executable file
87
reset_chromadb.py
Executable file
@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""
Script to reset the ChromaDB completely and properly.
This fixes issues with the vector database that cause "Add of existing embedding ID" warnings.
"""

import os
import shutil
import logging
import chromadb
from chromadb.utils import embedding_functions
from app.utils.embeddings import EmbeddingService

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("reset_chromadb")

def main():
    """Main function to reset ChromaDB."""
    try:
        # Default ChromaDB path used in the application
        chromadb_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb")

        logger.info(f"Preparing to reset ChromaDB at {chromadb_path}")

        # First check if the directory exists
        if not os.path.exists(chromadb_path):
            logger.info("ChromaDB directory doesn't exist yet. Creating a fresh one.")
            os.makedirs(chromadb_path, exist_ok=True)
            logger.info("ChromaDB directory created successfully.")
            return

        # Backup the existing ChromaDB directory
        backup_path = f"{chromadb_path}_backup"
        logger.info(f"Creating backup of ChromaDB at {backup_path}")

        # Remove old backup if it exists
        if os.path.exists(backup_path):
            logger.info("Removing old backup")
            shutil.rmtree(backup_path)

        # Create backup
        shutil.copytree(chromadb_path, backup_path)
        logger.info("Backup created successfully")

        # Delete the ChromaDB directory
        logger.info("Removing existing ChromaDB directory")
        shutil.rmtree(chromadb_path)

        # Create fresh ChromaDB
        logger.info("Creating fresh ChromaDB")
        os.makedirs(chromadb_path, exist_ok=True)

        # Initialize a fresh ChromaDB client and create a new collection
        logger.info("Initializing fresh ChromaDB client")
        client = chromadb.PersistentClient(
            path=chromadb_path,
            settings=chromadb.Settings(
                allow_reset=True,
                anonymized_telemetry=False
            )
        )

        # Create a custom embedding function
        class CustomEmbeddingFunction(embedding_functions.EmbeddingFunction):
            def __call__(self, texts):
                return EmbeddingService.get_ollama_embeddings(texts)

        # Create a fresh collection
        logger.info("Creating fresh collection")
        collection = client.create_collection(
            name="zulip_messages",
            metadata={
                "hnsw:space": "cosine"
            },
            embedding_function=CustomEmbeddingFunction()
        )

        logger.info("ChromaDB reset completed successfully")
        logger.info(f"To restore the backup if needed, delete {chromadb_path} and rename {backup_path} to {chromadb_path}")

    except Exception as e:
        logger.error(f"Error resetting ChromaDB: {e}")
        logger.error("ChromaDB reset failed. Please check the error and try again.")

if __name__ == "__main__":
    main()
26
run_app.sh
Executable file
26
run_app.sh
Executable file
@ -0,0 +1,26 @@
#!/bin/bash

# Kill any process using port 5100
echo "Checking for processes on port 5100..."
pid=$(lsof -ti:5100)
if [ -n "$pid" ]; then
    echo "Killing process $pid on port 5100"
    kill -9 $pid
else
    echo "No process found on port 5100"
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Set Flask environment variables
export FLASK_APP=app
export FLASK_RUN_PORT=5100

# Run the Flask app
echo "Starting Flask app on port 5100..."
flask run --port=5100 --no-reload

# This point is only reached once the Flask app exits or is interrupted
echo "Flask app stopped"
13
setup.sh
Executable file
13
setup.sh
Executable file
@ -0,0 +1,13 @@
#!/bin/bash

# Create a virtual environment
python3.11 -m venv venv

# Activate the virtual environment
source venv/bin/activate

# Install the required packages
pip install -r requirements.txt

echo "Setup completed successfully!"
echo "To activate the virtual environment, run: source venv/bin/activate"
580
sync_all_channels.py
Executable file
580
sync_all_channels.py
Executable file
@ -0,0 +1,580 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to sync messages from all Zulip channels (except sandbox) to ChromaDB.
|
||||
This script also excludes messages from IT_Bot and ai_bot users.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
import pickle
|
||||
|
||||
# Add the current directory to the path so we can import the app module
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Apply NumPy compatibility patch for ChromaDB
|
||||
from app.utils import patch_chromadb_numpy
|
||||
patch_chromadb_numpy()
|
||||
|
||||
from app import create_app
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
from app.db.chroma_service import ChromaDBService
|
||||
from app.models.zulip import Message, Stream, Recipient, UserProfile
|
||||
from sqlalchemy import and_, not_, or_
|
||||
from app.db import get_db_session
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("sync_all_channels")
|
||||
|
||||
# Global flag for graceful shutdown
|
||||
is_shutting_down = False
|
||||
|
||||
# Signal handler for CTRL+C
|
||||
def signal_handler(sig, frame):
|
||||
global is_shutting_down
|
||||
logger.info("Received shutdown signal, completing current operation before exiting...")
|
||||
is_shutting_down = True
|
||||
|
||||
# Register signal handler
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
class AllChannelSyncService:
|
||||
"""Service for syncing messages from all channels except sandbox."""
|
||||
|
||||
# File to store the last synced message ID
|
||||
_SYNC_STATE_FILE = "all_channels_sync_state.pickle"
|
||||
|
||||
def __init__(self, batch_size=200, include_direct_messages=False):
|
||||
"""
|
||||
Initialize the sync service.
|
||||
|
||||
Args:
|
||||
batch_size (int): Number of messages to process in each batch
|
||||
include_direct_messages (bool): Whether to include direct messages
|
||||
"""
|
||||
self.batch_size = batch_size
|
||||
self.last_sync_time = None
|
||||
self.last_message_id = None
|
||||
self.state_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
self.channels_to_sync = []
|
||||
self.include_direct_messages = include_direct_messages
|
||||
|
||||
# Load the last synced state if available
|
||||
self._load_sync_state()
|
||||
|
||||
def _get_state_file_path(self):
|
||||
"""Get the full path to the sync state file."""
|
||||
return os.path.join(self.state_dir, self._SYNC_STATE_FILE)
|
||||
|
||||
def _load_sync_state(self):
|
||||
"""Load the last sync state from disk."""
|
||||
try:
|
||||
state_file = self._get_state_file_path()
|
||||
if os.path.exists(state_file):
|
||||
with open(state_file, 'rb') as f:
|
||||
state = pickle.load(f)
|
||||
self.last_sync_time = state.get('last_sync_time')
|
||||
self.last_message_id = state.get('last_message_id')
|
||||
logger.info(f"Loaded sync state: last_sync_time={self.last_sync_time}, last_message_id={self.last_message_id}")
|
||||
else:
|
||||
logger.info("No previous sync state found, starting fresh")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading sync state: {e}")
|
||||
|
||||
def _save_sync_state(self, channel_counts=None):
|
||||
"""Save the current sync state to disk."""
|
||||
try:
|
||||
state = {
|
||||
'last_sync_time': self.last_sync_time,
|
||||
'last_message_id': self.last_message_id
|
||||
}
|
||||
|
||||
if channel_counts:
|
||||
state['channel_counts'] = channel_counts
|
||||
|
||||
state_file = self._get_state_file_path()
|
||||
|
||||
# Save to a temporary file first, then rename to avoid corruption if interrupted
|
||||
temp_file = state_file + '.temp'
|
||||
with open(temp_file, 'wb') as f:
|
||||
pickle.dump(state, f)
|
||||
f.flush()
|
||||
os.fsync(f.fileno()) # Ensure data is written to disk
|
||||
|
||||
# Rename the temp file to the actual state file (atomic operation)
|
||||
os.rename(temp_file, state_file)
|
||||
|
||||
logger.info(f"Saved sync state: {state}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving sync state: {e}")
|
||||
|
||||
def get_excluded_user_ids(self):
|
||||
"""Get the user IDs of IT_Bot and ai_bot."""
|
||||
session = get_db_session()
|
||||
excluded_users = session.query(UserProfile).filter(
|
||||
UserProfile.full_name.in_(['IT_Bot', 'ai_bot'])
|
||||
).all()
|
||||
|
||||
excluded_user_ids = [user.id for user in excluded_users]
|
||||
logger.info(f"Excluding messages from users: {[u.full_name for u in excluded_users]} (IDs: {excluded_user_ids})")
|
||||
return excluded_user_ids
|
||||
|
||||
def get_sandbox_recipient_id(self):
|
||||
"""Get the recipient ID for the sandbox channel."""
|
||||
session = get_db_session()
|
||||
sandbox_stream = session.query(Stream).filter(
|
||||
Stream.name == 'sandbox'
|
||||
).first()
|
||||
|
||||
if sandbox_stream:
|
||||
logger.info(f"Excluding messages from sandbox channel (recipient_id={sandbox_stream.recipient_id})")
|
||||
return sandbox_stream.recipient_id
|
||||
else:
|
||||
logger.warning("Sandbox channel not found")
|
||||
return None
|
||||
|
||||
def get_channels_to_sync(self):
|
||||
"""Get all active channels except sandbox with their recipient IDs."""
|
||||
session = get_db_session()
|
||||
sandbox_recipient_id = self.get_sandbox_recipient_id()
|
||||
|
||||
# Get all active streams
|
||||
streams = session.query(Stream).filter(
|
||||
Stream.deactivated == False
|
||||
).all()
|
||||
|
||||
# Filter out sandbox
|
||||
included_streams = [stream for stream in streams
|
||||
if stream.recipient_id != sandbox_recipient_id]
|
||||
|
||||
# Create a list of channels to sync with their recipient IDs
|
||||
channels = [(stream.name, stream.recipient_id) for stream in included_streams]
|
||||
|
||||
# Sort by channel name
|
||||
channels.sort(key=lambda x: x[0])
|
||||
|
||||
# Print the list of channels
|
||||
logger.info(f"Found {len(channels)} channels to sync:")
|
||||
for channel_name, recipient_id in channels:
|
||||
logger.info(f"- {channel_name} (recipient_id={recipient_id})")
|
||||
|
||||
self.channels_to_sync = channels
|
||||
|
||||
# Return just the recipient IDs for filtering
|
||||
recipient_ids = [recipient_id for _, recipient_id in channels]
|
||||
return recipient_ids
|
||||
|
||||
def get_messages_newer_than_id(self, message_id, excluded_user_ids, excluded_recipient_id):
|
||||
"""Get messages with ID greater than the specified ID."""
|
||||
session = get_db_session()
|
||||
|
||||
# Build filters
|
||||
filters = [Message.id > message_id]
|
||||
|
||||
# Add filter for excluded users
|
||||
if excluded_user_ids:
|
||||
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
|
||||
|
||||
# Add filter for excluded recipient (sandbox)
|
||||
if excluded_recipient_id:
|
||||
filters.append(Message.recipient_id != excluded_recipient_id)
|
||||
|
||||
messages = session.query(Message).filter(
|
||||
and_(*filters)
|
||||
).order_by(Message.id.asc()).limit(self.batch_size).all()
|
||||
|
||||
return messages
|
||||
|
||||
def get_messages_for_timeframe(self, since, excluded_user_ids, excluded_recipient_id, limit=1000, all_messages=False):
|
||||
"""
|
||||
Get messages from the specified timeframe.
|
||||
|
||||
Args:
|
||||
since (datetime): Get messages after this datetime
|
||||
excluded_user_ids (list): User IDs to exclude
|
||||
excluded_recipient_id (int): Recipient ID to exclude
|
||||
limit (int): Maximum number of messages to return
|
||||
all_messages (bool): If True, ignore the since parameter and get all messages
|
||||
|
||||
Returns:
|
||||
list: List of Message objects
|
||||
"""
|
||||
session = get_db_session()
|
||||
|
||||
# Build filters
|
||||
filters = []
|
||||
|
||||
# Add date filter if specified and not getting all messages
|
||||
if since and not all_messages:
|
||||
filters.append(Message.date_sent >= since)
|
||||
|
||||
# Add filter for excluded users
|
||||
if excluded_user_ids:
|
||||
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
|
||||
|
||||
# Add filter for excluded recipient (sandbox)
|
||||
if excluded_recipient_id:
|
||||
filters.append(Message.recipient_id != excluded_recipient_id)
|
||||
|
||||
# Get results
|
||||
query = session.query(Message)
|
||||
if filters:
|
||||
query = query.filter(and_(*filters))
|
||||
|
||||
messages = query.order_by(Message.id.desc()).limit(limit).all()
|
||||
|
||||
return messages
|
||||
|
||||
def get_channel_message_counts(self, since, excluded_user_ids, excluded_recipient_id, all_messages=False):
|
||||
"""Get message counts by channel for the specified timeframe."""
|
||||
session = get_db_session()
|
||||
|
||||
# Build filters
|
||||
filters = []
|
||||
|
||||
# Add date filter if specified and not getting all messages
|
||||
if since and not all_messages:
|
||||
filters.append(Message.date_sent >= since)
|
||||
|
||||
# Add filter for excluded users
|
||||
if excluded_user_ids:
|
||||
filters.append(not_(Message.sender_id.in_(excluded_user_ids)))
|
||||
|
||||
# Add filter for excluded recipient (sandbox)
|
||||
if excluded_recipient_id:
|
||||
filters.append(Message.recipient_id != excluded_recipient_id)
|
||||
|
||||
# Get all messages
|
||||
query = session.query(Message)
|
||||
if filters:
|
||||
query = query.filter(and_(*filters))
|
||||
|
||||
messages = query.all()
|
||||
|
||||
# Count messages by channel
|
||||
channel_counts = {}
|
||||
for message in messages:
|
||||
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
|
||||
if channel_name:
|
||||
if channel_name not in channel_counts:
|
||||
channel_counts[channel_name] = 0
|
||||
channel_counts[channel_name] += 1
|
||||
|
||||
# Sort by channel name
|
||||
sorted_counts = {k: channel_counts[k] for k in sorted(channel_counts.keys())}
|
||||
|
||||
# Print the message counts by channel
|
||||
logger.info(f"Message counts by channel:")
|
||||
for channel, count in sorted_counts.items():
|
||||
logger.info(f"- {channel}: {count} messages")
|
||||
|
||||
return sorted_counts
|
||||
|
||||
def sync_messages(self, days=None, force=False, max_messages=5000, all_messages=False):
|
||||
"""
|
||||
Sync messages from all Zulip channels to ChromaDB.
|
||||
|
||||
Args:
|
||||
days (int): Number of days to look back for messages (default: use sync state)
|
||||
force (bool): Whether to force sync all messages from the lookback period
|
||||
max_messages (int): Maximum total number of messages to sync
|
||||
all_messages (bool): If True, ignore date filtering and sync all messages
|
||||
"""
|
||||
global is_shutting_down
|
||||
|
||||
try:
|
||||
# Get excluded user IDs (IT_Bot and ai_bot)
|
||||
excluded_user_ids = self.get_excluded_user_ids()
|
||||
|
||||
# Get sandbox recipient ID to exclude
|
||||
excluded_recipient_id = self.get_sandbox_recipient_id()
|
||||
|
||||
# Get all channels to sync and their recipient IDs
|
||||
self.get_channels_to_sync()
|
||||
|
||||
# Reset sync state if forced
|
||||
if force:
|
||||
if all_messages:
|
||||
self.last_sync_time = None
|
||||
self.last_message_id = None
|
||||
logger.info("Force syncing ALL messages regardless of date")
|
||||
elif days:
|
||||
self.last_sync_time = datetime.now() - timedelta(days=days)
|
||||
self.last_message_id = None
|
||||
logger.info(f"Force syncing messages from the last {days} days")
|
||||
|
||||
# Set default sync time if not set yet and not syncing all messages
|
||||
if not self.last_sync_time and not all_messages and not force:
|
||||
# Start with messages from the last 30 days if no previous sync
|
||||
self.last_sync_time = datetime.now() - timedelta(days=30 if not days else days)
|
||||
logger.info(f"No previous sync time, starting from {self.last_sync_time}")
|
||||
|
||||
# Count total messages to sync if forcing
|
||||
total_messages = 0
|
||||
if force:
|
||||
since_date = None if all_messages else (datetime.now() - timedelta(days=days if days else 30))
|
||||
all_messages_count = self.get_messages_for_timeframe(
|
||||
since=since_date,
|
||||
excluded_user_ids=excluded_user_ids,
|
||||
excluded_recipient_id=excluded_recipient_id,
|
||||
limit=max_messages,
|
||||
all_messages=all_messages
|
||||
)
|
||||
total_messages = len(all_messages_count)
|
||||
logger.info(f"Found a total of {total_messages} messages to sync")
|
||||
|
||||
# Get message counts by channel
|
||||
self.get_channel_message_counts(since_date, excluded_user_ids, excluded_recipient_id, all_messages=all_messages)
|
||||
|
||||
# Run multiple batches of sync
|
||||
total_synced = 0
|
||||
already_exists_count = 0
|
||||
highest_message_id = self.last_message_id or 0
|
||||
batch_count = 0
|
||||
|
||||
# Track synced messages by channel
|
||||
channel_sync_counts = {}
|
||||
|
||||
# Time to save state
|
||||
last_save_time = time.time()
|
||||
save_interval = 10 # Save state every 10 seconds
|
||||
|
||||
while not is_shutting_down:
|
||||
batch_count += 1
|
||||
logger.info(f"Running batch {batch_count}, synced {total_synced} messages so far")
|
||||
|
||||
# Get new messages
|
||||
messages = []
|
||||
if self.last_message_id:
|
||||
# Get messages with ID greater than the last processed message ID
|
||||
messages = self.get_messages_newer_than_id(
|
||||
self.last_message_id,
|
||||
excluded_user_ids,
|
||||
excluded_recipient_id
|
||||
)
|
||||
else:
|
||||
# Get messages since the last sync time or all messages
|
||||
messages = self.get_messages_for_timeframe(
|
||||
since=self.last_sync_time,
|
||||
excluded_user_ids=excluded_user_ids,
|
||||
excluded_recipient_id=excluded_recipient_id,
|
||||
limit=self.batch_size,
|
||||
all_messages=all_messages
|
||||
)
|
||||
|
||||
if not messages:
|
||||
logger.info("No new messages found to sync")
|
||||
break
|
||||
|
||||
logger.info(f"Found {len(messages)} new messages to sync in batch {batch_count}")
|
||||
|
||||
# Process each message
|
||||
synced_in_batch = 0
|
||||
for message in messages:
|
||||
# Check if we need to shutdown
|
||||
if is_shutting_down:
|
||||
logger.info("Shutdown requested, saving state and exiting...")
|
||||
break
|
||||
|
||||
message_id = message.id
|
||||
|
||||
# Update highest message ID seen
|
||||
if message_id > highest_message_id:
|
||||
highest_message_id = message_id
|
||||
|
||||
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
|
||||
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
|
||||
|
||||
# Skip excluded channels and users
|
||||
if channel_name == "sandbox":
|
||||
continue
|
||||
|
||||
if sender_name in ["IT_Bot", "ai_bot"]:
|
||||
continue
|
||||
|
||||
# Skip direct messages unless explicitly included
|
||||
if not self.include_direct_messages and channel_name in ["Direct Message", "Group Message"]:
|
||||
logger.debug(f"Skipping {channel_name} message {message_id} (use --include-direct-messages to include)")
|
||||
continue
|
||||
|
||||
# Check if this message already exists in ChromaDB to avoid duplicates
|
||||
if ChromaDBService.message_exists(message_id):
|
||||
already_exists_count += 1
|
||||
logger.debug(f"Message {message_id} already exists in ChromaDB, skipping")
|
||||
continue
|
||||
|
||||
# Handle None channel names
|
||||
if channel_name is None:
|
||||
channel_name = "Unknown Channel"
|
||||
logger.warning(f"Found message {message_id} with None channel name, using '{channel_name}' instead")
|
||||
|
||||
# Add the message to ChromaDB
|
||||
try:
|
||||
success = ChromaDBService.add_message(
|
||||
message_id=message_id,
|
||||
content=message.content,
|
||||
channel_name=channel_name,
|
||||
subject=message.subject,
|
||||
sender_name=sender_name,
|
||||
date_sent=message.date_sent
|
||||
)
|
||||
|
||||
if success:
|
||||
synced_in_batch += 1
|
||||
total_synced += 1
|
||||
|
||||
# Update channel counts
|
||||
if channel_name not in channel_sync_counts:
|
||||
channel_sync_counts[channel_name] = 0
|
||||
channel_sync_counts[channel_name] += 1
|
||||
|
||||
# Update the last message ID after each successful addition
|
||||
self.last_message_id = message_id
|
||||
|
||||
# Save state periodically
|
||||
current_time = time.time()
|
||||
if current_time - last_save_time > save_interval:
|
||||
self.last_sync_time = datetime.now()
|
||||
self._save_sync_state(channel_sync_counts)
|
||||
last_save_time = current_time
|
||||
|
||||
else:
|
||||
logger.warning(f"Failed to add message {message_id} to ChromaDB")
|
||||
except Exception as e:
|
||||
logger.error(f"Error adding message {message_id} to ChromaDB: {e}")
|
||||
# Continue with next message
|
||||
|
||||
# Update the last sync time and message ID at the end of the batch
|
||||
self.last_sync_time = datetime.now()
|
||||
if highest_message_id > (self.last_message_id or 0):
|
||||
self.last_message_id = highest_message_id
|
||||
|
||||
# Save the sync state after each batch
|
||||
self._save_sync_state(channel_sync_counts)
|
||||
last_save_time = time.time()
|
||||
|
||||
logger.info(f"Batch {batch_count} completed. Added {synced_in_batch} new messages to ChromaDB. " +
|
||||
f"Total synced: {total_synced}. Last message ID: {self.last_message_id}")
|
||||
|
||||
# Check if we've reached the max messages limit
|
||||
if total_synced >= max_messages:
|
||||
logger.info(f"Reached max messages limit of {max_messages}")
|
||||
break
|
||||
|
||||
# If this batch had fewer messages than the batch size, we're done
|
||||
if len(messages) < self.batch_size:
|
||||
logger.info("Fetched fewer messages than batch size, assuming all messages have been processed")
|
||||
break
|
||||
|
||||
# Final state save with channel statistics
|
||||
if is_shutting_down:
|
||||
logger.info("Shutdown signal received, saving final state...")
|
||||
|
||||
# Print synced messages by channel
|
||||
if channel_sync_counts:
|
||||
logger.info("Messages synced by channel:")
|
||||
try:
|
||||
# Use a safe sorting method that handles None keys
|
||||
sorted_items = sorted(channel_sync_counts.items(),
|
||||
key=lambda item: item[0] if item[0] is not None else "")
|
||||
|
||||
for channel, count in sorted_items:
|
||||
channel_name = channel if channel is not None else "Unknown Channel"
|
||||
logger.info(f"- {channel_name}: {count} messages")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error displaying channel stats: {e}")
|
||||
# Fallback display without sorting
|
||||
for channel, count in channel_sync_counts.items():
|
||||
channel_name = channel if channel is not None else "Unknown Channel"
|
||||
logger.info(f"- {channel_name}: {count} messages")
|
||||
|
||||
# Return the final stats
|
||||
stats = {
|
||||
'last_sync_time': self.last_sync_time,
|
||||
'last_message_id': self.last_message_id,
|
||||
'total_synced': total_synced,
|
||||
'batches': batch_count,
|
||||
'already_exists': already_exists_count,
|
||||
'channel_counts': channel_sync_counts
|
||||
}
|
||||
|
||||
logger.info(f"Sync completed. Current state: {stats}")
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error syncing messages: {e}")
|
||||
# Save state on error
|
||||
self._save_sync_state()
|
||||
return None
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(description="Sync messages from all Zulip channels to ChromaDB")
|
||||
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
|
||||
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
|
||||
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
|
||||
parser.add_argument("--max-messages", type=int, default=10000, help="Maximum total number of messages to sync")
|
||||
parser.add_argument("--include-direct-messages", action="store_true", help="Include direct and group messages in sync")
|
||||
parser.add_argument("--all-messages", action="store_true", help="Sync all messages regardless of date")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Create the Flask app
|
||||
app = create_app()
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
# Initialize sync service
|
||||
sync_service = AllChannelSyncService(
|
||||
batch_size=args.batch_size,
|
||||
include_direct_messages=args.include_direct_messages
|
||||
)
|
||||
|
||||
# Sync messages
|
||||
stats = sync_service.sync_messages(
|
||||
days=args.days,
|
||||
force=args.force,
|
||||
max_messages=args.max_messages,
|
||||
all_messages=args.all_messages
|
||||
)
|
||||
|
||||
if stats:
|
||||
channel_counts = stats.get('channel_counts', {})
|
||||
|
||||
print(f"\nSync completed at {datetime.now()}")
|
||||
print(f"Last sync time: {stats['last_sync_time']}")
|
||||
print(f"Last message ID: {stats['last_message_id']}")
|
||||
print(f"Total messages synced: {stats['total_synced']}")
|
||||
print(f"Number of batches: {stats['batches']}")
|
||||
print(f"Messages already in DB: {stats['already_exists']}")
|
||||
|
||||
if channel_counts:
|
||||
print("\nMessages synced by channel:")
|
||||
try:
|
||||
# Use a safe sorting method that handles None keys
|
||||
sorted_items = sorted(channel_counts.items(),
|
||||
key=lambda item: item[0] if item[0] is not None else "")
|
||||
|
||||
for channel, count in sorted_items:
|
||||
channel_name = channel if channel is not None else "Unknown Channel"
|
||||
print(f"- {channel_name}: {count} messages")
|
||||
except Exception as e:
|
||||
# Fallback display without sorting
|
||||
for channel, count in channel_counts.items():
|
||||
channel_name = channel if channel is not None else "Unknown Channel"
|
||||
print(f"- {channel_name}: {count} messages")
|
||||
except KeyboardInterrupt:
|
||||
print("\nSync process interrupted by user. State has been saved.")
|
||||
logger.info("Sync process interrupted by user. State has been saved.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
157
sync_all_messages.py
Executable file
157
sync_all_messages.py
Executable file
@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to sync ALL messages from Zulip to ChromaDB with NO restrictions.
|
||||
This script will sync everything - all channels, all users, all time periods.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("sync_all_messages")
|
||||
|
||||
# Add the current directory to the path so we can import the app module
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Apply NumPy compatibility patch for ChromaDB
|
||||
from app.utils import patch_chromadb_numpy
|
||||
patch_chromadb_numpy()
|
||||
|
||||
from app import create_app
|
||||
from app.db import get_db_session
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
from app.db.chroma_service import ChromaDBService
|
||||
from app.models.zulip import Message
|
||||
|
||||
def sync_all_messages():
|
||||
"""
|
||||
Sync ALL messages from Zulip to ChromaDB with no restrictions.
|
||||
All messages are processed in a single pass.
|
||||
|
||||
Returns:
|
||||
dict: Statistics about the sync
|
||||
"""
|
||||
logger.info("Starting unrestricted sync of ALL messages in one pass")
|
||||
|
||||
session = get_db_session()
|
||||
total_synced = 0
|
||||
already_exists = 0
|
||||
channel_counts = {}
|
||||
|
||||
# Get all messages at once
|
||||
logger.info("Fetching ALL messages from Zulip database")
|
||||
messages = session.query(Message).order_by(Message.id).all()
|
||||
total_messages = len(messages)
|
||||
logger.info(f"Found {total_messages} total messages in Zulip database")
|
||||
|
||||
# Process all messages
|
||||
logger.info("Processing all messages")
|
||||
for i, message in enumerate(messages):
|
||||
message_id = message.id
|
||||
|
||||
# Log progress at intervals
|
||||
if i % 500 == 0 and i > 0:
|
||||
logger.info(f"Progress: {i}/{total_messages} messages processed ({(i/total_messages)*100:.1f}%)")
|
||||
|
||||
# Get message details
|
||||
try:
|
||||
channel_name = ZulipDatabaseService.get_channel_name_for_message(message)
|
||||
sender_name = ZulipDatabaseService.get_sender_name_for_message(message)
|
||||
|
||||
# Handle None channel names
|
||||
if channel_name is None:
|
||||
channel_name = "Unknown Channel"
|
||||
logger.warning(f"Message {message_id} has None channel name, using '{channel_name}' instead")
|
||||
|
||||
# Check if message already exists in ChromaDB
|
||||
if ChromaDBService.message_exists(message_id):
|
||||
already_exists += 1
|
||||
continue
|
||||
|
||||
# Add message to ChromaDB
|
||||
success = ChromaDBService.add_message(
|
||||
message_id=message_id,
|
||||
content=message.content,
|
||||
channel_name=channel_name,
|
||||
subject=message.subject,
|
||||
sender_name=sender_name,
|
||||
date_sent=message.date_sent
|
||||
)
|
||||
|
||||
if success:
|
||||
total_synced += 1
|
||||
|
||||
# Update channel counts
|
||||
if channel_name not in channel_counts:
|
||||
channel_counts[channel_name] = 0
|
||||
channel_counts[channel_name] += 1
|
||||
else:
|
||||
logger.warning(f"Failed to add message {message_id} to ChromaDB")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing message {message_id}: {e}")
|
||||
|
||||
# Print channel statistics
|
||||
if channel_counts:
|
||||
logger.info("Messages synced by channel:")
|
||||
for channel, count in sorted(channel_counts.items()):
|
||||
logger.info(f"- {channel}: {count} messages")
|
||||
|
||||
# Return statistics
|
||||
return {
|
||||
'total_messages': total_messages,
|
||||
'total_synced': total_synced,
|
||||
'already_exists': already_exists,
|
||||
'channel_counts': channel_counts
|
||||
}
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
try:
|
||||
# Create the Flask app (needed for context)
|
||||
app = create_app()
|
||||
|
||||
with app.app_context():
|
||||
print("\n====================================================")
|
||||
print("STARTING UNRESTRICTED SYNC OF ALL ZULIP MESSAGES")
|
||||
print(f"Started at: {datetime.now()}")
|
||||
print("====================================================\n")
|
||||
|
||||
# Sync all messages
|
||||
start_time = datetime.now()
|
||||
stats = sync_all_messages()
|
||||
end_time = datetime.now()
|
||||
duration = end_time - start_time
|
||||
|
||||
# Print summary
|
||||
print("\n====================================================")
|
||||
print("SYNC COMPLETE")
|
||||
print(f"Started at: {start_time}")
|
||||
print(f"Completed at: {end_time}")
|
||||
print(f"Duration: {duration}")
|
||||
print(f"Total messages in Zulip: {stats['total_messages']}")
|
||||
print(f"Total messages synced: {stats['total_synced']}")
|
||||
print(f"Messages already in ChromaDB: {stats['already_exists']}")
|
||||
|
||||
# Print channel counts
|
||||
if stats['channel_counts']:
|
||||
print("\nMessages synced by channel:")
|
||||
for channel, count in sorted(stats['channel_counts'].items()):
|
||||
print(f"- {channel}: {count} messages")
|
||||
|
||||
print("====================================================\n")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nSync process interrupted by user")
|
||||
logger.info("Sync process interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"\nError during sync: {e}")
|
||||
logger.error(f"Error during sync: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
132
sync_and_verify.sh
Executable file
132
sync_and_verify.sh
Executable file
@ -0,0 +1,132 @@
|
||||
#!/bin/bash
|
||||
# Script to sync all messages from all channels (except sandbox) and verify
|
||||
# they're in ChromaDB
|
||||
|
||||
# Set up logging
|
||||
LOG_FILE="logs/sync_and_verify_$(date +%Y%m%d_%H%M%S).log"
|
||||
mkdir -p logs
|
||||
|
||||
# Make sure scripts are executable
|
||||
chmod +x sync_all_channels.py
|
||||
chmod +x compare_messages.py
|
||||
chmod +x fix_unknown_channels.py
|
||||
|
||||
echo "======================================================"
|
||||
echo " ZULIP CHANNEL SYNC AND VERIFY PROCESS"
|
||||
echo " $(date)"
|
||||
echo " Logging to: $LOG_FILE"
|
||||
echo "======================================================"
|
||||
echo ""
|
||||
|
||||
echo "=====================================================" | tee -a "$LOG_FILE"
|
||||
echo "SYNC AND VERIFY PROCESS - $(date)" | tee -a "$LOG_FILE"
|
||||
echo "=====================================================" | tee -a "$LOG_FILE"
|
||||
|
||||
# Activate virtual environment if it exists
|
||||
if [ -d "venv" ]; then
|
||||
echo "Activating virtual environment..." | tee -a "$LOG_FILE"
|
||||
source venv/bin/activate
|
||||
fi
|
||||
|
||||
# Set parameters for the sync
|
||||
DAYS_TO_SYNC=365 # Used for verification only
|
||||
MAX_MESSAGES=250
|
||||
FORCE_SYNC=true
|
||||
INCLUDE_DIRECT_MESSAGES=true
|
||||
ALL_MESSAGES=true # Sync all messages regardless of date
|
||||
TOTAL_BATCHES=1000 # Number of batches to run
|
||||
|
||||
echo "Configuration:" | tee -a "$LOG_FILE"
|
||||
echo "- Maximum messages per batch: $MAX_MESSAGES" | tee -a "$LOG_FILE"
|
||||
echo "- Force sync: $FORCE_SYNC" | tee -a "$LOG_FILE"
|
||||
echo "- Include direct messages: $INCLUDE_DIRECT_MESSAGES" | tee -a "$LOG_FILE"
|
||||
echo "- Sync all messages: $ALL_MESSAGES" | tee -a "$LOG_FILE"
|
||||
echo "- Number of batches: $TOTAL_BATCHES" | tee -a "$LOG_FILE"
|
||||
echo "- Days for verification: $DAYS_TO_SYNC" | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
# Step 1: Sync messages in multiple batches
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
echo "Step 1: Syncing messages from all channels (except sandbox)..." | tee -a "$LOG_FILE"
|
||||
echo "This will exclude messages from IT_Bot and ai_bot" | tee -a "$LOG_FILE"
|
||||
echo "Running $TOTAL_BATCHES batches of $MAX_MESSAGES messages each" | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
# Build the base command
|
||||
SYNC_CMD="python sync_all_channels.py --max-messages $MAX_MESSAGES"
|
||||
if [ "$INCLUDE_DIRECT_MESSAGES" = true ]; then
|
||||
SYNC_CMD="$SYNC_CMD --include-direct-messages"
|
||||
fi
|
||||
if [ "$ALL_MESSAGES" = true ]; then
|
||||
SYNC_CMD="$SYNC_CMD --all-messages"
|
||||
fi
|
||||
|
||||
# Run multiple batches
|
||||
for ((i=1; i<=$TOTAL_BATCHES; i++))
|
||||
do
|
||||
echo "Running batch $i of $TOTAL_BATCHES..." | tee -a "$LOG_FILE"
|
||||
|
||||
BATCH_CMD="$SYNC_CMD"
|
||||
# If ALL_MESSAGES is true, we should use --force for all batches to ensure we get historical data,
|
||||
# provided that FORCE_SYNC is also enabled.
|
||||
# If ALL_MESSAGES is false, then --force (if enabled by FORCE_SYNC) applies only to the first batch.
|
||||
if [ "$FORCE_SYNC" = true ]; then
|
||||
if [ "$ALL_MESSAGES" = true ] || [ $i -eq 1 ]; then
|
||||
BATCH_CMD="$BATCH_CMD --force"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "Running: $BATCH_CMD" | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
# Run the sync command
|
||||
$BATCH_CMD | tee -a "$LOG_FILE"
|
||||
|
||||
# Pause between batches
|
||||
if [ $i -lt $TOTAL_BATCHES ]; then
|
||||
echo "Pausing for 5 seconds between batches..." | tee -a "$LOG_FILE"
|
||||
sleep 5
|
||||
fi
|
||||
done
|
||||
|
||||
# Step 2: Fix Unknown Channel entries
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
echo "Step 2: Fixing 'Unknown Channel' entries..." | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
# Run the fix unknown channels script
|
||||
FIX_CMD="python fix_unknown_channels.py"
|
||||
echo "Running: $FIX_CMD" | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
$FIX_CMD | tee -a "$LOG_FILE"
|
||||
|
||||
# Step 3: Verify all messages are in ChromaDB
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
echo "Step 3: Verifying all messages are in ChromaDB..." | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
# Run comparison with the specified number of days for verification
|
||||
COMPARE_CMD="python compare_messages.py --days $DAYS_TO_SYNC"
|
||||
echo "Running: $COMPARE_CMD" | tee -a "$LOG_FILE"
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
|
||||
$COMPARE_CMD | tee -a "$LOG_FILE"
|
||||
|
||||
echo "" | tee -a "$LOG_FILE"
|
||||
echo "=====================================================" | tee -a "$LOG_FILE"
|
||||
echo "Sync and verification process completed at $(date)" | tee -a "$LOG_FILE"
|
||||
echo "See $LOG_FILE for complete log" | tee -a "$LOG_FILE"
|
||||
echo "=====================================================" | tee -a "$LOG_FILE"
|
||||
|
||||
echo ""
|
||||
echo "======================================================"
|
||||
echo " SYNC AND VERIFICATION PROCESS COMPLETED"
|
||||
echo " $(date)"
|
||||
echo " Log file: $LOG_FILE"
|
||||
echo "======================================================"
|
||||
|
||||
# If we activated a virtual environment, deactivate it
|
||||
if [ -n "$VIRTUAL_ENV" ]; then
|
||||
deactivate
|
||||
fi
|
||||
141
sync_messages.py
Executable file
141
sync_messages.py
Executable file
@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to manually sync messages from Zulip to ChromaDB.
|
||||
This can be run standalone or as a scheduled cron job.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Add the current directory to the path so we can import the app module
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Apply NumPy compatibility patch for ChromaDB
|
||||
from app.utils import patch_chromadb_numpy
|
||||
patch_chromadb_numpy()
|
||||
|
||||
from app import create_app
|
||||
from app.utils.sync_service import MessageSyncService
|
||||
from app.db.zulip_service import ZulipDatabaseService
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger("sync_messages")
|
||||
|
||||
def sync_messages(days=None, force=False, batch_size=200, max_messages=5000):
|
||||
"""
|
||||
Sync messages from Zulip to ChromaDB.
|
||||
|
||||
Args:
|
||||
days (int): Number of days to look back for messages (default: use sync state)
|
||||
force (bool): Whether to force sync all messages from the lookback period
|
||||
batch_size (int): Number of messages to process in each batch
|
||||
max_messages (int): Maximum total number of messages to sync
|
||||
"""
|
||||
# Create the Flask app
|
||||
app = create_app()
|
||||
|
||||
with app.app_context():
|
||||
sync_service = MessageSyncService()
|
||||
|
||||
if force and days:
|
||||
# If force syncing for a specific number of days, reset the sync state
|
||||
sync_service.last_sync_time = datetime.now() - timedelta(days=days)
|
||||
sync_service.last_message_id = None
|
||||
logger.info(f"Force syncing messages from the last {days} days")
|
||||
|
||||
# Count total messages to sync
|
||||
if force:
|
||||
# Query to get message count
|
||||
since_date = datetime.now() - timedelta(days=days if days else 30)
|
||||
all_messages = ZulipDatabaseService.get_messages_from_it_channels(
|
||||
since=since_date if since_date else None,
|
||||
limit=5000
|
||||
)
|
||||
total_messages = len(all_messages)
|
||||
logger.info(f"Found a total of {total_messages} messages to sync")
|
||||
|
||||
# Run multiple batches of sync
|
||||
total_synced = 0
|
||||
batch_count = 0
|
||||
|
||||
# In force mode, we need to manually run multiple batches
|
||||
if force:
|
||||
while total_synced < min(total_messages, max_messages):
|
||||
# Manual sync with our custom batch size
|
||||
logger.info(f"Running batch {batch_count+1}, synced {total_synced} messages so far")
|
||||
|
||||
# For first batch, we already reset the sync state above
|
||||
# For subsequent batches, we'll use the last_message_id that was set
|
||||
|
||||
# Run the sync
|
||||
sync_service._set_batch_size(batch_size)
|
||||
sync_service.sync_now()
|
||||
|
||||
# Update counters
|
||||
batch_count += 1
|
||||
|
||||
# Check how many we've synced by looking at highest message ID
|
||||
if sync_service.last_message_id:
|
||||
# We've synced up to this message ID
|
||||
synced_in_batch = ZulipDatabaseService.count_messages_up_to_id(
|
||||
sync_service.last_message_id,
|
||||
since=since_date if since_date else None
|
||||
)
|
||||
|
||||
# Update total (use max to ensure we don't decrease if count is wrong)
|
||||
total_synced = max(total_synced, synced_in_batch)
|
||||
|
||||
logger.info(f"Processed {synced_in_batch} messages out of {total_messages}")
|
||||
|
||||
# If we've synced all messages or reached our limit, break
|
||||
if synced_in_batch >= total_messages or synced_in_batch >= max_messages:
|
||||
break
|
||||
else:
|
||||
# If no message ID was set, something went wrong
|
||||
logger.warning("No message ID set after sync, may not have found any messages")
|
||||
break
|
||||
else:
|
||||
# Just run a single sync with default settings
|
||||
sync_service.sync_now()
|
||||
|
||||
# Get the stats
|
||||
stats = {
|
||||
'last_sync_time': sync_service.last_sync_time,
|
||||
'last_message_id': sync_service.last_message_id,
|
||||
'total_synced': total_synced,
|
||||
'batches': batch_count
|
||||
}
|
||||
|
||||
logger.info(f"Sync completed. Current state: {stats}")
|
||||
|
||||
return stats
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(description="Sync messages from Zulip to ChromaDB")
|
||||
parser.add_argument("--days", type=int, help="Number of days to look back for messages")
|
||||
parser.add_argument("--force", action="store_true", help="Force sync all messages from the lookback period")
|
||||
parser.add_argument("--batch-size", type=int, default=200, help="Number of messages to process in each batch")
|
||||
parser.add_argument("--max-messages", type=int, default=5000, help="Maximum total number of messages to sync")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Sync messages
|
||||
stats = sync_messages(
|
||||
days=args.days,
|
||||
force=args.force,
|
||||
batch_size=args.batch_size,
|
||||
max_messages=args.max_messages
|
||||
)
|
||||
|
||||
print(f"\nSync completed at {datetime.now()}")
|
||||
print(f"Last sync time: {stats['last_sync_time']}")
|
||||
print(f"Last message ID: {stats['last_message_id']}")
|
||||
print(f"Total messages synced: {stats['total_synced']}")
|
||||
print(f"Number of batches: {stats['batches']}")
|
||||
BIN
sync_state.pickle
Normal file
BIN
sync_state.pickle
Normal file
Binary file not shown.
42
update_to_openai.sh
Executable file
42
update_to_openai.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script to migrate from Google Gemini to OpenAI GPT-4o
|
||||
|
||||
echo "Migrating from Google Gemini to OpenAI GPT-4o..."
|
||||
|
||||
# 1. Activate the virtual environment
|
||||
source venv/bin/activate
|
||||
|
||||
# 2. Install OpenAI package
|
||||
echo "Installing OpenAI package..."
|
||||
pip install openai==1.30.4
|
||||
|
||||
# 3. Prompt for OpenAI API key
|
||||
read -p "Enter your OpenAI API key: " openai_api_key
|
||||
|
||||
# 4. Update the .env file
|
||||
echo "Updating .env file..."
|
||||
if grep -q "OPENAI_API_KEY" .env; then
|
||||
# Replace existing OPENAI_API_KEY
|
||||
sed -i "s/OPENAI_API_KEY=.*/OPENAI_API_KEY=$openai_api_key/" .env
|
||||
else
|
||||
# Add new OPENAI_API_KEY entry
|
||||
sed -i "/GEMINI_API_KEY/i # OpenAI GPT-4o (new)\nOPENAI_API_KEY=$openai_api_key\n" .env
|
||||
fi
|
||||
|
||||
# 5. Reset and rebuild the ChromaDB
|
||||
echo "Do you want to reset and rebuild the ChromaDB? (y/n)"
|
||||
read -p "> " rebuild_db
|
||||
|
||||
if [[ $rebuild_db == "y" || $rebuild_db == "Y" ]]; then
|
||||
echo "Resetting ChromaDB..."
|
||||
./reset_chromadb.py
|
||||
|
||||
echo "Rebuilding database (syncing past 7 days of messages)..."
|
||||
python sync_messages.py --force --days 7
|
||||
fi
|
||||
|
||||
echo "Migration completed successfully!"
|
||||
echo "Please restart your application to apply the changes:"
|
||||
echo " 1. Stop the current process"
|
||||
echo " 2. Run ./run_app.sh to start with OpenAI integration"
|
||||
247
venv/bin/Activate.ps1
Normal file
247
venv/bin/Activate.ps1
Normal file
@ -0,0 +1,247 @@
|
||||
<#
|
||||
.Synopsis
|
||||
Activate a Python virtual environment for the current PowerShell session.
|
||||
|
||||
.Description
|
||||
Pushes the python executable for a virtual environment to the front of the
|
||||
$Env:PATH environment variable and sets the prompt to signify that you are
|
||||
in a Python virtual environment. Makes use of the command line switches as
|
||||
well as the `pyvenv.cfg` file values present in the virtual environment.
|
||||
|
||||
.Parameter VenvDir
|
||||
Path to the directory that contains the virtual environment to activate. The
|
||||
default value for this is the parent of the directory that the Activate.ps1
|
||||
script is located within.
|
||||
|
||||
.Parameter Prompt
|
||||
The prompt prefix to display when this virtual environment is activated. By
|
||||
default, this prompt is the name of the virtual environment folder (VenvDir)
|
||||
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
||||
|
||||
.Example
|
||||
Activate.ps1
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Verbose
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and shows extra information about the activation as it executes.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
||||
Activates the Python virtual environment located in the specified location.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Prompt "MyPython"
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and prefixes the current prompt with the specified string (surrounded in
|
||||
parentheses) while the virtual environment is active.
|
||||
|
||||
.Notes
|
||||
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
||||
execution policy for the user. You can do this by issuing the following PowerShell
|
||||
command:
|
||||
|
||||
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
|
||||
For more information on Execution Policies:
|
||||
https://go.microsoft.com/fwlink/?LinkID=135170
|
||||
|
||||
#>
|
||||
Param(
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$VenvDir,
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$Prompt
|
||||
)
|
||||
|
||||
<# Function declarations --------------------------------------------------- #>
|
||||
|
||||
<#
|
||||
.Synopsis
|
||||
Remove all shell session elements added by the Activate script, including the
|
||||
addition of the virtual environment's Python executable from the beginning of
|
||||
the PATH variable.
|
||||
|
||||
.Parameter NonDestructive
|
||||
If present, do not remove this function from the global namespace for the
|
||||
session.
|
||||
|
||||
#>
|
||||
function global:deactivate ([switch]$NonDestructive) {
|
||||
# Revert to original values
|
||||
|
||||
# The prior prompt:
|
||||
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
||||
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
||||
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
|
||||
# The prior PYTHONHOME:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
}
|
||||
|
||||
# The prior PATH:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
||||
}
|
||||
|
||||
# Just remove the VIRTUAL_ENV altogether:
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV
|
||||
}
|
||||
|
||||
# Just remove VIRTUAL_ENV_PROMPT altogether.
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
|
||||
}
|
||||
|
||||
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
||||
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
||||
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
||||
}
|
||||
|
||||
# Leave deactivate function in the global namespace if requested:
|
||||
if (-not $NonDestructive) {
|
||||
Remove-Item -Path function:deactivate
|
||||
}
|
||||
}
|
||||
|
||||
<#
|
||||
.Description
|
||||
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
||||
given folder, and returns them in a map.
|
||||
|
||||
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
||||
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
||||
then it is considered a `key = value` line. The left hand string is the key,
|
||||
the right hand is the value.
|
||||
|
||||
If the value starts with a `'` or a `"` then the first and last character is
|
||||
stripped from the value before being captured.
|
||||
|
||||
.Parameter ConfigDir
|
||||
Path to the directory that contains the `pyvenv.cfg` file.
|
||||
#>
|
||||
function Get-PyVenvConfig(
|
||||
[String]
|
||||
$ConfigDir
|
||||
) {
|
||||
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
||||
|
||||
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
||||
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
||||
|
||||
# An empty map will be returned if no config file is found.
|
||||
$pyvenvConfig = @{ }
|
||||
|
||||
if ($pyvenvConfigPath) {
|
||||
|
||||
Write-Verbose "File exists, parse `key = value` lines"
|
||||
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
||||
|
||||
$pyvenvConfigContent | ForEach-Object {
|
||||
$keyval = $PSItem -split "\s*=\s*", 2
|
||||
if ($keyval[0] -and $keyval[1]) {
|
||||
$val = $keyval[1]
|
||||
|
||||
# Remove extraneous quotations around a string value.
|
||||
if ("'""".Contains($val.Substring(0, 1))) {
|
||||
$val = $val.Substring(1, $val.Length - 2)
|
||||
}
|
||||
|
||||
$pyvenvConfig[$keyval[0]] = $val
|
||||
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
||||
}
|
||||
}
|
||||
}
|
||||
return $pyvenvConfig
|
||||
}
|
||||
|
||||
|
||||
<# Begin Activate script --------------------------------------------------- #>
|
||||
|
||||
# Determine the containing directory of this script
|
||||
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$VenvExecDir = Get-Item -Path $VenvExecPath
|
||||
|
||||
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
||||
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
||||
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
||||
|
||||
# Set values required in priority: CmdLine, ConfigFile, Default
|
||||
# First, get the location of the virtual environment, it might not be
|
||||
# VenvExecDir if specified on the command line.
|
||||
if ($VenvDir) {
|
||||
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
||||
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
||||
Write-Verbose "VenvDir=$VenvDir"
|
||||
}
|
||||
|
||||
# Next, read the `pyvenv.cfg` file to determine any required value such
|
||||
# as `prompt`.
|
||||
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
||||
|
||||
# Next, set the prompt from the command line, or the config file, or
|
||||
# just use the name of the virtual environment folder.
|
||||
if ($Prompt) {
|
||||
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
||||
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
||||
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
||||
$Prompt = $pyvenvCfg['prompt'];
|
||||
}
|
||||
else {
|
||||
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
||||
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
||||
$Prompt = Split-Path -Path $venvDir -Leaf
|
||||
}
|
||||
}
|
||||
|
||||
Write-Verbose "Prompt = '$Prompt'"
|
||||
Write-Verbose "VenvDir='$VenvDir'"
|
||||
|
||||
# Deactivate any currently active virtual environment, but leave the
|
||||
# deactivate function in place.
|
||||
deactivate -nondestructive
|
||||
|
||||
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
||||
# that there is an activated venv.
|
||||
$env:VIRTUAL_ENV = $VenvDir
|
||||
|
||||
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
||||
|
||||
Write-Verbose "Setting prompt to '$Prompt'"
|
||||
|
||||
# Set the prompt to include the env name
|
||||
# Make sure _OLD_VIRTUAL_PROMPT is global
|
||||
function global:_OLD_VIRTUAL_PROMPT { "" }
|
||||
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
||||
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
||||
|
||||
function global:prompt {
|
||||
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
||||
_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
$env:VIRTUAL_ENV_PROMPT = $Prompt
|
||||
}
|
||||
|
||||
# Clear PYTHONHOME
|
||||
if (Test-Path -Path Env:PYTHONHOME) {
|
||||
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
Remove-Item -Path Env:PYTHONHOME
|
||||
}
|
||||
|
||||
# Add the venv to the PATH
|
||||
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
||||
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
||||
63
venv/bin/activate
Normal file
63
venv/bin/activate
Normal file
@ -0,0 +1,63 @@
|
||||
# This file must be used with "source bin/activate" *from bash*
|
||||
# you cannot run it directly
|
||||
|
||||
deactivate () {
|
||||
# reset old environment variables
|
||||
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
||||
PATH="${_OLD_VIRTUAL_PATH:-}"
|
||||
export PATH
|
||||
unset _OLD_VIRTUAL_PATH
|
||||
fi
|
||||
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
||||
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
||||
export PYTHONHOME
|
||||
unset _OLD_VIRTUAL_PYTHONHOME
|
||||
fi
|
||||
|
||||
# Call hash to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
hash -r 2> /dev/null
|
||||
|
||||
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
||||
PS1="${_OLD_VIRTUAL_PS1:-}"
|
||||
export PS1
|
||||
unset _OLD_VIRTUAL_PS1
|
||||
fi
|
||||
|
||||
unset VIRTUAL_ENV
|
||||
unset VIRTUAL_ENV_PROMPT
|
||||
if [ ! "${1:-}" = "nondestructive" ] ; then
|
||||
# Self destruct!
|
||||
unset -f deactivate
|
||||
fi
|
||||
}
|
||||
|
||||
# unset irrelevant variables
|
||||
deactivate nondestructive
|
||||
|
||||
VIRTUAL_ENV=/home/adminuser/zulip_bots/venv
|
||||
export VIRTUAL_ENV
|
||||
|
||||
_OLD_VIRTUAL_PATH="$PATH"
|
||||
PATH="$VIRTUAL_ENV/"bin":$PATH"
|
||||
export PATH
|
||||
|
||||
# unset PYTHONHOME if set
|
||||
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
||||
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
||||
if [ -n "${PYTHONHOME:-}" ] ; then
|
||||
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
||||
unset PYTHONHOME
|
||||
fi
|
||||
|
||||
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
||||
_OLD_VIRTUAL_PS1="${PS1:-}"
|
||||
PS1='(venv) '"${PS1:-}"
|
||||
export PS1
|
||||
VIRTUAL_ENV_PROMPT='(venv) '
|
||||
export VIRTUAL_ENV_PROMPT
|
||||
fi
|
||||
|
||||
# Call hash to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
hash -r 2> /dev/null
|
||||
26
venv/bin/activate.csh
Normal file
26
venv/bin/activate.csh
Normal file
@ -0,0 +1,26 @@
|
||||
# This file must be used with "source bin/activate.csh" *from csh*.
|
||||
# You cannot run it directly.
|
||||
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
||||
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
||||
|
||||
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
setenv VIRTUAL_ENV /home/adminuser/zulip_bots/venv
|
||||
|
||||
set _OLD_VIRTUAL_PATH="$PATH"
|
||||
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
|
||||
|
||||
|
||||
set _OLD_VIRTUAL_PROMPT="$prompt"
|
||||
|
||||
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
||||
set prompt = '(venv) '"$prompt"
|
||||
setenv VIRTUAL_ENV_PROMPT '(venv) '
|
||||
endif
|
||||
|
||||
alias pydoc python -m pydoc
|
||||
|
||||
rehash
|
||||
69
venv/bin/activate.fish
Normal file
69
venv/bin/activate.fish
Normal file
@ -0,0 +1,69 @@
|
||||
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
||||
# (https://fishshell.com/); you cannot run it directly.
|
||||
|
||||
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
||||
# reset old environment variables
|
||||
if test -n "$_OLD_VIRTUAL_PATH"
|
||||
set -gx PATH $_OLD_VIRTUAL_PATH
|
||||
set -e _OLD_VIRTUAL_PATH
|
||||
end
|
||||
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
||||
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
||||
set -e _OLD_VIRTUAL_PYTHONHOME
|
||||
end
|
||||
|
||||
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
||||
set -e _OLD_FISH_PROMPT_OVERRIDE
|
||||
# prevents error when using nested fish instances (Issue #93858)
|
||||
if functions -q _old_fish_prompt
|
||||
functions -e fish_prompt
|
||||
functions -c _old_fish_prompt fish_prompt
|
||||
functions -e _old_fish_prompt
|
||||
end
|
||||
end
|
||||
|
||||
set -e VIRTUAL_ENV
|
||||
set -e VIRTUAL_ENV_PROMPT
|
||||
if test "$argv[1]" != "nondestructive"
|
||||
# Self-destruct!
|
||||
functions -e deactivate
|
||||
end
|
||||
end
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
set -gx VIRTUAL_ENV /home/adminuser/zulip_bots/venv
|
||||
|
||||
set -gx _OLD_VIRTUAL_PATH $PATH
|
||||
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
|
||||
|
||||
# Unset PYTHONHOME if set.
|
||||
if set -q PYTHONHOME
|
||||
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
||||
set -e PYTHONHOME
|
||||
end
|
||||
|
||||
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
||||
# fish uses a function instead of an env var to generate the prompt.
|
||||
|
||||
# Save the current fish_prompt function as the function _old_fish_prompt.
|
||||
functions -c fish_prompt _old_fish_prompt
|
||||
|
||||
# With the original prompt function renamed, we can override with our own.
|
||||
function fish_prompt
|
||||
# Save the return status of the last command.
|
||||
set -l old_status $status
|
||||
|
||||
# Output the venv prompt; color taken from the blue of the Python logo.
|
||||
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
|
||||
|
||||
# Restore the return status of the previous command.
|
||||
echo "exit $old_status" | .
|
||||
# Output the original/"old" prompt.
|
||||
_old_fish_prompt
|
||||
end
|
||||
|
||||
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
||||
set -gx VIRTUAL_ENV_PROMPT '(venv) '
|
||||
end
|
||||
8
venv/bin/coloredlogs
Executable file
8
venv/bin/coloredlogs
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from coloredlogs.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/distro
Executable file
8
venv/bin/distro
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from distro.distro import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/dotenv
Executable file
8
venv/bin/dotenv
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from dotenv.__main__ import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
8
venv/bin/f2py
Executable file
8
venv/bin/f2py
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from numpy.f2py.f2py2e import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/fastavro
Executable file
8
venv/bin/fastavro
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from fastavro.__main__ import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/flask
Executable file
8
venv/bin/flask
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from flask.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/httpx
Executable file
8
venv/bin/httpx
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from httpx import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/huggingface-cli
Executable file
8
venv/bin/huggingface-cli
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from huggingface_hub.commands.huggingface_cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/humanfriendly
Executable file
8
venv/bin/humanfriendly
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from humanfriendly.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/isympy
Executable file
8
venv/bin/isympy
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from isympy import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/markdown-it
Executable file
8
venv/bin/markdown-it
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from markdown_it.cli.parse import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/nltk
Executable file
8
venv/bin/nltk
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from nltk.cli import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
8
venv/bin/nomic
Executable file
8
venv/bin/nomic
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from nomic.cli import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
8
venv/bin/normalizer
Executable file
8
venv/bin/normalizer
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli.cli_detect())
|
||||
8
venv/bin/numpy-config
Executable file
8
venv/bin/numpy-config
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from numpy._configtool import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/onnxruntime_test
Executable file
8
venv/bin/onnxruntime_test
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from onnxruntime.tools.onnxruntime_test import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/openai
Executable file
8
venv/bin/openai
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from openai.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/pip
Executable file
8
venv/bin/pip
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/pip3
Executable file
8
venv/bin/pip3
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/pip3.11
Executable file
8
venv/bin/pip3.11
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/pygmentize
Executable file
8
venv/bin/pygmentize
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pygments.cmdline import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/pyrsa-decrypt
Executable file
8
venv/bin/pyrsa-decrypt
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.cli import decrypt
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(decrypt())
|
||||
8
venv/bin/pyrsa-encrypt
Executable file
8
venv/bin/pyrsa-encrypt
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.cli import encrypt
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(encrypt())
|
||||
8
venv/bin/pyrsa-keygen
Executable file
8
venv/bin/pyrsa-keygen
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.cli import keygen
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(keygen())
|
||||
8
venv/bin/pyrsa-priv2pub
Executable file
8
venv/bin/pyrsa-priv2pub
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.util import private_to_public
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(private_to_public())
|
||||
8
venv/bin/pyrsa-sign
Executable file
8
venv/bin/pyrsa-sign
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.cli import sign
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(sign())
|
||||
8
venv/bin/pyrsa-verify
Executable file
8
venv/bin/pyrsa-verify
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rsa.cli import verify
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(verify())
|
||||
1
venv/bin/python
Symbolic link
1
venv/bin/python
Symbolic link
@ -0,0 +1 @@
|
||||
python3.11
|
||||
1
venv/bin/python3
Symbolic link
1
venv/bin/python3
Symbolic link
@ -0,0 +1 @@
|
||||
python3.11
|
||||
1
venv/bin/python3.11
Symbolic link
1
venv/bin/python3.11
Symbolic link
@ -0,0 +1 @@
|
||||
/usr/bin/python3.11
|
||||
8
venv/bin/tqdm
Executable file
8
venv/bin/tqdm
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from tqdm.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/uvicorn
Executable file
8
venv/bin/uvicorn
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from uvicorn.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
8
venv/bin/watchfiles
Executable file
8
venv/bin/watchfiles
Executable file
@ -0,0 +1,8 @@
|
||||
#!/home/adminuser/zulip_bots/venv/bin/python3.11
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from watchfiles.cli import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user