zulip_bot/sync_and_verify.sh
2025-05-16 18:00:22 +04:00

132 lines
4.6 KiB
Bash
Executable File

#!/bin/bash
# Script to sync all messages from all channels (except sandbox) and verify
# they're in ChromaDB
# Logging: each run writes to its own timestamped file under ./logs
run_stamp="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="logs/sync_and_verify_${run_stamp}.log"
mkdir -p logs
# Give the three Python helper scripts the executable bit
for helper_script in sync_all_channels.py compare_messages.py fix_unknown_channels.py; do
  chmod +x "$helper_script"
done
# Console-only start banner (this part is not written to the log file)
cat <<EOF
======================================================
 ZULIP CHANNEL SYNC AND VERIFY PROCESS
 $(date)
 Logging to: $LOG_FILE
======================================================

EOF
# Header that opens the log; tee mirrors it on the console as well
log_rule="====================================================="
{
  printf '%s\n' "$log_rule"
  printf '%s\n' "SYNC AND VERIFY PROCESS - $(date)"
  printf '%s\n' "$log_rule"
} | tee -a "$LOG_FILE"
# Use the virtual environment in ./venv when that directory is present
if [[ -d venv ]]; then
  printf '%s\n' "Activating virtual environment..." | tee -a "$LOG_FILE"
  . venv/bin/activate
fi
# Sync parameters; the steps further down read these by name
TOTAL_BATCHES=1000            # Number of batches to run
MAX_MESSAGES=250              # Reported below as "maximum messages per batch"
ALL_MESSAGES=true             # Sync all messages regardless of date
INCLUDE_DIRECT_MESSAGES=true
FORCE_SYNC=true
DAYS_TO_SYNC=365              # Used for verification only
# Print the effective settings to the console and append them to the log
tee -a "$LOG_FILE" <<EOF
Configuration:
- Maximum messages per batch: $MAX_MESSAGES
- Force sync: $FORCE_SYNC
- Include direct messages: $INCLUDE_DIRECT_MESSAGES
- Sync all messages: $ALL_MESSAGES
- Number of batches: $TOTAL_BATCHES
- Days for verification: $DAYS_TO_SYNC

EOF
# Step 1: Sync messages in multiple batches

#######################################
# Announce Step 1, then invoke sync_all_channels.py TOTAL_BATCHES times.
# Globals (read):
#   LOG_FILE, MAX_MESSAGES, INCLUDE_DIRECT_MESSAGES, ALL_MESSAGES,
#   FORCE_SYNC, TOTAL_BATCHES
# Arguments:
#   None
# Outputs:
#   Progress messages plus the sync command's stdout and stderr, written to
#   stdout and appended to LOG_FILE.
# Returns:
#   0 if every batch exited with status 0, 1 if at least one batch failed.
#######################################
run_sync_batches() {
  local -a sync_cmd batch_cmd
  local i batch_status failed_batches=0

  echo "" | tee -a "$LOG_FILE"
  echo "Step 1: Syncing messages from all channels (except sandbox)..." | tee -a "$LOG_FILE"
  echo "This will exclude messages from IT_Bot and ai_bot" | tee -a "$LOG_FILE"
  echo "Running $TOTAL_BATCHES batches of $MAX_MESSAGES messages each" | tee -a "$LOG_FILE"
  echo "" | tee -a "$LOG_FILE"

  # Build the base command as an array so every argument stays one word and
  # the command that runs is exactly the command that gets logged.
  sync_cmd=(python sync_all_channels.py --max-messages "$MAX_MESSAGES")
  if [[ "$INCLUDE_DIRECT_MESSAGES" == true ]]; then
    sync_cmd+=(--include-direct-messages)
  fi
  if [[ "$ALL_MESSAGES" == true ]]; then
    sync_cmd+=(--all-messages)
  fi

  # Run multiple batches
  for ((i = 1; i <= TOTAL_BATCHES; i++)); do
    echo "Running batch $i of $TOTAL_BATCHES..." | tee -a "$LOG_FILE"
    batch_cmd=("${sync_cmd[@]}")
    # With FORCE_SYNC enabled: if ALL_MESSAGES is true, --force is passed on
    # every batch to ensure historical data is fetched; otherwise --force is
    # passed on the first batch only.
    if [[ "$FORCE_SYNC" == true ]] && { [[ "$ALL_MESSAGES" == true ]] || ((i == 1)); }; then
      batch_cmd+=(--force)
    fi
    echo "Running: ${batch_cmd[*]}" | tee -a "$LOG_FILE"
    echo "" | tee -a "$LOG_FILE"

    # 2>&1 puts error output (e.g. tracebacks) into the log too. The pipeline's
    # own status is tee's, so the sync command's status is read from PIPESTATUS.
    "${batch_cmd[@]}" 2>&1 | tee -a "$LOG_FILE"
    batch_status=${PIPESTATUS[0]}
    if ((batch_status != 0)); then
      failed_batches=$((failed_batches + 1))
      echo "WARNING: batch $i exited with status $batch_status" | tee -a "$LOG_FILE"
    fi

    # Pause between batches (not after the last one)
    if ((i < TOTAL_BATCHES)); then
      echo "Pausing for 5 seconds between batches..." | tee -a "$LOG_FILE"
      sleep 5
    fi
  done

  if ((failed_batches > 0)); then
    echo "WARNING: $failed_batches of $TOTAL_BATCHES batches exited with a non-zero status" | tee -a "$LOG_FILE"
    return 1
  fi
  return 0
}

# The return status is deliberately ignored so the following steps still run
# after a failed batch; failures are reported in the log instead.
run_sync_batches || true
# Step 2: Fix Unknown Channel entries
echo "" | tee -a "$LOG_FILE"
echo "Step 2: Fixing 'Unknown Channel' entries..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run the fix unknown channels script. The command is kept in an array so it
# is executed word-for-word as logged.
FIX_CMD=(python fix_unknown_channels.py)
echo "Running: ${FIX_CMD[*]}" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# 2>&1 sends error output to the log as well. The pipeline's status is tee's,
# so the script's own exit status is taken from PIPESTATUS.
"${FIX_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
fix_status=${PIPESTATUS[0]}
if ((fix_status != 0)); then
  echo "WARNING: fix_unknown_channels.py exited with status $fix_status" | tee -a "$LOG_FILE"
fi
# Step 3: Verify all messages are in ChromaDB
echo "" | tee -a "$LOG_FILE"
echo "Step 3: Verifying all messages are in ChromaDB..." | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# Run comparison with the specified number of days for verification. The
# command is kept in an array so it is executed word-for-word as logged.
COMPARE_CMD=(python compare_messages.py --days "$DAYS_TO_SYNC")
echo "Running: ${COMPARE_CMD[*]}" | tee -a "$LOG_FILE"
echo "" | tee -a "$LOG_FILE"
# 2>&1 sends error output to the log as well. The pipeline's status is tee's,
# so the comparison's own exit status is taken from PIPESTATUS; without this a
# failed verification run looks the same as a successful one.
"${COMPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
compare_status=${PIPESTATUS[0]}
if ((compare_status != 0)); then
  echo "WARNING: compare_messages.py exited with status $compare_status" | tee -a "$LOG_FILE"
fi
# Closing footer: appended to the log and mirrored on the console by tee
log_rule="====================================================="
tee -a "$LOG_FILE" <<EOF

$log_rule
Sync and verification process completed at $(date)
See $LOG_FILE for complete log
$log_rule
EOF
# Console-only completion banner (not written to the log)
banner_rule="======================================================"
printf '%s\n' \
  "" \
  "$banner_rule" \
  " SYNC AND VERIFICATION PROCESS COMPLETED" \
  " $(date)" \
  " Log file: $LOG_FILE" \
  "$banner_rule"
#######################################
# If we activated a virtual environment, deactivate it.
# VIRTUAL_ENV alone is not enough to decide: it can be inherited from the
# parent environment, in which case no `deactivate` function exists in this
# shell and calling it would fail with "command not found". So the function
# must be defined as well.
# Globals (read): VIRTUAL_ENV
# Returns: 0 when there is nothing to do, otherwise the status of deactivate.
#######################################
deactivate_venv_if_active() {
  if [[ -n "${VIRTUAL_ENV:-}" ]] && declare -F deactivate >/dev/null; then
    deactivate
  fi
}
deactivate_venv_if_active