"""
Reranker Service for improving search results by reranking candidate documents.

This service uses a custom reranking approach combining multiple signals
to improve the relevance of search results.
"""
import logging
import re
from collections import Counter
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

# Module-level logger shared by everything in this file. Only the named logger
# is obtained here; no handlers or levels are configured at this point.
logger = logging.getLogger("reranker_service")

class RerankerService:
    """Service for reranking search results using a custom multi-signal score.

    Every method is static; the class acts as a namespace plus a process-wide
    result cache (``_rerank_cache``) shared by all callers.
    """

    # Weights of the individual signals in the final score. They can be
    # adjusted based on observed ranking quality.
    _TERM_WEIGHT = 0.35
    _PHRASE_WEIGHT = 0.30
    _SEMANTIC_WEIGHT = 0.25
    _POSITION_WEIGHT = 0.10

    # Compiled once instead of on every tokenization call.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b')

    # Cache of fully ranked document lists, keyed by (query, sorted id reprs).
    # The complete ranking is stored so any later top_k can be served from it.
    _rerank_cache = {}

    # Upper bound on cached rankings; the oldest entry is evicted first.
    _MAX_CACHE_ENTRIES = 256

    @staticmethod
    def rerank(query: str, documents: List[Dict], top_k: int = 20) -> List[Dict]:
        """
        Rerank documents based on relevance to query using a multi-factor approach.

        When there are no more than ``top_k`` documents the input list is
        returned as-is: nothing is scored, reordered or copied. Otherwise the
        returned documents are shallow copies carrying an added 'score' key,
        ordered from highest to lowest score. The input list and its
        dictionaries are never modified.

        Rankings are cached per (query, set of document ids). The cache assumes
        that a given id always refers to the same content; documents without an
        'id' are never cached.

        Args:
            query (str): Query text
            documents (List[Dict]): List of document dictionaries with 'id' and 'content'
            top_k (int): Number of results to return

        Returns:
            List[Dict]: Reranked documents (at most top_k of them)
        """
        # A negative top_k would otherwise slice from the end of the list.
        top_k = max(top_k, 0)

        # Return all documents if there are no more than top_k
        if len(documents) <= top_k:
            return documents

        cache_key = RerankerService._make_cache_key(query, documents)
        if cache_key is not None:
            cached = RerankerService._rerank_cache.get(cache_key)
            if cached is not None:
                # Hand out copies so callers cannot corrupt the cached ranking.
                return [doc.copy() for doc in cached[:top_k]]

        try:
            ranked = RerankerService._score_documents(query, documents)
        except Exception as exc:
            # Deliberate best-effort boundary: reranking must never break the
            # search request, so log with traceback and fall back.
            logger.exception("Error reranking documents: %s", exc)

            # Fallback: simple sorting based on combined_score if available.
            # sorted() leaves the caller's list untouched.
            fallback = sorted(
                documents,
                key=lambda doc: doc.get('combined_score') or 0,
                reverse=True,
            )
            return fallback[:top_k]

        if cache_key is not None:
            RerankerService._remember(cache_key, ranked)

        return [doc.copy() for doc in ranked[:top_k]]

    @staticmethod
    def _make_cache_key(query: str, documents: List[Dict]) -> Optional[Tuple]:
        """
        Build a hashable cache key, or None when the call must not be cached.

        Args:
            query (str): Query text
            documents (List[Dict]): Candidate documents

        Returns:
            Optional[Tuple]: (query, sorted id reprs), or None if the query is
            not a string or any document lacks a usable 'id'
        """
        if not isinstance(query, str):
            return None

        doc_ids = [doc.get('id') for doc in documents]
        # Documents without ids would all collapse onto the same key.
        if any(doc_id is None or doc_id == '' for doc_id in doc_ids):
            return None

        # repr() makes ids of mixed types sortable and keeps 1 and '1' distinct.
        return (query, tuple(sorted(repr(doc_id) for doc_id in doc_ids)))

    @staticmethod
    def _remember(cache_key: Tuple, ranked: List[Dict]) -> None:
        """
        Store a full ranking in the cache, evicting the oldest entry if full.

        Args:
            cache_key (Tuple): Key produced by _make_cache_key
            ranked (List[Dict]): Complete ranking, best document first
        """
        cache = RerankerService._rerank_cache
        if cache_key not in cache and len(cache) >= RerankerService._MAX_CACHE_ENTRIES:
            # dicts preserve insertion order, so the first key is the oldest.
            del cache[next(iter(cache))]
        cache[cache_key] = ranked

    @staticmethod
    def _score_documents(query: str, documents: List[Dict]) -> List[Dict]:
        """
        Score every document against the query and sort by descending score.

        Args:
            query (str): Query text
            documents (List[Dict]): Candidate documents

        Returns:
            List[Dict]: Shallow copies of all documents with a 'score' key,
            best first; ties keep their original relative order
        """
        query_terms = RerankerService._tokenize(query)
        query_lower = query.lower()

        scored_docs = []
        for doc in documents:
            # Treat a missing or None content as empty text.
            content = doc.get('content') or ''
            content_lower = content.lower()

            # 1. Term frequency scoring
            term_score = RerankerService._calculate_term_score(content_lower, query_terms)
            # 2. Exact phrase matching
            phrase_score = RerankerService._calculate_phrase_score(content_lower, query_lower)
            # 3. Semantic similarity (use existing score if available)
            semantic_score = RerankerService._get_semantic_score(doc)
            # 4. Bonus for query terms appearing early in the document
            position_score = RerankerService._calculate_position_score(content_lower, query_terms)
            # 5. Document length normalization
            length_factor = RerankerService._calculate_length_factor(content)

            final_score = (
                RerankerService._TERM_WEIGHT * term_score
                + RerankerService._PHRASE_WEIGHT * phrase_score
                + RerankerService._SEMANTIC_WEIGHT * semantic_score
                + RerankerService._POSITION_WEIGHT * position_score
            ) * length_factor

            scored_doc = doc.copy()
            scored_doc['score'] = final_score
            scored_docs.append(scored_doc)

        # Stable sort, highest score first.
        scored_docs.sort(key=lambda doc: doc['score'], reverse=True)
        return scored_docs

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """
        Tokenize a string into lowercase word tokens.

        Args:
            text (str): Text to tokenize

        Returns:
            List[str]: List of tokens (runs of word characters), punctuation dropped
        """
        return RerankerService._TOKEN_PATTERN.findall(text.lower())

    @staticmethod
    def _calculate_term_score(content: str, query_terms: List[str]) -> float:
        """
        Calculate term frequency score.

        The score is the number of content tokens equal to a query term
        (a term repeated in the query counts once per repetition), divided by
        the number of content tokens.

        Args:
            content (str): Document content
            query_terms (List[str]): Query terms

        Returns:
            float: Term frequency score; 0.0 for content without tokens
        """
        content_tokens = RerankerService._tokenize(content)
        if not content_tokens:
            return 0.0

        # Count every token once instead of rescanning the list per term.
        token_counts = Counter(content_tokens)
        matches = sum(token_counts[term] for term in query_terms)

        # Normalize by document length
        return matches / len(content_tokens)

    @staticmethod
    def _calculate_phrase_score(content: str, query: str) -> float:
        """
        Calculate exact phrase matching score.

        Each exact occurrence of the query is worth 2.0. If there is none and
        the query is longer than 5 characters, every occurrence of any
        4-character slice of the query is worth 0.2 instead.

        Args:
            content (str): Document content
            query (str): Original query

        Returns:
            float: Phrase matching score, capped at 10.0; 0.0 for a blank query
        """
        query = query.strip()
        if not query:
            # str.count('') returns len(content) + 1, which would max out the
            # score for every document.
            return 0.0

        exact_matches = content.count(query)
        score = exact_matches * 2.0  # Higher weight for exact matches

        # Check for partial matches if no exact matches (longer queries only)
        if exact_matches == 0 and len(query) > 5:
            for start in range(len(query) - 3):
                score += 0.2 * content.count(query[start:start + 4])

        return min(score, 10.0)  # Cap to avoid extremely high scores

    @staticmethod
    def _get_semantic_score(doc: Dict) -> float:
        """
        Extract semantic similarity score from document.

        Args:
            doc (Dict): Document

        Returns:
            float: 'vector_score' if present and not None, else 'combined_score'
            if present and not None, else the neutral default 0.5
        """
        for key in ('vector_score', 'combined_score'):
            value = doc.get(key)
            # A None score would break the weighted sum, so skip it.
            if value is not None:
                return value

        return 0.5  # Default middle value if no scores available

    @staticmethod
    def _calculate_position_score(content: str, query_terms: List[str]) -> float:
        """
        Calculate score based on position of match in document.
        Earlier matches often indicate higher relevance.

        Each query term found as a whole token within the first 20% of the
        document's tokens adds 0.5.

        Args:
            content (str): Document content
            query_terms (List[str]): Query terms

        Returns:
            float: Position score, capped at 1.0
        """
        content_tokens = RerankerService._tokenize(content)

        # Whole-token matching keeps this consistent with the term score and
        # stops short terms from matching inside longer words.
        leading_tokens = set(content_tokens[:len(content_tokens) // 5])

        hits = sum(1 for term in query_terms if term in leading_tokens)
        return min(0.5 * hits, 1.0)  # Normalize to maximum of 1.0

    @staticmethod
    def _calculate_length_factor(content: str) -> float:
        """
        Calculate length normalization factor.
        Prevents extremely short documents from ranking too high.

        Args:
            content (str): Document content

        Returns:
            float: 0.7 below 10 tokens, 1.1 for 20 to 300 tokens, 1.0 otherwise
        """
        token_count = len(RerankerService._tokenize(content))

        # Penalize very short documents
        if token_count < 10:
            return 0.7

        # Slightly favor mid-sized documents
        if 20 <= token_count <= 300:
            return 1.1

        return 1.0  # Neutral factor for other documents