from typing import Dict, List, Tuple, Optional, DefaultDict
import pandas as pd
import numpy as np
from bayesian_elo import update_bayesian_elo as update_bayesian_elo
from bayesian_elo_noise_vectorized import update_bayesian_elo as update_bayesian_elo_noise
from google_elo_processor import GoogleEloProcessor
from models import (
    Metric, Session, Slate, Rating, Stimulus, QuerySet,
    MIN_COMPARISONS_PER_USER,
    elo_to_skill
)
from collections import defaultdict
import math
import os
import random
from multiprocessing import Pool
from tqdm import tqdm
from scipy.stats import spearmanr
import time
import argparse

# Registry of the Bayesian Elo update routines, keyed by the model name
# that callers pass in to select one.
model_factory = dict(
    bayesian_elo=update_bayesian_elo,
    bayesian_elo_noise=update_bayesian_elo_noise,
)

class DataProcessor:
    """Load a pairwise-comparison CSV and keep only sufficiently active raters."""

    def __init__(self, data_path: str):
        """Remember the CSV location; nothing is read until ``load_data``.

        Args:
            data_path: Path of the CSV file to load.
        """
        self.data_path = data_path
        self.df: Optional[pd.DataFrame] = None

    def load_data(self) -> bool:
        """Load the CSV, validate its schema and drop low-activity raters.

        The required columns are checked *before* any of them is used, so
        a malformed file yields a diagnostic and ``False`` rather than a
        ``KeyError``.

        Returns:
            True when the file exists and contains every required column
            (``self.df`` then holds only rows from raters returned by
            ``get_valid_users``); False otherwise.
        """
        print(f"Loading data from {self.data_path}")
        if not os.path.exists(self.data_path):
            print(f"Data file not found: {self.data_path}")
            return False

        self.df = pd.read_csv(self.data_path)

        # Validate required columns for Google Elo format
        required_columns = ['methodA', 'methodB', 'answerValue', 'answerer']
        if not all(col in self.df.columns for col in required_columns):
            print(f"Missing required columns. Expected: {required_columns}")
            print(f"Found columns: {list(self.df.columns)}")
            return False

        # Drop users with fewer than MIN_COMPARISONS_PER_USER comparisons.
        self.df = self.df[self.df['answerer'].isin(self.get_valid_users())]

        return True

    def get_valid_users(self) -> List[str]:
        """Return the raters having at least MIN_COMPARISONS_PER_USER rows.

        Every row of ``self.df`` counts as one comparison; no row is
        excluded from the count. Rows whose ``answerer`` is missing are
        ignored. Raters are returned in order of first appearance.

        Returns:
            List of rater identifiers, or an empty list when no data has
            been loaded yet.
        """
        if self.df is None:
            return []

        # sort=False keeps raters in order of first appearance.
        comparisons_per_user = self.df.groupby('answerer', sort=False).size()
        enough = comparisons_per_user >= MIN_COMPARISONS_PER_USER
        return comparisons_per_user[enough].index.tolist()
    
    



class EloWrapper:
    """Wrapper for ELO model operations."""

    def get_user_sessions(self, df: pd.DataFrame, valid_users: List[str]) -> Dict[str, List[Dict]]:
        """Group non-training ratings into per-session lists of two-item slates.

        Args:
            df: Frame with ``score``, ``session_id``, ``slate_id``,
                ``stimulus`` and ``is_training`` columns.
            valid_users: Session ids to keep; every other session is skipped.

        Returns:
            Mapping of session id to a list of
            ``{'slate_id': ..., 'ratings': [{'score', 'stimulus'}, ...]}``
            dicts. Slates that do not contain exactly two rows are dropped.
            An empty frame (even one without columns) yields ``{}``.
        """
        # An empty frame may not even carry the expected columns.
        if df.empty:
            return {}

        # Set membership instead of scanning the list for every session.
        allowed_sessions = set(valid_users)
        sessions = defaultdict(list)

        # Filter out training questions
        df_filtered = df[df['is_training'] == False]

        for session_id, session_group in df_filtered.groupby('session_id'):
            if session_id not in allowed_sessions:
                continue

            for slate_id, slate_group in session_group.groupby('slate_id'):
                # Only pairwise slates are usable.
                if len(slate_group) != 2:
                    continue

                ratings = [
                    {
                        'score': float(score),
                        # Keep just the file name; a missing stimulus becomes 'unknown'.
                        'stimulus': os.path.basename(str(stimulus).strip()) if pd.notna(stimulus) else 'unknown'
                    }
                    for score, stimulus in zip(slate_group['score'], slate_group['stimulus'])
                ]
                sessions[session_id].append({'slate_id': slate_id, 'ratings': ratings})

        return dict(sessions)

    def convert_to_elo_format(self, df: pd.DataFrame, valid_users: List[str]) -> "List[Session]":
        """Build one ``Session`` per rater from the output of ``get_user_sessions``.

        Args:
            df: Frame accepted by ``get_user_sessions``.
            valid_users: Session ids to keep.

        Returns:
            List of ``Session`` objects, each holding that rater's ``Slate``
            objects and having ``id`` set to the rater/session id.
        """
        elo_sessions = []
        sessions_dict = self.get_user_sessions(df, valid_users)

        for rater_id, slates in sessions_dict.items():
            elo_slates = [
                Slate([Rating(r['score'], Stimulus(r['stimulus'])) for r in slate['ratings']])
                for slate in slates
            ]
            session = Session(elo_slates, rater=rater_id)
            session.id = rater_id
            elo_sessions.append(session)

        return elo_sessions

    @staticmethod
    def get_elo_model_df(metric: "Metric") -> pd.DataFrame:
        """Return a DataFrame with the ELO model statistics.

        Args:
            metric: Object whose ``state['scores']`` maps a method name to a
                dict with ``value``, ``p005`` and ``p995`` entries.

        Returns:
            Frame with columns ``Method``, ``ELO Score``, ``Lower CI (99%)``
            and ``Upper CI (99%)``, sorted by descending score. It has no
            rows (but still these columns) when the metric has no scores.
        """
        columns = ['Method', 'ELO Score', 'Lower CI (99%)', 'Upper CI (99%)']
        if not hasattr(metric, 'state') or 'scores' not in metric.state:
            return pd.DataFrame(columns=columns)

        ranked = sorted(metric.state['scores'].items(),
                        key=lambda item: item[1]['value'],
                        reverse=True)
        rows = [
            {
                'Method': method,
                'ELO Score': score_data['value'],
                'Lower CI (99%)': score_data['p005'],
                'Upper CI (99%)': score_data['p995']
            }
            for method, score_data in ranked
        ]
        # Explicit columns keep the schema stable even when `rows` is empty.
        return pd.DataFrame(rows, columns=columns)


class EloProcessor:
    """Run either the Google Elo processor or a Bayesian Elo model on comparison data."""

    def __init__(self, df: pd.DataFrame, valid_users: List[str]):
        """Initialize the analyzer with data and valid users.

        Args:
            df: DataFrame containing the comparison data
            valid_users: List of valid user IDs
        """
        self.df = df
        self.valid_users = valid_users
        self.metric = None  # Store the metric for later access

    def process(self, df: Optional[pd.DataFrame] = None, valid_users: Optional[List[str]] = None, model: str = 'google_elo', uncertainty: bool = False):
        """Compute ratings with the selected model.

        Args:
            df: Comparison data in Google Elo format; defaults to the frame
                given to ``__init__``.
            valid_users: Raters to include; defaults to the list given to
                ``__init__``.
            model: ``'google_elo'`` or a key of ``model_factory``.
            uncertainty: Keep the confidence-interval columns when True.

        Returns:
            ``(results, elapsed)`` where ``results`` is a DataFrame and
            ``elapsed`` is whatever the selected model routine returned
            alongside it (presumably a run time -- confirm against the
            model implementations). When a Bayesian model yields no scores,
            ``results`` is an empty DataFrame; a tuple is still returned so
            callers can always unpack.

        Raises:
            KeyError: If ``model`` is neither ``'google_elo'`` nor a key of
                ``model_factory``.
        """
        # Fall back to the data captured at construction time.
        if df is None:
            df = self.df
        if valid_users is None:
            valid_users = self.valid_users

        if model == 'google_elo':
            # Use Google Elo processor directly
            google_processor = GoogleEloProcessor(df, valid_users)
            results, elapsed = google_processor.process(df, valid_users, uncertainty=uncertainty)
            return results, elapsed

        # Resolve the model first so an unknown name fails before any work is done.
        update_model = model_factory[model]

        # Convert to CLIC2024 format for Bayesian models
        clic2024_df = self._convert_google_elo_to_clic2024(df)
        elo_sessions = EloWrapper().convert_to_elo_format(clic2024_df, valid_users)
        final_metric = Metric()
        # Named `elapsed` rather than `time` to avoid shadowing the `time` module.
        elapsed = update_model(final_metric, QuerySet(elo_sessions))
        results = EloWrapper.get_elo_model_df(final_metric)
        if results.empty:
            return pd.DataFrame(), elapsed
        if not uncertainty:
            results = results.drop(columns=['Lower CI (99%)', 'Upper CI (99%)'])
        self.metric = final_metric
        return results, elapsed

    @staticmethod
    def _is_golden(flag) -> bool:
        """Tell whether an ``isGolden`` cell marks a golden question.

        Accepts real booleans (what ``pandas.read_csv`` produces for a
        true/false column) as well as the text ``'true'`` in any letter
        case; anything else, including missing values, is not golden.
        """
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        if isinstance(flag, (bool, np.bool_)):
            return bool(flag)
        return False

    def _convert_google_elo_to_clic2024(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert Google Elo format to CLIC2024 format for Bayesian models.

        Each non-golden comparison becomes two rows (one per method) that
        share the comparison's index label as ``slate_id``. The chosen
        method scores 1.0, the other -1.0, and both score 0.0 when
        ``answerValue`` is neither ``'A'`` nor ``'B'``.

        Args:
            df: Frame with ``methodA``, ``methodB``, ``answerValue`` and
                ``answerer`` columns, and optionally ``isGolden``.

        Returns:
            Frame with columns score, job_id, session_id, rater_id,
            slate_id, stimulus_id, stimulus, is_training. The columns are
            present even when no row survives.
        """
        columns = ['score', 'job_id', 'session_id', 'rater_id',
                   'slate_id', 'stimulus_id', 'stimulus', 'is_training']

        # Without an isGolden column no comparison is treated as golden.
        if 'isGolden' in df.columns:
            golden_flags = df['isGolden']
        else:
            golden_flags = [False] * len(df)

        converted_data = []
        comparisons = zip(df.index, df['methodA'], df['methodB'],
                          df['answerValue'], df['answerer'], golden_flags)
        for idx, method_a, method_b, answer, answerer, golden in comparisons:
            if self._is_golden(golden):
                continue

            score_a = 1.0 if answer == 'A' else (-1.0 if answer == 'B' else 0.0)
            score_b = 1.0 if answer == 'B' else (-1.0 if answer == 'A' else 0.0)

            # Emit one row per side of the comparison: A first, then B.
            for score, method in ((score_a, method_a), (score_b, method_b)):
                converted_data.append({
                    'score': score,
                    'job_id': 'nan',
                    'session_id': answerer,
                    'rater_id': answerer,
                    'slate_id': idx,
                    'stimulus_id': method,
                    'stimulus': method,
                    'is_training': False,
                })
        # Explicit columns keep the schema intact for an empty result.
        return pd.DataFrame(converted_data, columns=columns)

    
