"""
Preprocessing utilities for sports data.

This module provides functions for preprocessing raw sports data into a format
suitable for modeling.
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def preprocess_nhl_shots(df, min_shots_per_player=20):
    """
    Clean raw NHL shot records and derive timing and score features.

    The input frame is not modified; all work happens on a copy.

    Parameters:
    -----------
    df : pandas.DataFrame
        Shot-level data. The columns read here are 'period_time'
        ('minutes:seconds' strings), 'period', 'date_time', 'team_id',
        'home_team_id', 'goals_home', 'goals_away' and 'shooter_id'.
    min_shots_per_player : int, optional (default=20)
        Shooters with fewer rows than this are dropped.

    Returns:
    --------
    processed_df : pandas.DataFrame
        Copy of the input with 'period_seconds', 'game_seconds',
        'period_seconds_remaining' and 'score_differential' added,
        'date_time' converted to datetime, low-volume shooters removed,
        rows ordered by shooter, date and game time, and a fresh 0..n-1 index.
    """
    # NOTE(review): every period is treated as 1200 seconds long; periods of
    # a different length (e.g. overtime) are not special-cased.
    period_length = 1200

    def _clock_to_seconds(clock):
        # Anything that is not a string (e.g. a missing value) counts as 0.
        if not isinstance(clock, str):
            return 0
        parts = clock.split(':')
        return int(parts[0]) * 60 + int(parts[1])

    shots = df.copy()

    # Timing features.
    elapsed_in_period = shots['period_time'].apply(_clock_to_seconds)
    shots['period_seconds'] = elapsed_in_period
    shots['game_seconds'] = (shots['period'] - 1) * period_length + elapsed_in_period
    shots['period_seconds_remaining'] = period_length - elapsed_in_period

    shots['date_time'] = pd.to_datetime(shots['date_time'])

    # Score differential from the shooting team's point of view.
    shooter_is_home = shots['team_id'] == shots['home_team_id']
    home_lead = shots['goals_home'] - shots['goals_away']
    away_lead = shots['goals_away'] - shots['goals_home']
    shots['score_differential'] = np.where(shooter_is_home, home_lead, away_lead)

    # Keep only shooters that meet the volume threshold.
    shots_per_player = shots['shooter_id'].value_counts()
    qualified = shots_per_player.loc[shots_per_player >= min_shots_per_player].index
    shots = shots[shots['shooter_id'].isin(qualified)]

    # Chronological order within each shooter, then renumber the rows.
    return (
        shots
        .sort_values(['shooter_id', 'date_time', 'game_seconds'])
        .reset_index(drop=True)
    )

def preprocess_mlb_atbats(df, min_atbats_per_player=20):
    """
    Preprocess MLB at-bat data.

    The input frame is not modified; all work happens on a copy.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing MLB at-bat data. Must contain 'inning',
        'batter_id', 'game_id' and 'event_idx'. The columns 'half_inning',
        'home_score' and 'away_score' are optional (see Returns).
    min_atbats_per_player : int, optional (default=20)
        Minimum number of at-bats required for a player to be included

    Returns:
    --------
    processed_df : pandas.DataFrame
        Preprocessed DataFrame with:
        - 'inning' converted to numeric (unparseable values become NaN),
        - 'is_late_inning' set to True where inning >= 7,
        - 'score_differential' added only when 'half_inning', 'home_score'
          and 'away_score' are all present,
        - batters below the at-bat threshold removed,
        - rows sorted by batter, game, inning and event index, with a
          fresh 0..n-1 index.
    """
    # Make a copy to avoid modifying the original
    processed_df = df.copy()

    # Convert inning to numeric
    processed_df['inning'] = pd.to_numeric(processed_df['inning'], errors='coerce')

    # Flag late-game situations (NaN innings compare as False)
    processed_df['is_late_inning'] = processed_df['inning'] >= 7

    # Score differential is an optional feature: compute it only when every
    # column it reads is available, so a frame that has scores but no
    # 'half_inning' is skipped here instead of failing with a KeyError.
    score_inputs = {'half_inning', 'home_score', 'away_score'}
    if score_inputs.issubset(processed_df.columns):
        # away - home when half_inning == 'top', home - away otherwise.
        # NOTE(review): this is the batting team's lead assuming the away
        # team bats in the top half -- confirm against the data source.
        processed_df['score_differential'] = np.where(
            processed_df['half_inning'] == 'top',
            processed_df['away_score'] - processed_df['home_score'],
            processed_df['home_score'] - processed_df['away_score']
        )

    # Filter for players with minimum number of at-bats
    player_atbat_counts = processed_df['batter_id'].value_counts()
    valid_players = player_atbat_counts[player_atbat_counts >= min_atbats_per_player].index
    processed_df = processed_df[processed_df['batter_id'].isin(valid_players)]

    # Sort by player, game, inning, and at-bat
    processed_df = processed_df.sort_values(['batter_id', 'game_id', 'inning', 'event_idx'])

    # Reset index
    processed_df = processed_df.reset_index(drop=True)

    return processed_df

def preprocess_nba_shots(df, min_shots_per_player=20):
    """
    Clean raw NBA shot records and derive timing and score features.

    The input frame is not modified; all work happens on a copy.

    Parameters:
    -----------
    df : pandas.DataFrame
        Shot-level data. The columns read here are 'play_clock'
        ('minutes:seconds' strings), 'period', 'team_id', 'home_team_id',
        'home_score', 'away_score', 'player1_id' and 'game_id'.
    min_shots_per_player : int, optional (default=20)
        Players with fewer rows than this are dropped.

    Returns:
    --------
    processed_df : pandas.DataFrame
        Copy of the input with 'play_seconds', 'game_seconds',
        'period_seconds_remaining' and 'score_differential' added,
        low-volume players removed, rows ordered by player, game and game
        time, and a fresh 0..n-1 index.
    """
    # NOTE(review): every period is treated as 720 seconds long; periods of
    # a different length (e.g. overtime) are not special-cased.
    seconds_per_period = 720

    def _parse_clock(value):
        # Anything that is not a string (e.g. a missing value) counts as 0.
        if isinstance(value, str):
            fields = value.split(':')
            return int(fields[0]) * 60 + int(fields[1])
        return 0

    shots = df.copy()

    # Timing features derived from the clock string.
    clock_seconds = shots['play_clock'].apply(_parse_clock)
    shots['play_seconds'] = clock_seconds
    shots['game_seconds'] = (shots['period'] - 1) * seconds_per_period + clock_seconds
    shots['period_seconds_remaining'] = seconds_per_period - clock_seconds

    # Score differential from the shooting team's point of view.
    at_home = shots['team_id'] == shots['home_team_id']
    margin_if_home = shots['home_score'] - shots['away_score']
    margin_if_away = shots['away_score'] - shots['home_score']
    shots['score_differential'] = np.where(at_home, margin_if_home, margin_if_away)

    # Keep only players that meet the volume threshold.
    attempts = shots['player1_id'].value_counts()
    eligible = attempts.loc[attempts >= min_shots_per_player].index
    shots = shots[shots['player1_id'].isin(eligible)]

    # Order within each player by game and time, then renumber the rows.
    return (
        shots
        .sort_values(['player1_id', 'game_id', 'game_seconds'])
        .reset_index(drop=True)
    )

def calculate_rolling_stats(df, player_id_col, outcome_col, window_sizes=(5, 10, 20)):
    """
    Calculate rolling statistics for each player.

    For every window size ``w`` two columns are added, both computed within
    each player's rows after ordering by 'date_time'. Each window ends at,
    and includes, the current row, and shorter windows are used at the start
    of a player's history (``min_periods=1``):

    - ``rolling_{outcome_col}_{w}``: mean of the outcome over the window.
    - ``streak_{w}``: sum of the outcome over the window, i.e. the number of
      successes in the last ``w`` rows for a 0/1 outcome (not the length of
      a consecutive run). Missing values are forward-filled per player.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing shot data; must have a 'date_time' column
    player_id_col : str
        Name of the column containing player IDs
    outcome_col : str
        Name of the column containing outcome (1 for success, 0 for failure)
    window_sizes : iterable of int, optional (default=(5, 10, 20))
        Window sizes for rolling statistics

    Returns:
    --------
    df : pandas.DataFrame
        New DataFrame sorted by player and 'date_time' (original index
        labels are kept) with the rolling statistics added. The input frame
        is not modified.
    """
    # sort_values returns a new frame, so the caller's data is never touched.
    df = df.sort_values([player_id_col, 'date_time'])

    # The grouping does not depend on the window, so build it once.
    outcomes_by_player = df.groupby(player_id_col)[outcome_col]

    for window in window_sizes:
        # Rolling success rate
        df[f'rolling_{outcome_col}_{window}'] = outcomes_by_player.transform(
            lambda x: x.rolling(window, min_periods=1).mean()
        )

        # Rolling success count, forward-filled within the player.
        # .ffill() replaces the deprecated fillna(method='ffill').
        df[f'streak_{window}'] = outcomes_by_player.transform(
            lambda x: x.rolling(window, min_periods=1).sum().ffill()
        )

    return df

def split_train_test(df, player_id_col, test_size=0.2, random_state=42):
    """
    Split data into training and testing sets by player.

    Players, not rows, are assigned to a side, so every row of a given
    player lands in exactly one of the two returned frames.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing shot data
    player_id_col : str
        Name of the column containing player IDs
    test_size : float, optional (default=0.2)
        Proportion of unique players to assign to the test set. The player
        count is rounded down, so the test set is empty when
        ``len(players) * test_size < 1``.
    random_state : int, optional (default=42)
        Random seed for reproducibility

    Returns:
    --------
    train_df : pandas.DataFrame
        Training data
    test_df : pandas.DataFrame
        Testing data
    """
    # Defensive copy: the returned frames are slices of this copy, not of
    # the caller's frame.
    df = df.copy()

    # Unique players as a plain, writable ndarray so the in-place shuffle
    # below works regardless of the column's dtype.
    players = np.array(df[player_id_col].unique())

    # Use a private generator instead of np.random.seed so the process-wide
    # NumPy random state is left untouched. RandomState with the same seed
    # yields the same permutation as the legacy seed-then-shuffle sequence.
    rng = np.random.RandomState(random_state)
    rng.shuffle(players)

    n_test = int(len(players) * test_size)
    test_players = players[:n_test]
    train_players = players[n_test:]

    # Split data
    train_df = df[df[player_id_col].isin(train_players)]
    test_df = df[df[player_id_col].isin(test_players)]

    return train_df, test_df

def create_player_datasets(df, player_id_col, outcome_col, feature_cols):
    """
    Build one (features, target) pair of arrays per player.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing shot data
    player_id_col : str
        Name of the column containing player IDs
    outcome_col : str
        Name of the column containing outcome (1 for success, 0 for failure)
    feature_cols : list of str
        List of feature column names

    Returns:
    --------
    player_datasets : dict
        Maps each distinct player ID, in order of first appearance, to a
        tuple (X, y): X holds that player's rows of the feature columns and
        y the matching outcome values, both as NumPy arrays.
    """
    # Work on a private copy of the input.
    frame = df.copy()

    def _arrays_for(player_id):
        # Select this player's rows, then pull out features and target.
        rows = frame.loc[frame[player_id_col] == player_id]
        return rows[feature_cols].values, rows[outcome_col].values

    return {
        player_id: _arrays_for(player_id)
        for player_id in frame[player_id_col].unique()
    }


