"""
Sequence data processing utilities.

This module provides functions for processing sequence data for sports events.
"""

import pandas as pd
import numpy as np
from collections import defaultdict

def extract_sequences(df, event_types, player_id_col, max_sequence_length=10,
                      min_sequences_per_player=5):
    """
    Extract event sequences leading up to shot attempts.

    Events are processed game by game, ordered by period and game time.
    Every event whose type is listed in ``event_types`` is appended to a
    running sequence. When such an event is also a shot event ('SHOT',
    'GOAL', 'MISSED_SHOT' or 'BLOCKED_SHOT') the running sequence is emitted
    (if the shot row has a non-null player ID) and then reset. A shot event
    whose type is NOT in ``event_types`` is ignored, so include the shot
    types you want to terminate sequences. 'PERIOD_END' / 'GAME_END' events
    that are not in ``event_types`` discard the running sequence.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing play-by-play data. Must provide the columns
        'game_id', 'period', 'game_seconds' and 'event_type'. The frame is
        not modified.
    event_types : list of str
        List of event types to include in sequences
    player_id_col : str
        Name of the column containing player IDs
    max_sequence_length : int, optional (default=10)
        Maximum length of sequences to extract (the most recent events are
        kept). Must be at least 1.
    min_sequences_per_player : int, optional (default=5)
        Minimum number of sequences required for a player to be included

    Returns:
    --------
    sequences : list of dict
        List of sequence dictionaries, each containing:
        - player_id: ID of the player taking the final shot
        - events: List of event dictionaries (one per row, from ``to_dict``)
        - outcome: Outcome of the final event (1 for 'GOAL', 0 otherwise)

    Raises:
    -------
    ValueError
        If ``max_sequence_length`` is smaller than 1.
    """
    # A non-positive limit would make the ``[-limit:]`` slice below keep the
    # wrong events (``[-0:]`` is the whole list), so reject it up front.
    if max_sequence_length < 1:
        raise ValueError("max_sequence_length must be at least 1")

    # Sets give constant-time membership tests inside the per-event loop.
    relevant_types = frozenset(event_types)
    shot_types = frozenset(['SHOT', 'GOAL', 'MISSED_SHOT', 'BLOCKED_SHOT'])
    reset_types = frozenset(['PERIOD_END', 'GAME_END'])

    # sort_values returns a new DataFrame, so the caller's frame is left
    # untouched without needing an explicit copy.
    ordered = df.sort_values(['game_id', 'period', 'game_seconds'])

    sequences = []

    for _, game_df in ordered.groupby('game_id'):
        # Sequences never span games.
        current_sequence = []

        for _, event in game_df.iterrows():
            event_type = event['event_type']

            if event_type not in relevant_types:
                # Period/game boundaries break the chain of play.
                if event_type in reset_types:
                    current_sequence = []
                continue

            current_sequence.append(event.to_dict())

            if event_type not in shot_types:
                continue

            # Only shots attributed to a player produce a sequence.
            if player_id_col in event and pd.notna(event[player_id_col]):
                sequences.append({
                    'player_id': event[player_id_col],
                    # Slice keeps the most recent events and also copies.
                    'events': current_sequence[-max_sequence_length:],
                    'outcome': 1 if event_type == 'GOAL' else 0,
                })

            # A shot always ends the running sequence, attributed or not.
            current_sequence = []

    # Keep only players with enough sequences; a dict lookup per sequence
    # avoids scanning a list of valid players for every sequence.
    player_sequence_counts = defaultdict(int)
    for sequence in sequences:
        player_sequence_counts[sequence['player_id']] += 1

    return [sequence for sequence in sequences
            if player_sequence_counts[sequence['player_id']]
            >= min_sequences_per_player]

def extract_sequence_features(sequence, feature_extractors):
    """
    Apply every feature extractor to a single sequence.

    Parameters:
    -----------
    sequence : dict
        Sequence dictionary containing:
        - player_id: ID of the player
        - events: List of event dictionaries
        - outcome: Outcome of the final event (1 for success, 0 for failure)
    feature_extractors : list of callable
        Functions that each accept the sequence as their only argument and
        return one feature value

    Returns:
    --------
    features : list
        Feature values, in the same order as ``feature_extractors``
    """
    # One call per extractor, results kept in extractor order.
    return [extract(sequence) for extract in feature_extractors]

def create_sequence_datasets(sequences, feature_extractors):
    """
    Create datasets from sequences.

    Parameters:
    -----------
    sequences : list of dict
        List of sequence dictionaries (each with 'player_id', 'events' and
        'outcome' keys)
    feature_extractors : list of callable
        List of feature extractor functions

    Returns:
    --------
    X : ndarray, shape (n_sequences, n_features)
        Feature matrix (float)
    y : ndarray, shape (n_sequences,)
        Target vector (float)
    player_ids : ndarray, shape (n_sequences,)
        Player IDs. A float array when every ID converts to a number;
        otherwise an object array holding the IDs unchanged (e.g. string
        IDs), instead of failing on the float conversion.
    """
    n_sequences = len(sequences)
    n_features = len(feature_extractors)

    X = np.zeros((n_sequences, n_features))
    y = np.zeros(n_sequences)
    raw_ids = []

    for i, sequence in enumerate(sequences):
        # Each row holds the extractor outputs for one sequence.
        X[i] = extract_sequence_features(sequence, feature_extractors)
        y[i] = sequence['outcome']
        raw_ids.append(sequence['player_id'])

    try:
        # Numeric IDs keep the float representation callers already receive.
        player_ids = np.array([float(pid) for pid in raw_ids], dtype=float)
    except (TypeError, ValueError):
        # Non-numeric IDs cannot live in a float array; store them as-is.
        # Filling element by element guarantees a 1-D result.
        player_ids = np.empty(n_sequences, dtype=object)
        for i, pid in enumerate(raw_ids):
            player_ids[i] = pid

    return X, y, player_ids

def create_player_sequence_datasets(sequences, feature_extractors):
    """
    Build one (X, y) dataset per player.

    Parameters:
    -----------
    sequences : list of dict
        List of sequence dictionaries
    feature_extractors : list of callable
        List of feature extractor functions

    Returns:
    --------
    player_datasets : dict
        Dictionary mapping each player ID to the (X, y) tuple produced by
        ``create_sequence_datasets`` for that player's sequences
    """
    # Bucket the sequences by player, keeping first-seen player order.
    by_player = {}
    for seq in sequences:
        by_player.setdefault(seq['player_id'], []).append(seq)

    # Build each player's dataset; the per-row player ID array is redundant
    # here because the dictionary key already identifies the player.
    player_datasets = {}
    for pid, seqs in by_player.items():
        features, targets, _ = create_sequence_datasets(seqs, feature_extractors)
        player_datasets[pid] = (features, targets)

    return player_datasets

# Example feature extractors

def sequence_length(sequence):
    """Return how many events the sequence contains."""
    events = sequence['events']
    return len(events)

def time_since_last_event(sequence):
    """
    Return the 'game_seconds' gap between the final two events.

    Returns 0 when the sequence has fewer than two events. An event without
    a 'game_seconds' key is treated as occurring at 0.
    """
    events = sequence['events']
    if len(events) >= 2:
        final_time = events[-1].get('game_seconds', 0)
        previous_time = events[-2].get('game_seconds', 0)
        return final_time - previous_time

    # Nothing to compare against.
    return 0

def count_event_type(sequence, event_type):
    """
    Return the number of events in the sequence whose 'event_type' equals
    ``event_type``. Events without an 'event_type' key never match a
    non-None ``event_type``.
    """
    matches = 0
    for entry in sequence['events']:
        if entry.get('event_type') == event_type:
            matches += 1
    return matches

def has_event_type(sequence, event_type):
    """
    Return True if at least one event in the sequence has an 'event_type'
    equal to ``event_type``, otherwise False.
    """
    for entry in sequence['events']:
        if entry.get('event_type') == event_type:
            # First match is enough.
            return True
    return False

def last_event_type(sequence, event_type):
    """
    Return True if the final event's 'event_type' equals ``event_type``.

    An empty sequence yields False.
    """
    events = sequence['events']
    return events[-1].get('event_type') == event_type if events else False


