"""
Data loading utilities for sports data.

This module provides functions for loading raw data from various sports leagues
and converting them to a standardized format. It supports both traditional data loading
from JSON files and loading data collected via the crawlers.
"""

import os
import json
import pandas as pd
import numpy as np
from datetime import datetime
import logging
from typing import Optional, List, Dict, Any, Union, Tuple

# Setup logging
# Module-level logger named after this module; no handlers or levels are
# configured here.
logger = logging.getLogger(__name__)

# Try to import converters
# The converters come from the sibling ``converters`` module. If that import
# fails, this module still loads: CONVERTERS_AVAILABLE is set to False and a
# warning is logged, so the functions below can check the flag before they
# call any convert_* function.
try:
    from .converters import convert_mlb_data, convert_nba_data, convert_nhl_data
    CONVERTERS_AVAILABLE = True
except ImportError:
    CONVERTERS_AVAILABLE = False
    logger.warning("Data converters not available. Some functionality will be limited.")

def _nhl_play_record(play, game_id, season):
    """
    Flatten one entry of an NHL game's ``allPlays`` list into a row dictionary.

    Parameters:
    -----------
    play : dict
        A single play; must contain 'about' (with 'eventIdx') and 'result'
    game_id : object
        Value stored in the 'game_id' column
    season : str
        Value stored in the 'season' column

    Returns:
    --------
    record : dict
        One row of play-by-play data

    Raises:
    -------
    KeyError
        If 'about', 'result', 'about.eventIdx' or a player field is missing
    """
    about = play['about']
    result = play['result']
    goals = about.get('goals', {})
    coordinates = play.get('coordinates', {})

    record = {
        'game_id': game_id,
        'season': season,
        'event_idx': about['eventIdx'],
        'event_id': result.get('eventId', ''),
        'event_type': result.get('eventTypeId', ''),
        'period': about.get('period', 0),
        'period_time': about.get('periodTime', ''),
        'period_time_remaining': about.get('periodTimeRemaining', ''),
        'date_time': about.get('dateTime', ''),
        'goals_away': goals.get('away', 0),
        'goals_home': goals.get('home', 0),
        'description': result.get('description', ''),
        # Coordinates are optional; plays without them get None
        'x': coordinates.get('x', None),
        'y': coordinates.get('y', None),
    }

    # Player columns are named after each entry's playerType
    # (e.g. 'Shooter_id', 'Goalie_name').
    # NOTE(review): when a play lists several players with the same
    # playerType, later entries overwrite earlier ones - confirm this is
    # acceptable for the data being loaded.
    for player_info in play.get('players', []):
        player_type = player_info['playerType']
        record[f'{player_type}_id'] = player_info['player']['id']
        record[f'{player_type}_name'] = player_info['player']['fullName']

    # Team columns are only present for plays that carry a 'team' entry
    if 'team' in play:
        record['team_id'] = play['team'].get('id', None)
        record['team_name'] = play['team'].get('name', None)

    return record


def load_nhl_data(data_path, seasons=None, force_rebuild=False):
    """
    Load NHL play-by-play data.

    Reads ``<data_path>/processed/nhl/play_by_play.csv`` when it exists;
    otherwise builds the table from the JSON game files under
    ``<data_path>/raw/nhl_nhle/<season>/games`` and writes that CSV.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing NHL data
    seasons : list of str, optional
        List of seasons to load (e.g., ['2019', '2020']). Defaults to
        ['2022', '2023', '2024']. Only used when building from raw files:
        an existing processed file is returned as-is, whatever seasons it
        contains.
    force_rebuild : bool, optional (default=False)
        If True, rebuild the data from raw files even if processed data exists

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame containing play-by-play data (empty if no plays were found)

    Notes:
    ------
    Game files that cannot be read or parsed are reported and skipped.
    The processed file is only written when at least one play was loaded,
    so an empty result never replaces or creates the cache.
    """
    # Define path for processed data
    processed_file = os.path.join(data_path, 'processed', 'nhl', 'play_by_play.csv')

    # Check if processed data exists and we don't need to rebuild
    if os.path.exists(processed_file) and not force_rebuild:
        print(f"Loading processed NHL data from {processed_file}")
        try:
            return pd.read_csv(processed_file)
        except pd.errors.EmptyDataError:
            # A cache file without any columns cannot be parsed; rebuild it
            print(f"Warning: {processed_file} is empty. Rebuilding from raw files.")

    print("Building NHL data from raw files...")

    # Define seasons to load if not provided
    if seasons is None:
        seasons = ['2022', '2023', '2024']

    # Initialize empty list to store data
    all_data = []

    # Load data for each season
    for season in seasons:
        season_path = os.path.join(data_path, 'raw', 'nhl_nhle', season, 'games')

        # Check if the directory exists
        if not os.path.exists(season_path):
            print(f"Warning: Directory {season_path} does not exist. Skipping.")
            continue

        # Sort the listing so the row order does not depend on the file system
        game_files = sorted(f for f in os.listdir(season_path) if f.endswith('.json'))

        # Load each game file
        for game_file in game_files:
            file_path = os.path.join(season_path, game_file)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    game_data = json.load(f)

                # Extract play-by-play data
                if 'liveData' in game_data and 'plays' in game_data['liveData']:
                    for play in game_data['liveData']['plays']['allPlays']:
                        all_data.append(
                            _nhl_play_record(play, game_data['gamePk'], season))
            except Exception as e:
                # Best effort: report the broken file and keep loading the rest
                print(f"Error loading {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    if df.empty:
        # Writing an empty frame would leave a cache file that cannot be read back
        print("Warning: No NHL plays found. Processed file not written.")
    else:
        # Ensure directory exists
        os.makedirs(os.path.dirname(processed_file), exist_ok=True)

        # Save processed data
        df.to_csv(processed_file, index=False)

    print(f"NHL data loaded: {len(df)} plays")

    return df

def _mlb_play_record(play, game_id, season):
    """
    Flatten one entry of an MLB game's ``allPlays`` list into a row dictionary.

    Parameters:
    -----------
    play : dict
        A single play; every field is optional and falls back to a default
    game_id : object
        Value stored in the 'game_id' column
    season : str
        Value stored in the 'season' column

    Returns:
    --------
    record : dict
        One row of play-by-play data
    """
    result = play.get('result', {})
    about = play.get('about', {})
    count = play.get('count', {})

    record = {
        'game_id': game_id,
        'season': season,
        'event_idx': play.get('atBatIndex', ''),
        'event_type': result.get('eventType', ''),
        'description': result.get('description', ''),
        'inning': about.get('inning', 0),
        'inning_half': about.get('halfInning', ''),
        'is_top_inning': about.get('isTopInning', False),
        'outs': count.get('outs', 0),
        'balls': count.get('balls', 0),
        'strikes': count.get('strikes', 0),
    }

    # Add player information if available
    if 'matchup' in play:
        batter = play['matchup'].get('batter', {})
        pitcher = play['matchup'].get('pitcher', {})
        record['batter_id'] = batter.get('id', None)
        record['batter_name'] = batter.get('fullName', None)
        record['pitcher_id'] = pitcher.get('id', None)
        record['pitcher_name'] = pitcher.get('fullName', None)

    # Add hit data if available (read from the play's top-level 'hitData' key)
    if 'hitData' in play:
        hit_data = play['hitData']
        record['launch_speed'] = hit_data.get('launchSpeed', None)
        record['launch_angle'] = hit_data.get('launchAngle', None)
        record['hit_distance'] = hit_data.get('totalDistance', None)

    # Add outcome information; a play counts as a hit for these four events only
    if 'result' in play:
        event = play['result'].get('event', '')
        record['event'] = event
        record['is_hit'] = event in ['Single', 'Double', 'Triple', 'Home Run']

    return record


def load_mlb_data(data_path, seasons=None, force_rebuild=False):
    """
    Load MLB play-by-play data.

    Reads ``<data_path>/processed/mlb/play_by_play.csv`` when it exists;
    otherwise builds the table from the JSON game files under
    ``<data_path>/raw/mlb/<season>-01-01_<season>-12-31/games`` and writes
    that CSV.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing MLB data
    seasons : list of str, optional
        List of seasons to load (e.g., ['2022', '2023']). Defaults to
        ['2022', '2023', '2024']. Only used when building from raw files:
        an existing processed file is returned as-is, whatever seasons it
        contains.
    force_rebuild : bool, optional (default=False)
        If True, rebuild the data from raw files even if processed data exists

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame containing play-by-play data (empty if no plays were found)

    Notes:
    ------
    Game files that cannot be read or parsed are reported and skipped.
    The processed file is only written when at least one play was loaded,
    so an empty result never replaces or creates the cache.
    """
    # Define path for processed data
    processed_file = os.path.join(data_path, 'processed', 'mlb', 'play_by_play.csv')

    # Check if processed data exists and we don't need to rebuild
    if os.path.exists(processed_file) and not force_rebuild:
        print(f"Loading processed MLB data from {processed_file}")
        try:
            return pd.read_csv(processed_file)
        except pd.errors.EmptyDataError:
            # A cache file without any columns cannot be parsed; rebuild it
            print(f"Warning: {processed_file} is empty. Rebuilding from raw files.")

    print("Building MLB data from raw files...")

    # Define seasons to load if not provided
    if seasons is None:
        seasons = ['2022', '2023', '2024']

    # Initialize empty list to store data
    all_data = []

    # Load data for each season
    for season in seasons:
        # Raw MLB data is stored in a directory named after the full-year date range
        season_path = os.path.join(
            data_path, 'raw', 'mlb', f'{season}-01-01_{season}-12-31', 'games')

        # Check if the directory exists
        if not os.path.exists(season_path):
            print(f"Warning: Directory {season_path} does not exist. Skipping.")
            continue

        # Sort the listing so the row order does not depend on the file system
        game_files = sorted(f for f in os.listdir(season_path) if f.endswith('.json'))

        # Load each game file
        for game_file in game_files:
            file_path = os.path.join(season_path, game_file)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    game_data = json.load(f)

                # Extract play-by-play data
                if 'allPlays' in game_data:
                    for play in game_data['allPlays']:
                        all_data.append(
                            _mlb_play_record(play, game_data.get('gamePk', ''), season))
            except Exception as e:
                # Best effort: report the broken file and keep loading the rest
                print(f"Error loading {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    if df.empty:
        # Writing an empty frame would leave a cache file that cannot be read back
        print("Warning: No MLB plays found. Processed file not written.")
    else:
        # Ensure directory exists
        os.makedirs(os.path.dirname(processed_file), exist_ok=True)

        # Save processed data
        df.to_csv(processed_file, index=False)

    print(f"MLB data loaded: {len(df)} plays")

    return df

def _nba_play_record(play, game_id, season):
    """
    Flatten one entry of an NBA game's ``playByPlayEvents`` list into a row dictionary.

    Parameters:
    -----------
    play : dict
        A single play-by-play event; every field is optional
    game_id : str
        Value stored in the 'game_id' column
    season : object
        Value stored in the 'season' column

    Returns:
    --------
    record : dict
        One row of play-by-play data, including the 'is_shot' and 'is_made' flags
    """
    # Normalise to text so that both 1 and '1' are recognised as a made shot
    event_type = str(play.get('eventMsgType', ''))

    return {
        'game_id': game_id,
        'season': season,
        'event_num': play.get('eventNum', ''),
        'event_msg_type': play.get('eventMsgType', ''),
        'event_msg_action_type': play.get('eventMsgActionType', ''),
        'period': play.get('period', 0),
        'play_clock': play.get('playTime', ''),
        'home_score': play.get('homeScore', 0),
        'away_score': play.get('awayScore', 0),
        'description': play.get('description', ''),
        # Player information (None when absent)
        'player1_id': play.get('personId', None),
        'player1_name': play.get('playerName', None),
        'player2_id': play.get('player2Id', None),
        'player2_name': play.get('player2Name', None),
        'player3_id': play.get('player3Id', None),
        'player3_name': play.get('player3Name', None),
        # Team information (None when absent)
        'team_id': play.get('teamId', None),
        'team_name': play.get('teamName', None),
        # 1: made shot, 2: missed shot
        'is_shot': event_type in ('1', '2'),
        'is_made': event_type == '1',
    }


def load_nba_data(data_path, seasons=None, force_rebuild=False):
    """
    Load NBA play-by-play data.

    Reads ``<data_path>/processed/nba/play_by_play.csv`` when it exists;
    otherwise builds the table from the JSON files under
    ``<data_path>/raw/nba_stats/pbp`` and left-joins the shot rows found
    under ``<data_path>/raw/nba_stats/shots`` on game id and event number.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing NBA data
    seasons : list of str, optional
        List of seasons to load (e.g., ['2022', '2023']). None loads every
        season found. Only used when building from raw files: an existing
        processed file is returned as-is, whatever seasons it contains.
    force_rebuild : bool, optional (default=False)
        If True, rebuild the data from raw files even if processed data exists

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame containing play-by-play data. Play-by-play columns keep
        their names after the merge; shot columns whose names clash with
        them ('game_id', 'season') are suffixed with '_shot'. An empty
        DataFrame is returned when either raw directory is missing.

    Notes:
    ------
    Files that cannot be read or parsed are reported and skipped.
    The processed file is only written when at least one play was loaded,
    so an empty result never replaces or creates the cache.
    """
    # Define path for processed data
    processed_file = os.path.join(data_path, 'processed', 'nba', 'play_by_play.csv')

    # Check if processed data exists and we don't need to rebuild
    if os.path.exists(processed_file) and not force_rebuild:
        print(f"Loading processed NBA data from {processed_file}")
        try:
            return pd.read_csv(processed_file)
        except pd.errors.EmptyDataError:
            # A cache file without any columns cannot be parsed; rebuild it
            print(f"Warning: {processed_file} is empty. Rebuilding from raw files.")

    print("Building NBA data from raw files...")

    # For NBA, we'll combine play-by-play data with shot data
    pbp_dir = os.path.join(data_path, 'raw', 'nba_stats', 'pbp')
    shots_dir = os.path.join(data_path, 'raw', 'nba_stats', 'shots')

    # Check if directories exist
    if not os.path.exists(pbp_dir) or not os.path.exists(shots_dir):
        print("Warning: NBA data directories not found.")
        return pd.DataFrame()

    # Load play-by-play data (sorted so the row order is reproducible)
    all_pbp_data = []

    for pbp_file in sorted(f for f in os.listdir(pbp_dir) if f.endswith('.json')):
        file_path = os.path.join(pbp_dir, pbp_file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                game_data = json.load(f)

            # The game ID is the file name up to the first dot
            game_id = pbp_file.split('.')[0]
            season = game_data.get('basicGameData', {}).get('seasonYear', '')

            # Skip if not in requested seasons
            if seasons is not None and str(season) not in seasons:
                continue

            # Extract play-by-play data
            if 'playByPlayEvents' in game_data:
                for play in game_data['playByPlayEvents']:
                    all_pbp_data.append(_nba_play_record(play, game_id, season))
        except Exception as e:
            # Best effort: report the broken file and keep loading the rest
            print(f"Error loading {file_path}: {e}")

    # Convert to DataFrame
    pbp_df = pd.DataFrame(all_pbp_data)

    # Load shot data (sorted so the row order is reproducible)
    all_shots_data = []

    for shots_file in sorted(f for f in os.listdir(shots_dir) if f.endswith('.json')):
        file_path = os.path.join(shots_dir, shots_file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                shots_data = json.load(f)

            # Shot files are named '<game_id>_<season>.json'
            name_parts = shots_file.split('_')
            game_id = name_parts[0]
            season = name_parts[1].split('.')[0]

            # Skip if not in requested seasons
            if seasons is not None and str(season) not in seasons:
                continue

            # Only the first result set is read: headers plus one list per row
            if 'resultSets' in shots_data and len(shots_data['resultSets']) > 0:
                headers = shots_data['resultSets'][0]['headers']
                rows = shots_data['resultSets'][0]['rowSet']

                for row in rows:
                    shot_data = dict(zip(headers, row))
                    shot_data['game_id'] = game_id
                    shot_data['season'] = season

                    all_shots_data.append(shot_data)
        except Exception as e:
            # Best effort: report the broken file and keep loading the rest
            print(f"Error loading {file_path}: {e}")

    # Convert to DataFrame
    shots_df = pd.DataFrame(all_shots_data)

    # Merge play-by-play and shot data
    df = pbp_df
    if not pbp_df.empty and not shots_df.empty:
        if {'GAME_ID', 'EVENT_NUM'}.issubset(shots_df.columns):
            # Create keys for merging without modifying the source frames
            pbp_keyed = pbp_df.assign(
                merge_key=pbp_df['game_id'].astype(str) + '_' + pbp_df['event_num'].astype(str))
            shots_keyed = shots_df.assign(
                merge_key=shots_df['GAME_ID'].astype(str) + '_' + shots_df['EVENT_NUM'].astype(str))

            # Both frames carry 'game_id' and 'season'; keep the play-by-play
            # names untouched and suffix the shot copies instead of ending up
            # with 'game_id_x' / 'game_id_y'.
            df = pd.merge(pbp_keyed, shots_keyed, on='merge_key', how='left',
                          suffixes=('', '_shot'))

            # Drop merge key
            df = df.drop(columns=['merge_key'])
        else:
            print("Warning: NBA shot data has no GAME_ID/EVENT_NUM columns. Skipping merge.")

    if df.empty:
        # Writing an empty frame would leave a cache file that cannot be read back
        print("Warning: No NBA plays found. Processed file not written.")
    else:
        # Ensure directory exists
        os.makedirs(os.path.dirname(processed_file), exist_ok=True)

        # Save processed data
        df.to_csv(processed_file, index=False)

    print(f"NBA data loaded: {len(df)} plays")

    return df

def extract_shots_data(df, sport):
    """
    Extract shot data from play-by-play data.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing play-by-play data
    sport : str
        Sport name ('nhl', 'mlb', or 'nba'), case-insensitive

    Returns:
    --------
    shots_df : pandas.DataFrame
        Copy of the rows of ``df`` that are shot events, with extra columns:
        - nhl: 'is_goal', 'shooter_id', 'shooter_name', 'goalie_id',
          'goalie_name' (taken from the 'Shooter_*', 'Scorer_*' and
          'Goalie_*' columns; missing source columns yield NaN)
        - mlb: 'is_hit' with missing values replaced by False
        - nba: 'is_made' (derived from 'event_msg_type' when absent)
        A frame without any columns is returned as an unchanged copy.

    Raises:
    -------
    ValueError
        If ``sport`` is not one of the supported sports
    KeyError
        If a non-empty ``df`` lacks the column used for filtering
        ('event_type' for nhl/mlb, 'is_shot' for nba)
    """
    league = sport.lower()
    if league not in ('nhl', 'mlb', 'nba'):
        raise ValueError(f"Unsupported sport: {sport}")

    # A frame without columns (e.g. no raw data was found) has nothing to filter
    if len(df.columns) == 0:
        return df.copy()

    if league == 'nhl':
        # Filter for shot events
        shot_events = ['SHOT', 'GOAL', 'MISSED_SHOT', 'BLOCKED_SHOT']
        shots_df = df[df['event_type'].isin(shot_events)].copy()

        # Add is_goal column
        shots_df['is_goal'] = shots_df['event_type'] == 'GOAL'

        # Player columns only exist when at least one play had that player
        # type; reindex supplies an all-NaN column for any that are missing.
        players = shots_df.reindex(columns=[
            'Shooter_id', 'Shooter_name',
            'Scorer_id', 'Scorer_name',
            'Goalie_id', 'Goalie_name',
        ])

        # The shooter is the 'Shooter' of the play, or the 'Scorer' when there is none
        shots_df['shooter_id'] = players['Shooter_id'].fillna(players['Scorer_id'])
        shots_df['shooter_name'] = players['Shooter_name'].fillna(players['Scorer_name'])
        shots_df['goalie_id'] = players['Goalie_id']
        shots_df['goalie_name'] = players['Goalie_name']

    elif league == 'mlb':
        # Keep plays whose event_type is 'atBat'.
        # NOTE(review): load_mlb_data fills event_type from result['eventType'];
        # confirm that field really carries 'atBat', otherwise nothing is selected.
        shots_df = df[(df['event_type'] == 'atBat')].copy()

        # Add is_hit column
        if 'is_hit' in shots_df.columns:
            shots_df['is_hit'] = shots_df['is_hit'].fillna(False)
        else:
            shots_df['is_hit'] = False

    else:
        # Filter for shot events
        shots_df = df[df['is_shot'] == True].copy()

        # Add is_made column if not already present. The comparison is
        # numeric so that '1', 1 and 1.0 all count as a made shot.
        if 'is_made' not in shots_df.columns:
            shots_df['is_made'] = pd.to_numeric(
                shots_df['event_msg_type'], errors='coerce') == 1

    return shots_df

def load_and_extract_shots(data_path, sport, seasons=None, force_rebuild=False):
    """
    Load the play-by-play data of one sport and keep only its shot events.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing sports data
    sport : str
        Sport name ('nhl', 'mlb', or 'nba'), case-insensitive
    seasons : list of str, optional
        Seasons forwarded to the sport-specific loader (e.g., ['2019', '2020'])
    force_rebuild : bool, optional (default=False)
        Forwarded to the loader; if True the data is rebuilt from raw files
        even if processed data exists

    Returns:
    --------
    shots_df : pandas.DataFrame
        Result of extract_shots_data applied to the loaded play-by-play data

    Raises:
    -------
    ValueError
        If ``sport`` is not one of the supported sports
    """
    league = sport.lower()

    # Reject unknown sports before any data is read
    if league not in ('nhl', 'mlb', 'nba'):
        raise ValueError(f"Unsupported sport: {sport}")

    # Pick the loader that matches the sport
    if league == 'nhl':
        plays = load_nhl_data(data_path, seasons, force_rebuild)
    elif league == 'mlb':
        plays = load_mlb_data(data_path, seasons, force_rebuild)
    else:
        plays = load_nba_data(data_path, seasons, force_rebuild)

    # The sport is passed on exactly as the caller supplied it
    return extract_shots_data(plays, sport)


def _read_crawled_csvs(data_path, name_markers=None):
    """
    Read raw crawled CSV data from a single file or from a directory.

    Parameters:
    -----------
    data_path : str
        Path to a CSV file, or to a directory containing CSV files
    name_markers : tuple of str, optional
        When given, only directory entries whose name contains at least one
        of these substrings are read; None reads every '.csv' entry

    Returns:
    --------
    df : pandas.DataFrame
        The file's content, or the row-wise concatenation of all readable
        matching files (empty when there are none)
    """
    if not os.path.isdir(data_path):
        return pd.read_csv(data_path)

    frames = []
    # Sort the listing so the concatenated row order is reproducible
    for file in sorted(os.listdir(data_path)):
        if not file.endswith('.csv'):
            continue
        if name_markers is not None and not any(marker in file for marker in name_markers):
            continue
        try:
            frames.append(pd.read_csv(os.path.join(data_path, file)))
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest
            logger.error(f"Error loading {file}: {e}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()


def load_crawled_data(data_path: str, sport: str, convert: bool = True, 
                     output_path: Optional[str] = None,
                     min_sequence_length: int = 3) -> pd.DataFrame:
    """
    Load data collected by the crawlers and optionally convert it to HMM-GLM format.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing crawled data. When no conversion
        takes place this may also be the path of a single CSV file.
    sport : str
        Sport name ('nhl', 'mlb', or 'nba'), case-insensitive
    convert : bool, optional (default=True)
        If True, convert the data to HMM-GLM format. Falls back to loading
        the raw data (with a warning) when the converters could not be imported.
    output_path : str, optional
        Path to save the converted data (passed to the converter)
    min_sequence_length : int, optional (default=3)
        Minimum sequence length to include when converting

    Returns:
    --------
    df : pandas.DataFrame
        DataFrame containing the loaded (and optionally converted) data

    Raises:
    -------
    ValueError
        If ``sport`` is not one of the supported sports

    Notes:
    ------
    Without conversion, a directory is scanned for '.csv' files: NHL reads
    names containing 'pbp_' or 'shots', NBA reads names containing 'pbp_'
    or 'shotchart', MLB reads every CSV. Unreadable files are logged and
    skipped.
    """
    # Check if converters are available
    if convert and not CONVERTERS_AVAILABLE:
        logger.warning("Data converters not available. Loading raw data without conversion.")
        convert = False

    league = sport.lower()

    # File-name markers that select the raw CSV files of each sport (None = all)
    raw_name_markers = {
        'nhl': ('pbp_', 'shots'),
        'mlb': None,
        'nba': ('pbp_', 'shotchart'),
    }
    if league not in raw_name_markers:
        raise ValueError(f"Unsupported sport: {sport}")

    label = league.upper()

    if convert:
        # The convert_* names only exist when the import succeeded, so they
        # must not be referenced outside this branch.
        if league == 'nhl':
            converter = convert_nhl_data
        elif league == 'mlb':
            converter = convert_mlb_data
        else:
            converter = convert_nba_data

        df, metadata = converter(data_path, output_path, min_sequence_length)
        logger.info("Converted %s data: %d rows, %s sequences",
                    label, len(df), metadata['n_sequences'])
    else:
        # Load raw data
        df = _read_crawled_csvs(data_path, raw_name_markers[league])
        logger.info("Loaded raw %s data: %d rows", label, len(df))

    return df


def load_and_convert_crawled_data(data_path: str, sport: str, 
                                 output_path: Optional[str] = None,
                                 min_sequence_length: int = 3) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Convert crawled data of one sport to HMM-GLM format and return it with its metadata.

    Parameters:
    -----------
    data_path : str
        Path to the directory containing crawled data
    sport : str
        Sport name ('nhl', 'mlb', or 'nba'), case-insensitive
    output_path : str, optional
        Path to save the converted data (passed to the converter)
    min_sequence_length : int, optional (default=3)
        Minimum sequence length to include

    Returns:
    --------
    tuple: (DataFrame, metadata)
        DataFrame containing the converted data and metadata dictionary,
        exactly as produced by the sport's converter

    Raises:
    -------
    ImportError
        If the converters could not be imported
    ValueError
        If ``sport`` is not one of the supported sports
    """
    # Without the converters there is nothing this function can do
    if not CONVERTERS_AVAILABLE:
        raise ImportError("Data converters not available. Cannot convert crawled data.")

    # Select the converter that matches the sport
    league = sport.lower()
    if league == 'nhl':
        converter = convert_nhl_data
    elif league == 'mlb':
        converter = convert_mlb_data
    elif league == 'nba':
        converter = convert_nba_data
    else:
        raise ValueError(f"Unsupported sport: {sport}")

    converted, info = converter(data_path, output_path, min_sequence_length)
    return converted, info

