"""
MLB Play-by-Play Data Crawler

This module provides functionality to crawl and process MLB play-by-play data
from Baseball Savant (Statcast) and Baseball-Reference.
"""

import io
import json
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Union, Tuple

import numpy as np
import pandas as pd

from .utils import create_session, rate_limit, save_to_csv, setup_logging


class MLBCrawler:
    """
    Crawler for MLB play-by-play data from Statcast and the MLB GameDay feed.

    The class downloads Statcast CSV exports, fetches the GameDay live feed
    for each game found in those exports, flattens every play into one record
    (pitch, hit, count and base-runner information) and can merge the files
    written by a crawl into one dataset.

    ``BASEBALL_REF_URL`` is declared here but is not used by any method of
    this class.
    """

    # Base URLs
    STATCAST_URL = "https://baseballsavant.mlb.com/statcast_search/csv"
    BASEBALL_REF_URL = "https://www.baseball-reference.com/boxes/{}/{}.shtml"
    GAMEDAY_URL = "https://statsapi.mlb.com/api/v1.1/game/{}/feed/live"

    # Team IDs mapping
    TEAM_IDS = {
        "ARI": 109, "ATL": 144, "BAL": 110, "BOS": 111, "CHC": 112,
        "CWS": 145, "CIN": 113, "CLE": 114, "COL": 115, "DET": 116,
        "HOU": 117, "KC": 118, "LAA": 108, "LAD": 119, "MIA": 146,
        "MIL": 158, "MIN": 142, "NYM": 121, "NYY": 147, "OAK": 133,
        "PHI": 143, "PIT": 134, "SD": 135, "SEA": 136, "SF": 137,
        "STL": 138, "TB": 139, "TEX": 140, "TOR": 141, "WSH": 120
    }

    # (digit, base label) pairs used to build the compact base-state string.
    _BASE_LABELS = (("1", "1B"), ("2", "2B"), ("3", "3B"))

    def __init__(self, output_dir: str = "data/mlb", log_file: Optional[str] = None):
        """
        Initialize the MLB crawler.

        Args:
            output_dir: Directory to save output files (created if missing)
            log_file: Path to log file (optional)
        """
        self.output_dir = output_dir
        self.session = create_session(retries=5, backoff_factor=0.5)
        self.logger = setup_logging(log_file=log_file)

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

    def get_statcast_data(self, start_date: str, end_date: str,
                          player_type: str = "batter",
                          team: Optional[str] = None) -> pd.DataFrame:
        """
        Fetch Statcast data for a specific date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            player_type: 'batter' or 'pitcher'
            team: Team abbreviation (optional)

        Returns:
            DataFrame containing Statcast data. An empty DataFrame is
            returned (and the error is logged) when the request or the CSV
            parsing fails.
        """
        params = {
            "hfPT": "",
            "hfAB": "",
            "hfGT": "R",  # Regular season games
            "hfPR": "",
            "hfZ": "",
            "stadium": "",
            "hfBBL": "",
            "hfNewZones": "",
            "hfPull": "",
            "hfC": "",
            "hfSea": f"{start_date[:4]}|",  # season = year part of start_date
            "hfSit": "",
            "player_type": player_type,
            "hfOuts": "",
            "opponent": "",
            "pitcher_throws": "",
            "batter_stands": "",
            "hfSA": "",
            "game_date_gt": start_date,
            "game_date_lt": end_date,
            "hfMo": "",
            "hfTeam": team or "",
            "home_road": "",
            "hfRO": "",
            "position": "",
            "hfInfield": "",
            "hfOutfield": "",
            "hfInn": "",
            "hfBBT": "",
            "hfFlag": "",
            "metric_1": "",
            "group_by": "name",
            "min_pitches": "0",
            "min_results": "0",
            "min_pas": "0",
            "sort_col": "pitches",
            "player_event_sort": "api_p_release_speed",
            "sort_order": "desc",
            "min_abs": "0",
            "type": "details"
        }

        self.logger.info(f"Fetching Statcast data from {start_date} to {end_date}")

        try:
            response = self.session.get(self.STATCAST_URL, params=params)
            response.raise_for_status()

            # pandas has no StringIO attribute; wrap the CSV text with the
            # standard-library io.StringIO so read_csv gets a file-like object.
            data = pd.read_csv(io.StringIO(response.text))
            self.logger.info(f"Retrieved {len(data)} records")

            return data

        except Exception as e:
            # Best-effort: callers treat an empty frame as "nothing fetched".
            self.logger.error(f"Error fetching Statcast data: {e}")
            return pd.DataFrame()

    def get_gameday_data(self, game_id: str) -> Dict[str, Any]:
        """
        Fetch detailed game data from MLB GameDay API.

        Args:
            game_id: Game identifier substituted into ``GAMEDAY_URL``.
                ``crawl_season`` passes Statcast ``game_pk`` values here
                (converted to strings).

        Returns:
            Dictionary containing the decoded JSON response, or an empty
            dictionary when the request or decoding fails (the error is
            logged).
        """
        url = self.GAMEDAY_URL.format(game_id)

        try:
            self.logger.info(f"Fetching GameDay data for game {game_id}")
            response = self.session.get(url)
            response.raise_for_status()
            return response.json()

        except Exception as e:
            self.logger.error(f"Error fetching GameDay data for {game_id}: {e}")
            return {}

    @staticmethod
    def _last_pitch_fields(play_events: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return pitch/hit fields taken from the last pitch event of a play."""
        # Non-pitch events may lack the 'isPitch' key entirely, so use .get().
        pitches = [event for event in play_events if event.get('isPitch')]
        if not pitches:
            return {}

        last_pitch = pitches[-1]
        fields: Dict[str, Any] = {}

        if 'pitchData' in last_pitch:
            pitch_info = last_pitch['pitchData']
            coordinates = pitch_info.get('coordinates', {})
            fields.update({
                'pitch_type': pitch_info.get('pitchType', ''),
                'pitch_speed': pitch_info.get('startSpeed', np.nan),
                'pitch_zone': pitch_info.get('zone', np.nan),
                'pitch_x': coordinates.get('x', np.nan),
                'pitch_y': coordinates.get('y', np.nan),
                'pitch_z': coordinates.get('z', np.nan),
            })

        if 'hitData' in last_pitch:
            hit_info = last_pitch['hitData']
            fields.update({
                'exit_velocity': hit_info.get('launchSpeed', np.nan),
                'launch_angle': hit_info.get('launchAngle', np.nan),
                'total_distance': hit_info.get('totalDistance', np.nan),
                'hit_direction': hit_info.get('trajectory', ''),
            })

        return fields

    @classmethod
    def _base_state(cls, runners: List[Dict[str, Any]]) -> str:
        """Encode occupied starting bases as a string such as "13", or "0"."""
        # NOTE(review): this reads runner['start']['base']; confirm against the
        # live feed schema that the starting base is really stored there.
        # `or {}` tolerates an explicit None for 'start'.
        start_bases = [(runner.get('start') or {}).get('base') for runner in runners]
        state = "".join(
            digit for digit, label in cls._BASE_LABELS if label in start_bases
        )
        return state or "0"

    def process_gameday_plays(self, game_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process play-by-play data from GameDay API response.

        Args:
            game_data: GameDay API response data

        Returns:
            List of play events with processed information, one dictionary
            per entry of ``liveData.plays.allPlays``. Returns an empty list
            when ``game_data`` is empty, has no ``liveData`` key, or when an
            unexpected structure makes processing fail (the error is logged).
        """
        if not game_data or 'liveData' not in game_data:
            return []

        try:
            all_plays = game_data['liveData']['plays']['allPlays']

            # Game-level fields are identical for every play; read them once.
            game_info = game_data['gameData']
            game_pk = game_info['game']['pk']
            game_date = game_info['datetime']['originalDate']

            plays = []
            for play in all_plays:
                about = play['about']
                result = play['result']
                # A play without count data must not discard the whole game.
                count = play.get('count') or {}

                # Extract basic play information
                play_event = {
                    'game_id': game_pk,
                    'game_date': game_date,
                    'inning': about['inning'],
                    'inning_half': about['halfInning'],
                    'play_id': about['atBatIndex'],
                    'event': result.get('event', ''),
                    'event_type': result.get('eventType', ''),
                    'description': result.get('description', ''),
                    'is_hit': result.get('isHit', False),
                    'is_ab': result.get('isAtBat', False),
                    'rbi': result.get('rbi', 0),
                    'outs_on_play': count.get('outs', 0),
                }

                # Add count information
                if 'count' in play:
                    play_event.update({
                        'balls': count.get('balls', 0),
                        'strikes': count.get('strikes', 0),
                    })

                # Add player information
                if 'matchup' in play:
                    matchup = play['matchup']
                    batter = matchup.get('batter', {})
                    pitcher = matchup.get('pitcher', {})
                    play_event.update({
                        'batter_id': batter.get('id', ''),
                        'batter_name': batter.get('fullName', ''),
                        'pitcher_id': pitcher.get('id', ''),
                        'pitcher_name': pitcher.get('fullName', ''),
                        'batter_side': matchup.get('batSide', {}).get('code', ''),
                        'pitcher_hand': matchup.get('pitchHand', {}).get('code', ''),
                    })

                # Add pitch data if available
                if 'playEvents' in play:
                    play_event.update(self._last_pitch_fields(play['playEvents']))

                # Add game state information
                if 'runners' in play:
                    play_event['base_state'] = self._base_state(play['runners'])

                plays.append(play_event)

            return plays

        except Exception as e:
            self.logger.error(f"Error processing GameDay plays: {e}")
            return []

    def _crawl_games(self, game_ids, processed_games: set) -> None:
        """Fetch, process and save play-by-play data for each new game id."""
        for game_id in game_ids:
            # Each game shows up for both participating teams; skip the ones
            # already saved so they are not downloaded again.
            if game_id in processed_games:
                continue

            # Avoid hitting rate limits
            rate_limit(1.5, 3.0)

            # Get GameDay data
            gameday_data = self.get_gameday_data(str(game_id))
            if not gameday_data:
                continue

            # Process play-by-play data
            plays = self.process_gameday_plays(gameday_data)
            if not plays:
                continue

            # Save play-by-play data
            pbp_file = os.path.join(self.output_dir, f"pbp_{game_id}.csv")
            save_to_csv(plays, pbp_file)

            # Only mark as done after a successful save, so a failed fetch
            # can still be retried when the game appears for another team.
            processed_games.add(game_id)

    def crawl_season(self, year: int, start_date: Optional[str] = None,
                     end_date: Optional[str] = None,
                     teams: Optional[List[str]] = None) -> None:
        """
        Crawl play-by-play data for an entire MLB season or date range.

        For every date chunk and team, the Statcast export is saved as
        ``statcast_<team>_<start>_<end>.csv`` and each game found in it is
        saved as ``pbp_<game_pk>.csv`` inside ``output_dir``.

        Args:
            year: Season year
            start_date: Start date in YYYY-MM-DD format (optional, defaults to season start)
            end_date: End date in YYYY-MM-DD format (optional, defaults to season end)
            teams: List of team abbreviations to crawl (optional, defaults to all teams)

        Raises:
            ValueError: If a date string does not match YYYY-MM-DD.
        """
        # Set default dates for the season
        if not start_date:
            start_date = f"{year}-03-20"  # Approximate season start
        if not end_date:
            end_date = f"{year}-11-01"  # Approximate season end including postseason

        # Set default teams to all teams
        if not teams:
            teams = list(self.TEAM_IDS.keys())

        self.logger.info(f"Starting MLB season crawl for {year} ({start_date} to {end_date})")

        # Process date range in chunks to avoid timeout/memory issues
        current_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        chunk_size = timedelta(days=7)  # Process one week at a time

        # Games already saved during this crawl (shared across teams/chunks).
        processed_games: set = set()

        while current_dt <= end_dt:
            chunk_end = min(current_dt + chunk_size, end_dt)
            chunk_start_str = current_dt.strftime("%Y-%m-%d")
            chunk_end_str = chunk_end.strftime("%Y-%m-%d")

            for team in teams:
                self.logger.info(f"Processing team {team} from {chunk_start_str} to {chunk_end_str}")

                # Get Statcast data
                statcast_data = self.get_statcast_data(
                    start_date=chunk_start_str,
                    end_date=chunk_end_str,
                    team=team
                )

                if not statcast_data.empty:
                    # Save Statcast data
                    output_file = os.path.join(
                        self.output_dir,
                        f"statcast_{team}_{chunk_start_str}_{chunk_end_str}.csv"
                    )
                    save_to_csv(statcast_data, output_file)

                    # Get unique game IDs (missing values cannot be fetched)
                    if 'game_pk' in statcast_data.columns:
                        game_ids = statcast_data['game_pk'].dropna().unique()
                        self._crawl_games(game_ids, processed_games)

                # Avoid hitting rate limits between teams
                rate_limit(2.0, 4.0)

            # Move to next chunk (the day after the inclusive chunk end)
            current_dt = chunk_end + timedelta(days=1)

    def _load_csv_files(self, files: List[str]) -> pd.DataFrame:
        """Read the given CSV files into one de-duplicated DataFrame."""
        frames = []
        for file in files:
            try:
                frames.append(pd.read_csv(file))
            except Exception as e:
                # One unreadable file should not abort the whole merge.
                self.logger.error(f"Error reading {file}: {e}")

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True).drop_duplicates()

    def merge_season_data(self, year: int, output_file: Optional[str] = None) -> pd.DataFrame:
        """
        Merge all crawled data for a season into a single dataset.

        Statcast files are selected by the year appearing in their file name.
        Play-by-play rows are restricted to the season through their
        ``game_date`` column when that column is present, because the
        ``pbp_*.csv`` file names carry no year.

        Args:
            year: Season year
            output_file: Path to save the merged data (optional)

        Returns:
            DataFrame containing the merged data. Falls back to the Statcast
            data alone when the merge key columns are missing, to whichever
            dataset exists when only one does, and to an empty DataFrame when
            nothing was found.
        """
        self.logger.info(f"Merging data for {year} season")

        # Find all relevant CSV files; sorted so the merge is reproducible
        # regardless of directory listing order.
        csv_names = sorted(
            name for name in os.listdir(self.output_dir) if name.endswith(".csv")
        )
        year_token = str(year)
        statcast_files = [
            os.path.join(self.output_dir, name)
            for name in csv_names
            if name.startswith("statcast_") and year_token in name
        ]
        pbp_files = [
            os.path.join(self.output_dir, name)
            for name in csv_names
            if name.startswith("pbp_")
        ]

        # Read and merge each family of files
        statcast_data = self._load_csv_files(statcast_files)
        pbp_data = self._load_csv_files(pbp_files)

        # Keep only play-by-play rows of the requested season.
        if not pbp_data.empty and 'game_date' in pbp_data.columns:
            in_season = pbp_data['game_date'].astype(str).str.startswith(year_token)
            pbp_data = pbp_data[in_season].reset_index(drop=True)

        # Merge the datasets if both exist
        if not statcast_data.empty and not pbp_data.empty:
            # Every key used by the merge must exist, otherwise pandas raises
            # KeyError; checking only one column per side is not enough.
            statcast_keys = ['game_pk', 'at_bat_number']
            pbp_keys = ['game_id', 'play_id']
            keys_present = (
                set(statcast_keys).issubset(statcast_data.columns)
                and set(pbp_keys).issubset(pbp_data.columns)
            )
            if keys_present:
                # This is a simplified merge and might need adjustment based
                # on actual data structure.
                # NOTE(review): play_id comes from the feed's atBatIndex;
                # confirm it uses the same numbering base as Statcast's
                # at_bat_number, otherwise rows will be misaligned.
                merged_data = pd.merge(
                    statcast_data,
                    pbp_data,
                    left_on=statcast_keys,
                    right_on=pbp_keys,
                    how='outer',
                    suffixes=('_statcast', '_pbp')
                )
            else:
                self.logger.warning("Cannot merge datasets: missing key columns")
                merged_data = statcast_data  # Default to Statcast data
        elif not statcast_data.empty:
            merged_data = statcast_data
        elif not pbp_data.empty:
            merged_data = pbp_data
        else:
            self.logger.warning("No data found to merge")
            merged_data = pd.DataFrame()

        # Save merged data if requested
        if output_file and not merged_data.empty:
            save_to_csv(merged_data, output_file)

        return merged_data


if __name__ == "__main__":
    # Demonstration run: crawl a small slice of one season, then merge
    # whatever files the crawl left in the output directory.
    season = 2022
    crawler = MLBCrawler(log_file="logs/mlb_crawler.log", output_dir="data/mlb")

    # Only three clubs and a single week starting on 2022 Opening Day, so
    # the example stays small.
    sample_teams = ["NYY", "BOS", "LAD"]
    crawler.crawl_season(season, "2022-04-07", "2022-04-14", sample_teams)

    # Combine the crawled files and write the result next to them.
    merged_data = crawler.merge_season_data(season, "data/mlb/mlb_2022_merged.csv")

