"""
NBA Play-by-Play Data Crawler

This module provides functionality to crawl and process NBA play-by-play data
from NBA Stats API and Basketball-Reference.
"""

import os
import json
import time
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Union, Tuple
import logging

from .utils import create_session, rate_limit, save_to_csv, setup_logging


class NBACrawler:
    """
    NBA Play-by-Play data crawler for NBA Stats API and Basketball-Reference.
    
    This class provides methods to fetch and process NBA play-by-play data,
    focusing on shot-by-shot information, game states, and player tracking data.
    """
    
    # Base URLs
    NBA_STATS_BASE_URL = "https://stats.nba.com/stats"
    NBA_PBP_ENDPOINT = "/playbyplayv2"
    NBA_SHOTCHART_ENDPOINT = "/shotchartdetail"
    NBA_SCHEDULE_ENDPOINT = "/leaguegamefinder"
    NBA_TRACKING_ENDPOINT = "/leagueplayertrackingdata"
    
    # Basketball-Reference URL
    BREF_URL = "https://www.basketball-reference.com/boxscores/pbp/{game_id}.html"
    
    # Team IDs mapping
    TEAM_IDS = {
        "ATL": 1610612737, "BOS": 1610612738, "BKN": 1610612751, "CHA": 1610612766,
        "CHI": 1610612741, "CLE": 1610612739, "DAL": 1610612742, "DEN": 1610612743,
        "DET": 1610612765, "GSW": 1610612744, "HOU": 1610612745, "IND": 1610612754,
        "LAC": 1610612746, "LAL": 1610612747, "MEM": 1610612763, "MIA": 1610612748,
        "MIL": 1610612749, "MIN": 1610612750, "NOP": 1610612740, "NYK": 1610612752,
        "OKC": 1610612760, "ORL": 1610612753, "PHI": 1610612755, "PHX": 1610612756,
        "POR": 1610612757, "SAC": 1610612758, "SAS": 1610612759, "TOR": 1610612761,
        "UTA": 1610612762, "WAS": 1610612764
    }
    
    def __init__(self, output_dir: str = "data/nba", log_file: Optional[str] = None):
        """
        Initialize the NBA crawler.

        Creates the HTTP session and logger, installs the browser-like headers
        used for NBA Stats requests, and makes sure the output directory exists.

        Args:
            output_dir: Directory to save output files
            log_file: Path to log file (optional)
        """
        self.output_dir = output_dir
        self.session = create_session(retries=5, backoff_factor=0.5)
        self.logger = setup_logging(log_file=log_file)

        # Set NBA Stats API specific headers.
        # No explicit "Host" header is installed here: these headers live on
        # the shared session, which get_bref_pbp() also uses for
        # basketball-reference.com, so a pinned "Host: stats.nba.com" would be
        # sent to that site as well. Presumably the HTTP client derives Host
        # from each request URL (requests does) - confirm against
        # create_session().
        self.session.headers.update({
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            "Origin": "https://www.nba.com",
            "Referer": "https://www.nba.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "x-nba-stats-origin": "stats",
            "x-nba-stats-token": "true"
        })

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
    
    def get_schedule(self, season: str, season_type: str = "Regular Season") -> pd.DataFrame:
        """
        Get NBA schedule for a specific season.
        
        Args:
            season: Season in format "YYYY-YY" (e.g., "2021-22")
            season_type: "Regular Season", "Playoffs", "Pre Season", etc.
            
        Returns:
            DataFrame containing schedule data
        """
        url = f"{self.NBA_STATS_BASE_URL}{self.NBA_SCHEDULE_ENDPOINT}"
        
        params = {
            "LeagueID": "00",  # NBA
            "Season": season,
            "SeasonType": season_type
        }
        
        self.logger.info(f"Fetching NBA schedule for {season} {season_type}")
        
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            if 'resultSets' in data and len(data['resultSets']) > 0:
                headers = data['resultSets'][0]['headers']
                rows = data['resultSets'][0]['rowSet']
                
                df = pd.DataFrame(rows, columns=headers)
                self.logger.info(f"Retrieved {len(df)} games")
                return df
            
            self.logger.warning("No schedule data found")
            return pd.DataFrame()
        
        except Exception as e:
            self.logger.error(f"Error fetching NBA schedule: {e}")
            return pd.DataFrame()
    
    def get_play_by_play(self, game_id: str) -> pd.DataFrame:
        """
        Get play-by-play data for a specific NBA game.
        
        Args:
            game_id: NBA game ID (format: "0022100001" for regular season)
            
        Returns:
            DataFrame containing play-by-play data
        """
        url = f"{self.NBA_STATS_BASE_URL}{self.NBA_PBP_ENDPOINT}"
        
        params = {
            "GameID": game_id,
            "StartPeriod": 0,
            "EndPeriod": 10,  # Cover all possible periods including OT
            "RangeType": 0,
            "StartRange": 0,
            "EndRange": 55800  # Maximum possible game time in seconds (15 min * 60 sec * 6.2 periods)
        }
        
        self.logger.info(f"Fetching play-by-play data for game {game_id}")
        
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            if 'resultSets' in data and len(data['resultSets']) > 0:
                headers = data['resultSets'][0]['headers']
                rows = data['resultSets'][0]['rowSet']
                
                df = pd.DataFrame(rows, columns=headers)
                self.logger.info(f"Retrieved {len(df)} play-by-play events")
                return df
            
            self.logger.warning(f"No play-by-play data found for game {game_id}")
            return pd.DataFrame()
        
        except Exception as e:
            self.logger.error(f"Error fetching play-by-play data for game {game_id}: {e}")
            return pd.DataFrame()
    
    def get_shot_chart(self, game_id: str) -> pd.DataFrame:
        """
        Get shot chart data for a specific NBA game.
        
        Args:
            game_id: NBA game ID
            
        Returns:
            DataFrame containing shot chart data
        """
        url = f"{self.NBA_STATS_BASE_URL}{self.NBA_SHOTCHART_ENDPOINT}"
        
        params = {
            "GameID": game_id,
            "Season": "ALL",  # Get all seasons
            "SeasonType": "Regular Season",
            "TeamID": 0,
            "PlayerID": 0,
            "Outcome": "",
            "Location": "",
            "Month": 0,
            "SeasonSegment": "",
            "DateFrom": "",
            "DateTo": "",
            "OpponentTeamID": 0,
            "VsConference": "",
            "VsDivision": "",
            "Position": "",
            "RookieYear": "",
            "GameSegment": "",
            "Period": 0,
            "LastNGames": 0,
            "ContextMeasure": "FGA"
        }
        
        self.logger.info(f"Fetching shot chart data for game {game_id}")
        
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            if 'resultSets' in data and len(data['resultSets']) > 0:
                headers = data['resultSets'][0]['headers']
                rows = data['resultSets'][0]['rowSet']
                
                df = pd.DataFrame(rows, columns=headers)
                self.logger.info(f"Retrieved {len(df)} shots")
                return df
            
            self.logger.warning(f"No shot chart data found for game {game_id}")
            return pd.DataFrame()
        
        except Exception as e:
            self.logger.error(f"Error fetching shot chart data for game {game_id}: {e}")
            return pd.DataFrame()
    
    def get_player_tracking(self, game_id: str) -> pd.DataFrame:
        """
        Get player tracking data for a specific NBA game.
        
        Args:
            game_id: NBA game ID
            
        Returns:
            DataFrame containing player tracking data
        """
        url = f"{self.NBA_STATS_BASE_URL}{self.NBA_TRACKING_ENDPOINT}"
        
        params = {
            "GameID": game_id,
            "LeagueID": "00",
            "Season": "ALL",
            "SeasonType": "Regular Season",
            "PORound": 0,
            "PerMode": "Totals",
            "PtMeasureType": "SpeedDistance"  # Options: SpeedDistance, Possessions, CatchShoot, PullUpShot, Defense, Drives, Passing, ElbowTouch, PostTouch, PaintTouch, Efficiency
        }
        
        self.logger.info(f"Fetching player tracking data for game {game_id}")
        
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            if 'resultSets' in data and len(data['resultSets']) > 0:
                headers = data['resultSets'][0]['headers']
                rows = data['resultSets'][0]['rowSet']
                
                df = pd.DataFrame(rows, columns=headers)
                self.logger.info(f"Retrieved tracking data for {len(df)} players")
                return df
            
            self.logger.warning(f"No player tracking data found for game {game_id}")
            return pd.DataFrame()
        
        except Exception as e:
            self.logger.error(f"Error fetching player tracking data for game {game_id}: {e}")
            return pd.DataFrame()
    
    def get_bref_pbp(self, game_id: str) -> pd.DataFrame:
        """
        Get play-by-play data from Basketball-Reference.
        
        Args:
            game_id: Basketball-Reference game ID (format: YYYYMMDD0XXX)
            
        Returns:
            DataFrame containing play-by-play data
        """
        url = self.BREF_URL.format(game_id=game_id)
        
        self.logger.info(f"Fetching Basketball-Reference play-by-play for game {game_id}")
        
        try:
            response = self.session.get(url)
            response.raise_for_status()
            
            # Parse tables from HTML
            tables = pd.read_html(response.text)
            
            # Find the play-by-play table
            pbp_df = None
            for table in tables:
                if len(table.columns) >= 6 and "Time" in table.columns[0]:
                    pbp_df = table
                    break
            
            if pbp_df is not None:
                self.logger.info(f"Retrieved {len(pbp_df)} play-by-play events from Basketball-Reference")
                
                # Add game_id column
                pbp_df['game_id'] = game_id
                
                return pbp_df
            
            self.logger.warning(f"No play-by-play data found on Basketball-Reference for game {game_id}")
            return pd.DataFrame()
        
        except Exception as e:
            self.logger.error(f"Error fetching Basketball-Reference play-by-play for game {game_id}: {e}")
            return pd.DataFrame()
    
    def process_pbp_data(self, pbp_df: pd.DataFrame) -> pd.DataFrame:
        """
        Process raw play-by-play data to extract additional features.
        
        Args:
            pbp_df: Raw play-by-play DataFrame
            
        Returns:
            Processed play-by-play DataFrame with additional features
        """
        if pbp_df.empty:
            return pbp_df
        
        try:
            # Create a copy to avoid modifying the original
            df = pbp_df.copy()
            
            # Extract shot information
            if 'HOMEDESCRIPTION' in df.columns and 'VISITORDESCRIPTION' in df.columns:
                # Combine home and visitor descriptions
                df['description'] = df['HOMEDESCRIPTION'].fillna('') + ' ' + df['VISITORDESCRIPTION'].fillna('')
                
                # Extract shot type (2PT, 3PT)
                df['is_three_pointer'] = df['description'].str.contains('3PT', case=False, na=False)
                
                # Extract shot outcome (made or missed)
                df['is_made_shot'] = df['description'].str.contains('MADE', case=False, na=False)
                
                # Extract shot distance if available
                df['shot_distance'] = np.nan
                distance_mask = df['description'].str.contains('ft', case=False, na=False)
                
                if distance_mask.any():
                    # Extract distance from descriptions like "24 ft"
                    df.loc[distance_mask, 'shot_distance'] = df.loc[distance_mask, 'description'].str.extract(r'(\d+)\s*ft', expand=False).astype(float)
            
            # Calculate score differential
            if 'SCORE' in df.columns:
                # Extract home and away scores
                df[['home_score', 'away_score']] = df['SCORE'].str.split('-', expand=True).apply(pd.to_numeric, errors='coerce')
                
                # Calculate score differential (home - away)
                df['score_differential'] = df['home_score'] - df['away_score']
            
            # Convert period to quarter name
            if 'PERIOD' in df.columns:
                period_map = {1: '1st Quarter', 2: '2nd Quarter', 3: '3rd Quarter', 4: '4th Quarter'}
                df['quarter'] = df['PERIOD'].map(lambda x: period_map.get(x, f'OT{x-4}' if x > 4 else 'Unknown'))
            
            # Calculate time remaining in game (in seconds)
            if 'PERIOD' in df.columns and 'PCTIMESTRING' in df.columns:
                # Convert time string (MM:SS) to seconds
                df['period_seconds'] = df['PCTIMESTRING'].apply(
                    lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]) if isinstance(x, str) and ':' in x else np.nan
                )
                
                # Calculate total seconds remaining
                df['seconds_remaining'] = (
                    (4 - df['PERIOD'].clip(upper=4)) * 12 * 60  # Regular quarters remaining
                    + df['period_seconds']  # Seconds in current period
                    + (df['PERIOD'] > 4) * (df['PERIOD'] - 4) * 5 * 60  # Add overtime periods (5 min each)
                )
            
            # Extract player IDs for shots
            if 'PLAYER1_ID' in df.columns:
                df['shooter_id'] = df['PLAYER1_ID']
            
            if 'PLAYER2_ID' in df.columns:
                df['assister_id'] = df['PLAYER2_ID']
            
            if 'PLAYER3_ID' in df.columns:
                df['blocker_id'] = df['PLAYER3_ID']
            
            return df
        
        except Exception as e:
            self.logger.error(f"Error processing play-by-play data: {e}")
            return pbp_df
    
    def combine_pbp_with_shotchart(self, pbp_df: pd.DataFrame, shotchart_df: pd.DataFrame) -> pd.DataFrame:
        """
        Combine play-by-play data with shot chart data.
        
        Args:
            pbp_df: Play-by-play DataFrame
            shotchart_df: Shot chart DataFrame
            
        Returns:
            Combined DataFrame with shot location information
        """
        if pbp_df.empty or shotchart_df.empty:
            return pbp_df
        
        try:
            # Create copies to avoid modifying the originals
            pbp = pbp_df.copy()
            shots = shotchart_df.copy()
            
            # Filter play-by-play to shot events only
            shot_events = ['MISS', 'SHOT']
            pbp_shots = pbp[pbp['EVENTMSGTYPE'].isin([1, 2])]  # 1=SHOT, 2=MISS
            
            # Prepare for merge
            if 'GAME_ID' in shots.columns and 'GAME_ID' in pbp_shots.columns:
                # Convert to string for consistent joining
                pbp_shots['GAME_ID'] = pbp_shots['GAME_ID'].astype(str)
                shots['GAME_ID'] = shots['GAME_ID'].astype(str)
                
                # Merge on game_id, player_id, and similar event time
                # This is an approximate merge and may need refinement
                merged = pd.merge(
                    pbp_shots,
                    shots,
                    left_on=['GAME_ID', 'PLAYER1_ID'],
                    right_on=['GAME_ID', 'PLAYER_ID'],
                    suffixes=('_pbp', '_shot'),
                    how='left'
                )
                
                # Keep only the closest shot by time for each play-by-play event
                if 'GAME_CLOCK' in merged.columns and 'MINUTES_REMAINING' in merged.columns:
                    # Convert pbp time to seconds
                    merged['pbp_seconds'] = merged['PCTIMESTRING'].apply(
                        lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]) if isinstance(x, str) and ':' in x else np.nan
                    )
                    
                    # Convert shot chart time to seconds
                    merged['shot_seconds'] = merged['MINUTES_REMAINING'] * 60 + merged['SECONDS_REMAINING']
                    
                    # Calculate time difference
                    merged['time_diff'] = abs(merged['pbp_seconds'] - merged['shot_seconds'])
                    
                    # Group by play-by-play event and keep the closest shot
                    merged = merged.sort_values('time_diff').groupby('EVENTNUM').first().reset_index()
                
                # Add shot location to original play-by-play
                location_cols = ['LOC_X', 'LOC_Y', 'SHOT_DISTANCE', 'SHOT_TYPE', 'SHOT_ZONE_BASIC', 
                                'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE']
                
                for col in location_cols:
                    if col in merged.columns:
                        pbp = pd.merge(
                            pbp,
                            merged[['EVENTNUM', col]],
                            on='EVENTNUM',
                            how='left'
                        )
            
            return pbp
        
        except Exception as e:
            self.logger.error(f"Error combining play-by-play with shot chart: {e}")
            return pbp_df
    
    def crawl_season(self, season: str, start_date: Optional[str] = None, 
                     end_date: Optional[str] = None, 
                     teams: Optional[List[str]] = None,
                     include_tracking: bool = False) -> None:
        """
        Crawl play-by-play data for an entire NBA season or date range.

        For every game in the (optionally filtered) regular-season schedule
        this saves, under ``self.output_dir``: the raw play-by-play, the shot
        chart (when available), the processed play-by-play, optional player
        tracking data, and the Basketball-Reference play-by-play when a
        Basketball-Reference ID can be derived for the game.

        Args:
            season: Season in format "YYYY-YY" (e.g., "2021-22")
            start_date: Start date in YYYY-MM-DD format (optional). Compared
                as text against GAME_DATE, so both must be ISO formatted.
            end_date: End date in YYYY-MM-DD format (optional), same comparison
            teams: List of team abbreviations to crawl (optional); entries
                missing from TEAM_IDS are ignored with a warning
            include_tracking: Whether to include player tracking data (optional)
        """
        # NBA Stats abbreviations that Basketball-Reference spells differently.
        bref_abbreviations = {"BKN": "BRK", "CHA": "CHO", "PHX": "PHO"}

        def bref_game_id(game_row) -> Optional[str]:
            """Build the Basketball-Reference ID (YYYYMMDD0 + home team) for a schedule row."""
            # Assumes MATCHUP reads "HOME vs. AWAY" on the home team's row and
            # "AWAY @ HOME" on the visitor's row - TODO confirm against the
            # schedule endpoint.
            matchup = game_row.get('MATCHUP')
            if not isinstance(matchup, str):
                return None
            if ' vs. ' in matchup:
                home_team = matchup.split(' vs. ')[0]
            elif ' @ ' in matchup:
                home_team = matchup.split(' @ ')[1]
            else:
                return None
            home_team = home_team.strip()
            home_team = bref_abbreviations.get(home_team, home_team)
            game_day = pd.to_datetime(game_row['GAME_DATE']).strftime('%Y%m%d')
            return f"{game_day}0{home_team}"

        # Get the schedule
        schedule_df = self.get_schedule(season, "Regular Season")

        if schedule_df.empty:
            self.logger.error(f"Could not retrieve schedule for {season}")
            return

        # Filter by date if specified
        if start_date:
            schedule_df = schedule_df[schedule_df['GAME_DATE'] >= start_date]

        if end_date:
            schedule_df = schedule_df[schedule_df['GAME_DATE'] <= end_date]

        # Filter by teams if specified
        if teams:
            unknown_teams = [team for team in teams if team not in self.TEAM_IDS]
            if unknown_teams:
                self.logger.warning(f"Ignoring unknown team abbreviations: {unknown_teams}")

            team_ids = [self.TEAM_IDS[team] for team in teams if team in self.TEAM_IDS]
            team_mask = schedule_df['TEAM_ID'].isin(team_ids)
            # The opponent column is optional: not every schedule payload has
            # it, and indexing it unconditionally raised KeyError.
            if 'OPPONENT_TEAM_ID' in schedule_df.columns:
                team_mask = team_mask | schedule_df['OPPONENT_TEAM_ID'].isin(team_ids)
            schedule_df = schedule_df[team_mask]

        # One schedule row per game (first occurrence), in schedule order
        games = schedule_df.drop_duplicates(subset='GAME_ID', keep='first')
        total_games = len(games)
        self.logger.info(f"Found {total_games} games to process")

        # Process each game
        for i, (_, game_row) in enumerate(games.iterrows()):
            game_id = game_row['GAME_ID']
            self.logger.info(f"Processing game {i+1}/{total_games}: {game_id}")

            # Get play-by-play data
            pbp_df = self.get_play_by_play(str(game_id))

            if not pbp_df.empty:
                # Save raw play-by-play data
                pbp_file = os.path.join(self.output_dir, f"pbp_raw_{game_id}.csv")
                save_to_csv(pbp_df, pbp_file)

                # Get shot chart data
                shotchart_df = self.get_shot_chart(str(game_id))

                if not shotchart_df.empty:
                    # Save shot chart data
                    shot_file = os.path.join(self.output_dir, f"shotchart_{game_id}.csv")
                    save_to_csv(shotchart_df, shot_file)

                # Combine and process even when the shot chart is empty:
                # combine_pbp_with_shotchart then hands back the play-by-play
                # as is, so the processed file is still produced.
                combined_df = self.combine_pbp_with_shotchart(pbp_df, shotchart_df)
                processed_df = self.process_pbp_data(combined_df)

                # Save processed play-by-play data
                processed_file = os.path.join(self.output_dir, f"pbp_processed_{game_id}.csv")
                save_to_csv(processed_df, processed_file)

                # Get player tracking data if requested
                if include_tracking:
                    tracking_df = self.get_player_tracking(str(game_id))

                    if not tracking_df.empty:
                        # Save tracking data
                        tracking_file = os.path.join(self.output_dir, f"tracking_{game_id}.csv")
                        save_to_csv(tracking_df, tracking_file)

            # Get Basketball-Reference data. The ID is derived from the game
            # date and home team; a loop-index based ID could never match the
            # YYYYMMDD0XXX format and only produced failing requests.
            try:
                bref_id = bref_game_id(game_row)

                if bref_id is None:
                    self.logger.warning(
                        f"Cannot derive Basketball-Reference ID for game {game_id}; skipping"
                    )
                else:
                    bref_df = self.get_bref_pbp(bref_id)

                    if not bref_df.empty:
                        # Save Basketball-Reference play-by-play data
                        bref_file = os.path.join(self.output_dir, f"bref_pbp_{game_id}.csv")
                        save_to_csv(bref_df, bref_file)
            except Exception as e:
                self.logger.error(f"Error processing Basketball-Reference data for game {game_id}: {e}")

            # Avoid hitting rate limits
            rate_limit(2.0, 4.0)
    
    def merge_season_data(self, season: str, output_file: Optional[str] = None) -> pd.DataFrame:
        """
        Merge all crawled data for a season into a single dataset.
        
        Args:
            season: Season in format "YYYY-YY" (e.g., "2021-22")
            output_file: Path to save the merged data (optional)
            
        Returns:
            DataFrame containing the merged data
        """
        self.logger.info(f"Merging data for {season} season")
        
        # Find all processed play-by-play files
        pbp_files = [
            os.path.join(self.output_dir, f) 
            for f in os.listdir(self.output_dir) 
            if f.startswith("pbp_processed_") and f.endswith(".csv")
        ]
        
        # Read and merge play-by-play data
        pbp_dfs = []
        for file in pbp_files:
            try:
                df = pd.read_csv(file)
                pbp_dfs.append(df)
            except Exception as e:
                self.logger.error(f"Error reading {file}: {e}")
        
        if pbp_dfs:
            merged_data = pd.concat(pbp_dfs, ignore_index=True)
            merged_data = merged_data.drop_duplicates()
            
            # Save merged data if requested
            if output_file:
                save_to_csv(merged_data, output_file)
            
            return merged_data
        else:
            self.logger.warning("No data found to merge")
            return pd.DataFrame()


if __name__ == "__main__":
    # Demo run: crawl one week of a season for a handful of teams, then merge
    # whatever was written to the output directory.
    demo_season = "2021-22"

    nba_crawler = NBACrawler(log_file="logs/nba_crawler.log", output_dir="data/nba")

    crawl_options = dict(
        start_date="2021-10-19",      # 2021-22 opening night
        end_date="2021-10-26",        # one week is enough for the example
        teams=["LAL", "GSW", "BOS"],  # small sample of teams
        include_tracking=True,
    )
    nba_crawler.crawl_season(season=demo_season, **crawl_options)

    # Merge the per-game files into one CSV
    merged_data = nba_crawler.merge_season_data(
        output_file="data/nba/nba_2021_22_merged.csv",
        season=demo_season,
    )

