"""
NHL Play-by-Play Data Crawler

This module provides functionality to crawl and process NHL play-by-play data
from the NHL Stats API and Hockey-Reference.
"""

import os
import json
import time
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Union, Tuple
import logging

from .utils import create_session, rate_limit, save_to_csv, setup_logging


class NHLCrawler:
    """
    NHL Play-by-Play data crawler for NHL Stats API and Hockey-Reference.
    
    This class provides methods to fetch and process NHL play-by-play data,
    focusing on shot-by-shot information, game states, and player tracking data.

    The methods of this class fetch the schedule, the per-game live feed,
    the boxscore and the shifts resource from the NHL Stats API, flatten
    play-by-play events into dictionaries, enrich shot events, and write
    per-game files into ``output_dir``.

    NOTE(review): ``HOCKEY_REF_URL`` is declared below but no method of this
    class references it, so Hockey-Reference is not actually crawled here.
    """
    
    # Base URLs
    # The endpoint templates are appended to NHL_API_BASE; "{}" is filled with
    # the game ID via str.format in the corresponding get_* method.
    NHL_API_BASE = "https://statsapi.web.nhl.com/api/v1"
    NHL_SCHEDULE_ENDPOINT = "/schedule"
    NHL_GAME_ENDPOINT = "/game/{}/feed/live"
    NHL_BOXSCORE_ENDPOINT = "/game/{}/boxscore"
    NHL_SHIFTS_ENDPOINT = "/game/{}/shifts"
    
    # Hockey-Reference URL
    # Unused by the methods of this class (see the class docstring).
    HOCKEY_REF_URL = "https://www.hockey-reference.com/boxscores/{}.html"
    
    # Team IDs mapping
    # Team abbreviation -> numeric id; get_schedule sends the id as the
    # "teamId" query parameter when a known abbreviation is passed.
    TEAM_IDS = {
        "ANA": 24, "ARI": 53, "BOS": 6, "BUF": 7, "CGY": 20,
        "CAR": 12, "CHI": 16, "COL": 21, "CBJ": 29, "DAL": 25,
        "DET": 17, "EDM": 22, "FLA": 13, "LAK": 26, "MIN": 30,
        "MTL": 8, "NSH": 18, "NJD": 1, "NYI": 2, "NYR": 3,
        "OTT": 9, "PHI": 4, "PIT": 5, "SJS": 28, "SEA": 55,
        "STL": 19, "TBL": 14, "TOR": 10, "VAN": 23, "VGK": 54,
        "WSH": 15, "WPG": 52
    }
    
    # Event types
    # Feed "eventTypeId" -> label stored in the "event_type" field by
    # process_game_events; ids missing from this table are stored unchanged.
    EVENT_TYPES = {
        "SHOT": "Shot",
        "GOAL": "Goal",
        "MISSED_SHOT": "Missed Shot",
        "BLOCKED_SHOT": "Blocked Shot",
        "HIT": "Hit",
        "FACEOFF": "Faceoff",
        "GIVEAWAY": "Giveaway",
        "TAKEAWAY": "Takeaway",
        "PENALTY": "Penalty",
        "STOP": "Stoppage"
    }
    
    def __init__(self, output_dir: str = "data/nhl", log_file: Optional[str] = None):
        """
        Set up a crawler that writes its results under ``output_dir``.

        Args:
            output_dir: Directory that receives the crawled files; it is
                created (parents included) when it does not exist yet.
            log_file: Optional log file path, forwarded to ``setup_logging``.
        """
        self.output_dir = output_dir

        # The HTTP session and the logger both come from the project helpers.
        http_session = create_session(retries=5, backoff_factor=0.5)
        self.session = http_session
        crawler_logger = setup_logging(log_file=log_file)
        self.logger = crawler_logger

        # Guarantee the destination exists before any crawl writes to it.
        os.makedirs(self.output_dir, exist_ok=True)
    
    def get_schedule(self, start_date: str, end_date: str, team: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get NHL schedule for a specific date range.

        Args:
            start_date: Start date in YYYY-MM-DD format
            end_date: End date in YYYY-MM-DD format
            team: Team abbreviation (optional, case-insensitive). An
                abbreviation missing from ``TEAM_IDS`` is reported with a
                warning and the unfiltered schedule is requested.

        Returns:
            Flat list of the game dictionaries found under every entry of the
            response's "dates" list. An empty list is returned when the
            request or the parsing fails (the error is logged).
        """
        url = f"{self.NHL_API_BASE}{self.NHL_SCHEDULE_ENDPOINT}"

        params = {
            "startDate": start_date,
            "endDate": end_date,
            "expand": "schedule.teams,schedule.linescore,schedule.game.content.media.epg"
        }

        if team:
            team_id = self.TEAM_IDS.get(team.upper())
            if team_id is None:
                # Previously an unknown abbreviation was dropped silently and
                # the caller received the league-wide schedule unannounced.
                self.logger.warning(
                    f"Unknown team abbreviation {team!r}; fetching schedule for all teams"
                )
            else:
                params["teamId"] = team_id

        self.logger.info(f"Fetching NHL schedule from {start_date} to {end_date}")

        try:
            # Bound the wait so a stalled connection cannot hang the crawl.
            # Assumes self.session is requests-compatible — TODO confirm
            # against create_session.
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            games = [
                game
                for day in data.get("dates", [])
                for game in day.get("games", [])
            ]

            self.logger.info(f"Retrieved {len(games)} games")
            return games

        except Exception as e:
            # Best-effort fetch: log and let the caller continue with no games.
            self.logger.error(f"Error fetching NHL schedule: {e}")
            return []
    
    def get_game_data(self, game_id: str) -> Dict[str, Any]:
        """
        Get detailed game data (the live feed resource) from NHL Stats API.

        Args:
            game_id: NHL game ID

        Returns:
            Parsed JSON payload as a dictionary, or an empty dictionary when
            the request fails or the payload is not a JSON object (the
            problem is logged in both cases).
        """
        url = f"{self.NHL_API_BASE}{self.NHL_GAME_ENDPOINT.format(game_id)}"

        self.logger.info(f"Fetching game data for game {game_id}")

        try:
            # Bound the wait so a stalled connection cannot hang the crawl.
            # Assumes self.session is requests-compatible — TODO confirm
            # against create_session.
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Best-effort fetch: log and let the caller skip this game.
            self.logger.error(f"Error fetching game data for {game_id}: {e}")
            return {}

        if not isinstance(data, dict):
            # Callers index the result by key; anything else is unusable.
            self.logger.error(
                f"Error fetching game data for {game_id}: unexpected payload type {type(data).__name__}"
            )
            return {}

        return data
    
    def get_boxscore(self, game_id: str) -> Dict[str, Any]:
        """
        Get boxscore data from NHL Stats API.

        Args:
            game_id: NHL game ID

        Returns:
            Parsed JSON payload as a dictionary, or an empty dictionary when
            the request fails or the payload is not a JSON object (the
            problem is logged in both cases).
        """
        url = f"{self.NHL_API_BASE}{self.NHL_BOXSCORE_ENDPOINT.format(game_id)}"

        self.logger.info(f"Fetching boxscore for game {game_id}")

        try:
            # Bound the wait so a stalled connection cannot hang the crawl.
            # Assumes self.session is requests-compatible — TODO confirm
            # against create_session.
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Best-effort fetch: log and let the caller go on without it.
            self.logger.error(f"Error fetching boxscore for {game_id}: {e}")
            return {}

        if not isinstance(data, dict):
            # Callers index the result by key; anything else is unusable.
            self.logger.error(
                f"Error fetching boxscore for {game_id}: unexpected payload type {type(data).__name__}"
            )
            return {}

        return data
    
    def get_shifts(self, game_id: str) -> Dict[str, Any]:
        """
        Get shift data from NHL Stats API.

        Args:
            game_id: NHL game ID

        Returns:
            Parsed JSON payload as a dictionary, or an empty dictionary when
            the request fails or the payload is not a JSON object (the
            problem is logged in both cases).
        """
        url = f"{self.NHL_API_BASE}{self.NHL_SHIFTS_ENDPOINT.format(game_id)}"

        self.logger.info(f"Fetching shifts for game {game_id}")

        try:
            # Bound the wait so a stalled connection cannot hang the crawl.
            # Assumes self.session is requests-compatible — TODO confirm
            # against create_session.
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Best-effort fetch: log and let the caller skip the shifts file.
            self.logger.error(f"Error fetching shifts for {game_id}: {e}")
            return {}

        if not isinstance(data, dict):
            # The declared return type is a dictionary; reject anything else.
            self.logger.error(
                f"Error fetching shifts for {game_id}: unexpected payload type {type(data).__name__}"
            )
            return {}

        return data
    
    def process_game_events(self, game_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process play-by-play events from game data.

        Every entry of ``liveData.plays.allPlays`` becomes one flat
        dictionary carrying game metadata, period/clock, score, coordinates,
        team and the players involved (``player<N>_id`` / ``_name`` /
        ``_position`` / ``_role``, numbered by position in the feed's list).
        Shot-like events additionally get ``is_shot`` / ``is_goal`` (and
        ``is_blocked``), goals get ``strength`` / ``is_empty_net`` and
        penalties get ``penalty_type`` / ``penalty_minutes``.

        Missing optional sections of the payload yield ``None`` values
        instead of raising, so one incomplete game cannot abort a crawl.

        Args:
            game_data: Game data from NHL Stats API

        Returns:
            List of processed play events (empty when the payload has no
            plays section).
        """
        if not game_data or "liveData" not in game_data or "plays" not in game_data["liveData"]:
            return []

        all_plays = game_data["liveData"]["plays"].get("allPlays") or []

        # Get game metadata
        game_meta = game_data.get("gameData") or {}
        game_id = game_data.get("gamePk")
        # NOTE(review): the date is taken verbatim from the part of
        # "dateTime" before "T"; if that timestamp is UTC, late local games
        # land on the following calendar day — confirm the intended timezone.
        date_time = (game_meta.get("datetime") or {}).get("dateTime") or ""
        game_date = date_time.split("T")[0] or None
        teams = game_meta.get("teams") or {}
        home_team = (teams.get("home") or {}).get("abbreviation")
        away_team = (teams.get("away") or {}).get("abbreviation")

        # Get player info for mapping IDs to names
        player_info = {}
        for raw_id, player_data in (game_meta.get("players") or {}).items():
            # Keys look like "ID8471675"; plays reference the bare number.
            player_key = raw_id[2:] if raw_id.startswith("ID") else raw_id
            player_info[player_key] = {
                "fullName": player_data.get("fullName"),
                "position": (player_data.get("primaryPosition") or {}).get("code")
            }

        plays = []
        for idx, play in enumerate(all_plays):
            result = play.get("result") or {}
            about = play.get("about") or {}
            goals = about.get("goals") or {}
            coordinates = play.get("coordinates") or {}

            event_type = result.get("eventTypeId")
            home_score = goals.get("home", 0)
            away_score = goals.get("away", 0)

            # Create base event dictionary
            event = {
                "game_id": game_id,
                "game_date": game_date,
                "event_idx": idx,
                "event_id": about.get("eventIdx"),
                "period": about.get("period"),
                "period_type": about.get("periodType"),
                "period_time": about.get("periodTime"),
                "period_time_remaining": about.get("periodTimeRemaining"),
                "event_type": self.EVENT_TYPES.get(event_type, event_type),
                "description": result.get("description", ""),
                "home_team": home_team,
                "away_team": away_team,
                "home_score": home_score,
                "away_score": away_score,
                "score_differential": home_score - away_score,
                "x_coord": coordinates.get("x"),
                "y_coord": coordinates.get("y")
            }

            # Add team info
            if "team" in play:
                team = play["team"] or {}
                event["team_id"] = team.get("id")
                event["team_name"] = team.get("name")
                event["team_abbrev"] = team.get("triCode")

            # Add player info
            for slot, player in enumerate(play.get("players") or [], start=1):
                raw_player_id = (player.get("player") or {}).get("id")
                if raw_player_id is None:
                    # Keep slot numbers aligned with the feed's list order.
                    continue
                player_id = str(raw_player_id)

                event[f"player{slot}_id"] = player_id

                # Add player name if available
                if player_id in player_info:
                    event[f"player{slot}_name"] = player_info[player_id]["fullName"]
                    event[f"player{slot}_position"] = player_info[player_id]["position"]

                event[f"player{slot}_role"] = player.get("playerType")

            # Add specific fields based on event type
            if event_type in ("SHOT", "GOAL", "MISSED_SHOT"):
                event["is_shot"] = True
                event["is_goal"] = (event_type == "GOAL")

                # Extract shot type if available
                if "secondaryType" in result:
                    event["shot_type"] = result["secondaryType"]

            elif event_type == "BLOCKED_SHOT":
                event["is_shot"] = True
                event["is_goal"] = False
                event["is_blocked"] = True

            # Add strength info for goals
            if event_type == "GOAL":
                strength = result.get("strength") or {}
                if strength:
                    event["strength"] = strength.get("name")
                # The feed reports empty-net goals through the boolean
                # "emptyNet" on the result; the strength code only encodes
                # manpower. The old "EN" comparison is kept as a fallback.
                event["is_empty_net"] = bool(result.get("emptyNet", False)) or strength.get("code") == "EN"

            # Add penalty info
            if event_type == "PENALTY" and "secondaryType" in result:
                event["penalty_type"] = result["secondaryType"]
                event["penalty_minutes"] = result.get("penaltyMinutes", 0)

            plays.append(event)

        return plays
    
    def process_shot_sequence(self, plays: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process shot sequences to add context from preceding events.

        For every play flagged ``is_shot`` the following keys are set:

        * ``preceding_event_1..3`` – ``event_type`` of the up to three plays
          immediately before the shot (nearest first), ``None`` when absent.
        * ``is_rebound`` – the directly preceding play was a shot, missed
          shot or blocked shot.
        * ``is_rush`` – at least two plays precede the shot and one of the
          nearest two is a takeaway, faceoff or giveaway.
        * ``seconds_since_last_shot`` – clock seconds since the previous
          ``is_shot`` play in the same period; ``None`` for the first shot
          and when the previous shot belongs to another period.

        ``seconds_since_last_shot`` used to be written for rebounds only,
        with ``100`` as a stand-in across periods; same-period rebound
        values are unchanged.

        The play dictionaries are updated in place.

        Args:
            plays: List of play events

        Returns:
            The plays sorted by ``event_idx`` with the sequence context added
            (the input itself when it is empty).
        """
        if not plays:
            return plays

        # Sort plays by event index to ensure chronological order
        plays_sorted = sorted(plays, key=lambda x: x["event_idx"])

        rebound_sources = ("Shot", "Missed Shot", "Blocked Shot")
        transition_events = ("Takeaway", "Faceoff", "Giveaway")

        # Remember the latest shot instead of scanning backwards per shot.
        last_shot = None

        for i, play in enumerate(plays_sorted):
            # Only process shots
            if not play.get("is_shot", False):
                continue

            # Up to three preceding event types, nearest first
            window = plays_sorted[max(0, i - 3):i]
            preceding_events = [earlier["event_type"] for earlier in reversed(window)]
            padded = preceding_events + [None] * (3 - len(preceding_events))

            play["preceding_event_1"] = padded[0]
            play["preceding_event_2"] = padded[1]
            play["preceding_event_3"] = padded[2]

            # Rebound: the previous event was itself a shot attempt
            play["is_rebound"] = padded[0] in rebound_sources

            # Rush: a transition event among the two nearest predecessors
            play["is_rush"] = len(preceding_events) >= 2 and any(
                event in transition_events for event in preceding_events[:2]
            )

            # Time since the previous shot, only meaningful within a period
            if last_shot is not None and last_shot.get("period") == play.get("period"):
                curr_time = self._convert_time_to_seconds(play.get("period_time"))
                prev_time = self._convert_time_to_seconds(last_shot.get("period_time"))
                play["seconds_since_last_shot"] = curr_time - prev_time
            else:
                play["seconds_since_last_shot"] = None

            last_shot = play

        return plays_sorted
    
    def _convert_time_to_seconds(self, time_str: str) -> int:
        """
        Convert MM:SS time string to seconds.

        Only the first two colon-separated fields are used. Anything that is
        not a string containing ":" (``None``, ``""``, the float NaN pandas
        yields for empty CSV cells, ...) or whose fields are not integers
        converts to 0 instead of raising.

        Args:
            time_str: Time string in MM:SS format

        Returns:
            Time in seconds, or 0 when the value cannot be parsed
        """
        # The isinstance guard matters: `":" not in <float>` raises TypeError.
        if not isinstance(time_str, str) or ":" not in time_str:
            return 0

        parts = time_str.split(":")
        try:
            return int(parts[0]) * 60 + int(parts[1])
        except ValueError:
            # Non-numeric minutes/seconds, e.g. "ab:cd" or "12:".
            return 0
    
    def add_goalie_info(self, plays: List[Dict[str, Any]], boxscore: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Add goalie information to shot events.

        For each play flagged ``is_shot`` that has a ``team_abbrev``, sets
        ``goalie_id`` and ``goalie_name``:

        1. If the play itself lists a player whose role is "Goalie", that
           goalie is used (its boxscore id and name when it is listed there).
        2. Otherwise, unless the play is flagged ``is_empty_net``, the first
           goalie listed for the defending team is used. This fallback does
           not track goalie changes.

        The defending team is the away team when ``team_abbrev`` equals the
        play's ``home_team`` and the home team otherwise.
        NOTE(review): this treats ``team_abbrev`` as the shooting team for
        every shot-like event, blocked shots included — confirm that holds.

        The play dictionaries are updated in place. Any error is logged and
        the plays are returned as they are at that point.

        Args:
            plays: List of play events
            boxscore: Boxscore data from NHL API

        Returns:
            List of plays with added goalie information
        """
        if not plays or not boxscore or "teams" not in boxscore:
            return plays

        try:
            teams = boxscore["teams"]
            home_goalies = self._boxscore_goalies(teams["home"])
            away_goalies = self._boxscore_goalies(teams["away"])

            # Lets an event's (string) goalie id be resolved to the boxscore entry.
            known_goalies = {
                str(goalie_id): (goalie_id, info)
                for side in (home_goalies, away_goalies)
                for goalie_id, info in side.items()
            }

            for play in plays:
                if not play.get("is_shot", False):
                    continue

                shooting_team = play.get("team_abbrev")
                if not shooting_team:
                    continue

                event_goalie = self._event_goalie(play)
                if event_goalie is not None:
                    event_id, event_name = event_goalie
                    if str(event_id) in known_goalies:
                        goalie_id, info = known_goalies[str(event_id)]
                        play["goalie_id"] = goalie_id
                        play["goalie_name"] = info["name"] or event_name
                    else:
                        play["goalie_id"] = event_id
                        play["goalie_name"] = event_name
                    continue

                if play.get("is_empty_net"):
                    # Nobody was in net, so there is no goalie to credit.
                    continue

                if shooting_team == play.get("home_team"):
                    defending_goalies = away_goalies
                else:
                    defending_goalies = home_goalies

                if defending_goalies:
                    goalie_id = next(iter(defending_goalies))
                    play["goalie_id"] = goalie_id
                    play["goalie_name"] = defending_goalies[goalie_id]["name"]

            return plays

        except Exception as e:
            self.logger.error(f"Error adding goalie info: {e}")
            return plays

    @staticmethod
    def _boxscore_goalies(team_box: Dict[str, Any]) -> Dict[Any, Dict[str, Any]]:
        """Map goalie id -> {"name", "jersey"} for one team of a boxscore."""
        players = team_box.get("players") or {}
        # The boxscore lists goalie ids next to "players"; the previous
        # lookup inside "players" is kept as a fallback.
        goalie_ids = team_box.get("goalies")
        if goalie_ids is None:
            goalie_ids = players.get("goalies") or []

        goalies = {}
        for goalie_id in goalie_ids:
            goalie_data = players.get(f"ID{goalie_id}")
            if goalie_data:
                goalies[goalie_id] = {
                    "name": (goalie_data.get("person") or {}).get("fullName"),
                    "jersey": goalie_data.get("jerseyNumber")
                }
        return goalies

    @staticmethod
    def _event_goalie(play: Dict[str, Any]) -> Optional[Tuple[Any, Any]]:
        """Return (id, name) of the player with role "Goalie" on a play, if any."""
        slot = 1
        while f"player{slot}_role" in play:
            if play[f"player{slot}_role"] == "Goalie":
                goalie_id = play.get(f"player{slot}_id")
                if goalie_id is not None:
                    return goalie_id, play.get(f"player{slot}_name")
            slot += 1
        return None
    
    def calculate_shot_features(self, plays: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Calculate additional shot features.

        For every play flagged ``is_shot`` that has both coordinates, adds:

        * ``shot_distance`` – Euclidean distance to the goal.
        * ``shot_angle`` – angle in degrees between the shot and the rink's
          long axis through the goal; 0 is straight on and 90 is level with
          the goal line.

        The target goal is the one on the side of the rink the shot was
        taken from: x = 89 for ``x >= 0``, x = -89 otherwise, both at y = 0.
        This is a simplification, so a shot taken from the far half of the
        rink is measured against the nearer goal. The previous code picked
        the opposite goal, contradicting its own comment.

        The play dictionaries are updated in place.

        Args:
            plays: List of play events

        Returns:
            List of plays with added shot features
        """
        if not plays:
            return plays

        # Distance from center ice to each goal line along x.
        goal_line_x = 89.0

        for play in plays:
            if not play.get("is_shot", False):
                continue

            x = play.get("x_coord")
            y = play.get("y_coord")
            if x is None or y is None:
                continue

            goal_x = goal_line_x if x >= 0 else -goal_line_x
            x_offset = x - goal_x

            play["shot_distance"] = float(np.hypot(x_offset, y))

            if x_offset != 0:
                # Both legs are taken as magnitudes, so the angle is in [0, 90).
                play["shot_angle"] = float(np.degrees(np.arctan2(abs(y), abs(x_offset))))
            else:
                # Exactly on the goal line: treated as fully from the side.
                play["shot_angle"] = 90.0

        return plays
    
    def crawl_season(self, year: int, start_date: Optional[str] = None, 
                     end_date: Optional[str] = None, 
                     teams: Optional[List[str]] = None) -> None:
        """
        Crawl play-by-play data for an entire NHL season or date range.

        The date range is walked in consecutive, non-overlapping chunks (a
        start day plus up to 14 further days). For every scheduled game the
        enriched play-by-play is written to ``pbp_<game_id>.csv`` and the
        shifts payload to ``shifts_<game_id>.json`` inside ``output_dir``,
        followed by a ``rate_limit`` pause.

        A failure while processing one game is logged and the crawl moves
        on to the next game instead of aborting.

        Args:
            year: Season year (e.g., 2022 for the 2021-22 season). Only used
                for the default dates and the log message.
            start_date: Start date in YYYY-MM-DD format (optional, defaults
                to October 1st of ``year - 1``)
            end_date: End date in YYYY-MM-DD format (optional, defaults to
                June 30th of ``year``)
            teams: List of team abbreviations to crawl (optional); a game is
                kept when either side is in the list

        Raises:
            ValueError: If a date string does not match YYYY-MM-DD.
        """
        # Set default dates for the season
        if not start_date:
            start_date = f"{year-1}-10-01"  # Approximate season start
        if not end_date:
            end_date = f"{year}-06-30"  # Approximate season end including playoffs

        self.logger.info(f"Starting NHL season crawl for {year-1}-{year} ({start_date} to {end_date})")

        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")

        wanted_teams = set(teams) if teams else None

        # Process date range in chunks to avoid timeout/memory issues
        chunk_size = timedelta(days=14)
        current_dt = start_dt

        while current_dt <= end_dt:
            chunk_end = min(current_dt + chunk_size, end_dt)
            chunk_start_str = current_dt.strftime("%Y-%m-%d")
            chunk_end_str = chunk_end.strftime("%Y-%m-%d")

            self.logger.info(f"Processing games from {chunk_start_str} to {chunk_end_str}")

            games = self.get_schedule(chunk_start_str, chunk_end_str)

            # Filter by teams if specified
            if wanted_teams:
                games = [
                    game for game in games
                    if any(abbr in wanted_teams for abbr in self._schedule_team_abbreviations(game))
                ]

            for game in games:
                game_id = game.get("gamePk")
                if game_id is None:
                    self.logger.error("Skipping schedule entry without gamePk")
                    continue

                self.logger.info(f"Processing game {game_id}")

                try:
                    self._crawl_game(game_id)
                except Exception as e:
                    # One broken game must not end a crawl that can span months.
                    self.logger.error(f"Error processing game {game_id}: {e}")

                # Avoid hitting rate limits
                rate_limit(1.5, 3.0)

            # Move to next chunk
            current_dt = chunk_end + timedelta(days=1)

    @staticmethod
    def _schedule_team_abbreviations(game: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
        """Return the (home, away) abbreviations of a schedule entry; None where absent."""
        sides = game.get("teams") or {}
        home = ((sides.get("home") or {}).get("team") or {}).get("abbreviation")
        away = ((sides.get("away") or {}).get("team") or {}).get("abbreviation")
        return home, away

    def _crawl_game(self, game_id: Any) -> None:
        """Fetch, enrich and persist the play-by-play and shift data of one game."""
        game_key = str(game_id)

        game_data = self.get_game_data(game_key)
        if game_data:
            # Get boxscore for goalie information
            boxscore = self.get_boxscore(game_key)

            plays = self.process_game_events(game_data)
            if plays:
                plays = self.process_shot_sequence(plays)

                if boxscore:
                    plays = self.add_goalie_info(plays, boxscore)

                plays = self.calculate_shot_features(plays)

                pbp_file = os.path.join(self.output_dir, f"pbp_{game_id}.csv")
                save_to_csv(plays, pbp_file)

        # Shifts are fetched even when the live feed was unavailable.
        shifts = self.get_shifts(game_key)
        if shifts:
            shifts_file = os.path.join(self.output_dir, f"shifts_{game_id}.json")
            with open(shifts_file, "w", encoding="utf-8") as f:
                json.dump(shifts, f)
    
    def merge_season_data(self, year: int, output_file: Optional[str] = None) -> pd.DataFrame:
        """
        Merge all crawled data for a season into a single dataset.

        Reads the ``pbp_<game_id>.csv`` files in ``output_dir`` whose game id
        starts with ``year - 1``, concatenates them in sorted file order,
        drops duplicate rows and keeps only rows whose ``is_shot`` is True.
        Two columns are then (re)computed:

        * ``game_seconds`` – ``(period - 1) * 1200`` plus the ``period_time``
          clock in seconds (an unparsable or missing clock counts as 0).
        * ``is_goal`` – whether ``event_type`` equals "Goal".

        NOTE(review): the season filter relies on NHL game ids beginning
        with the season's starting year (e.g. 2021020001 for 2021-22).
        Previously ``year`` was ignored and every ``pbp_*.csv`` in the
        directory was merged — confirm no caller relied on that.

        Args:
            year: Season year (e.g., 2022 for the 2021-22 season)
            output_file: Path to save the merged data (optional)

        Returns:
            DataFrame containing the merged shot data; an empty DataFrame
            when no readable file or no ``is_shot`` column is found.
        """
        self.logger.info(f"Merging data for {year-1}-{year} season")

        # Find this season's play-by-play files; sorted for a stable row order
        season_prefix = f"pbp_{year - 1}"
        pbp_files = sorted(
            os.path.join(self.output_dir, name)
            for name in os.listdir(self.output_dir)
            if name.startswith(season_prefix) and name.endswith(".csv")
        )

        # Read play-by-play data, skipping files that cannot be parsed
        pbp_dfs = []
        for file in pbp_files:
            try:
                pbp_dfs.append(pd.read_csv(file))
            except Exception as e:
                self.logger.error(f"Error reading {file}: {e}")

        if not pbp_dfs:
            self.logger.warning("No data found to merge")
            return pd.DataFrame()

        merged_data = pd.concat(pbp_dfs, ignore_index=True).drop_duplicates()

        if "is_shot" not in merged_data.columns:
            # Happens when none of the merged games recorded a shot event.
            self.logger.warning("No shot events found in merged data")
            return pd.DataFrame()

        # Filter shots only; non-shot rows hold NaN here, which is not True
        shots_data = merged_data[merged_data["is_shot"] == True].copy()

        # Convert period time to seconds from start of game. Built as an
        # index-aligned Series so an empty selection works too, and empty
        # CSV cells (read back as float NaN) count as 0 seconds.
        clock_seconds = pd.Series(
            [
                self._convert_time_to_seconds(value) if isinstance(value, str) else 0
                for value in shots_data["period_time"]
            ],
            index=shots_data.index,
            dtype="int64",
        )
        shots_data["game_seconds"] = (shots_data["period"] - 1) * 1200 + clock_seconds

        # Create binary outcome for goals
        shots_data["is_goal"] = shots_data["event_type"] == "Goal"

        # Save merged data if requested
        if output_file:
            save_to_csv(shots_data, output_file)

        return shots_data


if __name__ == "__main__":
    # Demonstration run: crawl the opening week of the 2021-22 season for
    # three teams, then merge the resulting shot data into one CSV file.
    example_teams = ["TOR", "MTL", "NYR"]

    crawler = NHLCrawler(
        output_dir="data/nhl",
        log_file="logs/nhl_crawler.log",
    )

    # Opening night through one week later keeps the example short.
    crawler.crawl_season(
        year=2022,
        start_date="2021-10-12",
        end_date="2021-10-19",
        teams=example_teams,
    )

    # Combine the per-game files into the season-level shots dataset.
    shots_data = crawler.merge_season_data(
        year=2022,
        output_file="data/nhl/nhl_2021_22_shots.csv",
    )

