"""
Utility functions for data crawling.

This module provides common utility functions for crawling and processing sports data.
"""

import os
import logging
import time
import random
import pandas as pd
from typing import Dict, List, Any, Optional, Union
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def setup_logging(log_file: Optional[str] = None, level: int = logging.INFO) -> logging.Logger:
    """
    Set up logging configuration.

    The function configures the shared "sports_crawler" logger and is safe to
    call more than once: a handler is only attached when an equivalent one
    (console, or file handler for the same path) is not already present, so
    repeated calls do not produce duplicated log lines.

    Args:
        log_file: Path to log file. If None (or empty), logs to console only.
            A bare filename without a directory component is accepted.
        level: Logging level (default: logging.INFO)

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger("sports_crawler")
    logger.setLevel(level)

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Console handler. FileHandler subclasses StreamHandler, so an exact type
    # comparison is needed to recognise a previously attached console handler.
    has_console = any(type(h) is logging.StreamHandler for h in logger.handlers)
    if not has_console:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # File handler (optional)
    if log_file:
        # FileHandler stores its target as an absolute path in baseFilename.
        target = os.path.abspath(log_file)
        has_file = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target
            for h in logger.handlers
        )
        if not has_file:
            log_dir = os.path.dirname(log_file)
            # dirname is "" for a bare filename; os.makedirs("") would raise.
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    return logger


def create_session(retries: int = 3, backoff_factor: float = 0.3) -> requests.Session:
    """
    Build a requests session that automatically retries failed requests.

    Requests answered with HTTP 429, 500, 502, 503 or 504 are retried, and the
    same retrying adapter is mounted for both http:// and https:// URLs. The
    session also carries a browser-like User-Agent header.

    Args:
        retries: Total number of retries allowed per request
        backoff_factor: Backoff factor passed to the retry strategy

    Returns:
        Configured requests session
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            backoff_factor=backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )

    http = requests.Session()
    for prefix in ("http://", "https://"):
        http.mount(prefix, retrying_adapter)

    # Present a common desktop browser user agent
    http.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    return http


def rate_limit(min_delay: float = 1.0, max_delay: float = 3.0) -> None:
    """
    Pause the caller for a random interval to avoid overloading servers.

    The pause length is drawn uniformly from the given bounds and the call
    blocks for that many seconds.

    Args:
        min_delay: Lower bound of the pause in seconds
        max_delay: Upper bound of the pause in seconds
    """
    time.sleep(random.uniform(min_delay, max_delay))


def save_to_csv(data: Union[List[Dict[str, Any]], pd.DataFrame],
                output_path: str,
                mode: str = 'w',
                index: bool = False) -> None:
    """
    Save data to CSV file.

    The input is validated before anything is touched on disk, so an invalid
    call does not leave an empty output directory behind. The parent directory
    is created when the path has one. The header row is written unless the
    call appends to a file that already exists and is non-empty, so a file
    built purely from appends still starts with a header.

    Args:
        data: Data to save (non-empty list of dictionaries, or a DataFrame)
        output_path: Path to output CSV file; may be a bare filename
        mode: File opening mode ('w' for write, 'a' for append)
        index: Whether to write DataFrame index

    Raises:
        ValueError: If data is neither a non-empty list nor a DataFrame
    """
    if isinstance(data, pd.DataFrame):
        frame = data
    elif isinstance(data, list) and data:
        frame = pd.DataFrame(data)
    else:
        raise ValueError("Data must be a non-empty list of dictionaries or a DataFrame")

    parent_dir = os.path.dirname(output_path)
    # dirname is "" for a bare filename; os.makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Only skip the header when rows are being added below existing content.
    appending_to_existing = (
        mode.startswith('a')
        and os.path.exists(output_path)
        and os.path.getsize(output_path) > 0
    )
    frame.to_csv(output_path, mode=mode, index=index, header=not appending_to_existing)

