import numpy as np
import pandas as pd
import statsmodels.api as sm
from typing import Union
from numpy.testing import *

def Markowitz(r:Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """
    Compute portfolio weights w = pinv(Sigma) @ mu, where Sigma is the
    uncentered second-moment matrix r'r / T and mu is the vector of column means.

    r : numpy.ndarray or pandas.DataFrame
        Panel of asset returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally.

    Returns a 1-dimensional numpy.ndarray with one weight per column of [r].
    Raises AssertionError if [r] is not 2-dimensional.
    """
    returns = r.values if isinstance(r, pd.DataFrame) else r

    n_dims = np.ndim(returns)
    assert n_dims==2, f"Dimension of [r] is {n_dims}. It must be 2."

    # Uncentered second moments (no demeaning), averaged over the time axis.
    n_periods = returns.shape[0]
    second_moment = np.dot(returns.T, returns) / n_periods
    mean_returns = np.mean(returns, axis=0)

    # The pseudo-inverse keeps the computation defined for a singular moment matrix.
    return np.dot(np.linalg.pinv(second_moment), mean_returns)


def sharpe(r:Union[np.ndarray, pd.Series, pd.DataFrame]) -> Union[float, np.ndarray]:
    """
    Compute the mean of returns scaled by their standard deviation
    (numpy's default std, i.e. ddof=0).

    r : numpy.ndarray or pandas.Series or pandas.DataFrame
        if [r] has a dimension of 1, it is a time series of returns.
        If [r] has a dimension of 2, it is a panel of asset returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally.

    Returns a scalar for 1-dimensional input, and a 1-dimensional array with
    one value per column for 2-dimensional input.
    Raises AssertionError if the dimension of [r] is neither 1 nor 2.
    """
    if isinstance(r, (pd.Series, pd.DataFrame)):
        r = r.values

    n_dims = np.ndim(r)

    if n_dims == 1:
        return np.mean(r / r.std())

    if n_dims == 2:
        # Scale every column by its own standard deviation and average over time,
        # so that each asset gets its own ratio.
        return np.mean(r / r.std(axis=0), axis=0)

    raise AssertionError(f"Dimension of [r] is {n_dims}. It must be 1 or 2.")


def get_factor_names(factor_model:str='FF3') -> pd.Index:
    """
    Return the factor column names belonging to a factor model.

    factor_model : str
        One of 'CAPM', 'FF3', 'FF5', 'FFC4' or 'FFC6' (case-insensitive).

    Returns a pandas.Index of factor names.
    Raises ValueError if [factor_model] is not one of the supported models.
    """
    factors_by_model = {
        'CAPM': ['Mkt-RF'],
        'FF3': ['Mkt-RF','SMB','HML'],
        'FF5': ['Mkt-RF','SMB','HML','RMW','CMA'],
        'FFC4': ['Mkt-RF','SMB','HML','Mom'],
        'FFC6': ['Mkt-RF','SMB','HML','RMW','CMA','Mom'],
    }

    key = factor_model.upper()
    if key not in factors_by_model:
        # Fail with a clear message instead of hitting an unbound local variable.
        raise ValueError(f"Unknown factor model [{factor_model}]. It must be one of {list(factors_by_model)}.")

    return pd.Index(factors_by_model[key])


def compute_residuals_of_cross_sectional_regression(asset_rtrn:pd.Series, beta:pd.DataFrame, rf:float) -> pd.Series:
    """
    Residuals of an intercept-free OLS regression of one cross-section of
    excess returns on factor loadings, i.e.,
        r_{i,t} - rf_t = beta_i' * f_t + eps_{i,t}
    for a fixed time t.

    asset_rtrn : pandas.Series
        A single cross-section of returns (not a panel).

    beta : pandas.DataFrame
        Each row holds the factor loadings of one asset.

    rf : float
        Risk free rate at the time of the cross section.

    Returns a pandas.Series indexed like [asset_rtrn] and carrying its name.
    Assets that did not enter the regression (missing return or missing
    loadings) are NaN.
    """
    assert np.ndim(asset_rtrn)==1, f"Dimension of [asset_rtrn] is {np.ndim(asset_rtrn)}. It must be 1."

    # Dependent variable: non-missing excess returns, scaled by 100 for the fit.
    y = (asset_rtrn.dropna() - rf) * 100

    # Independent variables: assets with a complete row of loadings.
    loadings = beta.dropna()

    common_assets = y.index.intersection(loadings.index)

    if len(common_assets) == 0:
        # Nothing to regress on: every asset gets a missing residual.
        residuals = pd.Series(None, index=asset_rtrn.index, dtype=float)
    else:
        fit = sm.OLS(y[common_assets], loadings.loc[common_assets]).fit()
        fitted_resid = fit.resid / 100  # undo the scaling by 100
        not_fitted = asset_rtrn.index.difference(fitted_resid.index)
        residuals = pd.concat([fitted_resid, pd.Series(None, index=not_fitted, dtype=float)])

    # Restore the name and ordering of the input.
    residuals.name = asset_rtrn.name
    return residuals.loc[asset_rtrn.index]


def estimate_coef_by_time_series_regression(asset_rtrn:Union[pd.Series, pd.DataFrame], factor_rtrn:pd.DataFrame, rf:pd.Series=None, intercept:bool=True, min_non_missing:int=12) -> Union[pd.Series, pd.DataFrame]:
    """
    Compute estimated coefficients (i.e., factor loading) of regression of a time-series of excess returns on factor returns, i.e.,
        r_{i,t} - rf_t = alpha_i + beta_i' * f_t + eps_{i,t}
    for a fixed asset i. (alpha_i is estimated only if [intercept]==True.)

    asset_rtrn : pandas.Series or pandas.DataFrame
        A time-series of returns (if its data type is pandas.Series)
        A panel of asset returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally (if its data type is pandas.DataFrame)

    factor_rtrn : pandas.DataFrame
        A panel of factor returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally.

    rf : pandas.Series
        A time-series of Risk free rates.
        If None, rf_t is assumed to be zero for all t.

    intercept : bool
        Determines whether to estimate alpha of the time-series regression.

    min_non_missing : int
        If the number of non-missing values in a firm's [asset_rtrn-rf] is less than [min_non_missing],
        then time-series regression is not performed.

    Returns a pandas.Series of coefficients (indexed by 'const', if [intercept],
    followed by the columns of [factor_rtrn]) for Series input; the coefficients
    are NaN when the regression is not performed. For DataFrame input the
    function is applied to every column.
    Raises AssertionError if [asset_rtrn] is neither a pandas.Series nor a
    pandas.DataFrame, or if the indices of the inputs do not match.
    """
    if isinstance(asset_rtrn, pd.DataFrame):
        # Run the single-asset regression column by column.
        return asset_rtrn.apply(estimate_coef_by_time_series_regression, axis=0, raw=False, args=(factor_rtrn, rf, intercept, min_non_missing))

    if not isinstance(asset_rtrn, pd.Series):
        raise AssertionError(f"Data type of [asset_rtrn] is {type(asset_rtrn)}. It must be pandas.Series or pandas.DataFrame.")

    assert np.all(asset_rtrn.index == factor_rtrn.index)
    if rf is not None:
        assert np.all(asset_rtrn.index == rf.index)

    # Parse independent variables (scaled by 100)
    X = factor_rtrn.values * 100.0
    if intercept:
        X = sm.add_constant(X)

    # Parse dependent variables (scaled by 100)
    if rf is None:
        Y = asset_rtrn * 100.0
    else:
        Y = (asset_rtrn - rf) * 100.0

    # Positions of the periods with a non-missing dependent variable
    t_valid = np.where(~Y.isna())[0]

    if len(t_valid) >= min_non_missing:
        # Estimate coefficients (i.e., factor loadings) of regression of excess asset returns on factor returns
        ols_results = sm.OLS(Y.iloc[t_valid].values, X[t_valid]).fit()
        estm_coef = pd.Series(ols_results.params)
    else:
        # Too few observations: report missing coefficients of the right length.
        estm_coef = pd.Series(np.nan, index=range(factor_rtrn.shape[1]+int(intercept)))

    # Reformulate
    estm_coef.name = asset_rtrn.name
    if intercept:
        estm_coef.index = ['const'] + factor_rtrn.columns.to_list()
        # Both sides of the regression were scaled by 100, so only the intercept
        # needs to be scaled back; the slopes are unaffected.
        estm_coef.loc['const'] /= 100
    else:
        estm_coef.index = factor_rtrn.columns

    return estm_coef


def estimate_residual_by_time_series_regression(asset_rtrn:Union[pd.Series, pd.DataFrame], factor_rtrn:pd.DataFrame, rf:pd.Series=None, intercept:bool=True, min_non_missing:int=12) -> Union[pd.Series, pd.DataFrame]:
    """
    Compute estimated residuals of regression of a time-series of excess returns on factor returns, i.e.,
        r_{i,t} - rf_t = alpha_i + beta_i' * f_t + eps_{i,t}
    for a fixed asset i. (alpha_i is estimated only if [intercept]==True.)

    asset_rtrn : pandas.Series or pandas.DataFrame
        A time-series of returns (if its data type is pandas.Series)
        A panel of asset returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally (if its data type is pandas.DataFrame)

    factor_rtrn : pandas.DataFrame
        A panel of factor returns where rows (axis=0) changes with time and columns (axis=1) change cross-sectionally.

    rf : pandas.Series
        A time-series of Risk free rates.
        If None, rf_t is assumed to be zero for all t.

    intercept : bool
        Determines whether to estimate alpha of the time-series regression.

    min_non_missing : int
        If the number of non-missing values in a firm's [asset_rtrn-rf] is less than [min_non_missing],
        then time-series regression is not performed.

    Returns a pandas.Series of residuals indexed like [asset_rtrn] for Series
    input; periods with a missing dependent variable are NaN, and all periods
    are NaN when the regression is not performed. For DataFrame input the
    function is applied to every column.
    Raises AssertionError if [asset_rtrn] is neither a pandas.Series nor a
    pandas.DataFrame, or if the indices of the inputs do not match.
    """
    if isinstance(asset_rtrn, pd.DataFrame):
        # Run the single-asset regression column by column.
        return asset_rtrn.apply(estimate_residual_by_time_series_regression, axis=0, raw=False, args=(factor_rtrn, rf, intercept, min_non_missing))

    if not isinstance(asset_rtrn, pd.Series):
        raise AssertionError(f"Data type of [asset_rtrn] is {type(asset_rtrn)}. It must be pandas.Series or pandas.DataFrame.")

    assert np.all(asset_rtrn.index == factor_rtrn.index)
    if rf is not None:
        assert np.all(asset_rtrn.index == rf.index)

    # Parse independent variables (scaled by 100)
    X = factor_rtrn.values * 100.0
    if intercept:
        X = sm.add_constant(X)

    # Parse dependent variables (scaled by 100)
    if rf is None:
        Y = asset_rtrn * 100.0
    else:
        Y = (asset_rtrn - rf) * 100.0

    # Positions of the periods with a non-missing dependent variable
    t_valid = np.where(~Y.isna())[0]

    # Start from all-missing residuals; only the fitted periods are filled in.
    resid = pd.Series(None, index=Y.index, dtype=float, name=asset_rtrn.name)
    if len(t_valid) >= min_non_missing:
        # Estimate residuals of regression of excess asset returns on factor returns
        ols_results = sm.OLS(Y.iloc[t_valid].values, X[t_valid]).fit()
        resid.iloc[t_valid] = ols_results.resid/100  # undo the scaling by 100

    return resid


def compute_explanability(r:pd.DataFrame, resid:pd.DataFrame) -> pd.Series:
    """
    Compute three fit measures of residuals relative to returns.

    r : pandas.DataFrame
        Panel of returns (rows are indexed along axis=0, columns along axis=1).
        Columns that are entirely missing are dropped first.

    resid : pandas.DataFrame
        Panel of residuals; it is restricted to the columns kept from [r].

    Returns a pandas.Series with entries
        'EV'      : 1 - mean over rows of (row sum of resid^2 / #non-missing r in the row)
                        / the same quantity computed from r
        'XS-R2'   : 1 - mean over columns of (column mean of resid)^2
                        / the same quantity computed from r
        'W-XS-R2' : as 'XS-R2', with every squared column mean multiplied by
                    the number of non-missing observations of r in that column.
    """
    returns = r.dropna(how='all', axis=1)
    residuals = resid.loc[:, returns.columns].copy()

    # Number of non-missing returns per row and per column.
    n_per_row = returns.count(axis=1)
    n_per_col = returns.count(axis=0)

    # 1. EV
    ev_numer = (residuals.pow(2).sum(axis=1) / n_per_row).mean()
    ev_denom = (returns.pow(2).sum(axis=1) / n_per_row).mean()

    # Squared column averages; the sums skip missing values while the
    # divisor is always the count of non-missing returns.
    sq_mean_resid = (residuals.sum(axis=0) / n_per_col) ** 2
    sq_mean_rtrn = (returns.sum(axis=0) / n_per_col) ** 2

    # 2. XS-R2
    xs_numer = sq_mean_resid.mean()
    xs_denom = sq_mean_rtrn.mean()

    # 3. XS-R2 weighted
    w_numer = (sq_mean_resid * n_per_col).mean()
    w_denom = (sq_mean_rtrn * n_per_col).mean()

    return pd.Series({
        'EV': 1 - ev_numer / ev_denom,
        'XS-R2': 1 - xs_numer / xs_denom,
        'W-XS-R2': 1 - w_numer / w_denom,
    })


def get_cov_matrix(name, N, Sigma_F:np.ndarray, Lambda:np.ndarray, Sigma_U:np.ndarray, sample_Sigma_U:np.ndarray):
    """
    Build the (N, N) matrix selected by [name] and return its symmetrized version.

    name : str
        'identity'                            : identity matrix of size N
        'population_cov_of_xs_alpha'          : Lambda @ Sigma_F @ Lambda.T + Sigma_U
        'population_cov_of_E'                 : Sigma_U
        'sample_cov_of_E'                     : sample_Sigma_U
        'population_diag_of_cov_of_xs_alpha',
        'population_diag_of_cov_of_E',
        'sample_diag_of_cov_of_E'             : the corresponding matrix above with
                                                all off-diagonal entries set to zero

    N : int
        Expected number of rows and columns of the result.

    Sigma_F, Lambda, Sigma_U, sample_Sigma_U : numpy.ndarray
        Building blocks of the matrices listed above. Only those needed for
        the selected [name] are accessed.

    Returns (covmat.T + covmat) / 2 as a new array.
    Raises ValueError if [name] is not one of the options above, and
    AssertionError if the selected matrix does not have shape (N, N).
    """
    # Each diagonal variant is derived from the full matrix it refers to.
    full_name_of_diag = {
        'population_diag_of_cov_of_xs_alpha': 'population_cov_of_xs_alpha',
        'population_diag_of_cov_of_E': 'population_cov_of_E',
        'sample_diag_of_cov_of_E': 'sample_cov_of_E',
    }

    if name == "identity":
        covmat = np.identity(N)

    # Full matrices
    elif name == "population_cov_of_xs_alpha":
        covmat = Lambda @ Sigma_F @ Lambda.T + Sigma_U
    elif name == 'population_cov_of_E':
        covmat = Sigma_U
    elif name == 'sample_cov_of_E':
        covmat = sample_Sigma_U

    # Diagonal matrices
    elif name in full_name_of_diag:
        tmp = get_cov_matrix(full_name_of_diag[name], N, Sigma_F, Lambda, Sigma_U, sample_Sigma_U)
        covmat = np.diag(np.diag(tmp))

    else:
        raise ValueError(f"Unknown covariance matrix name [{name}].")

    # Assert
    np.testing.assert_array_equal((N,N), covmat.shape, name)

    return (covmat.T + covmat) / 2 # guarantees symmetry


def compute_normalzation_factor_of_covmat(covmat:np.ndarray) -> float:
    """
    Return the reciprocal of the mean diagonal entry of [covmat].

    Multiplying [covmat] by this factor makes its trace equal to len(covmat),
    i.e. the trace of the identity matrix of the same size.
    """
    mean_of_diagonal = np.diag(covmat).mean()
    return 1 / mean_of_diagonal


def get_V_matrices(name, N, Sigma_F:np.ndarray, Lambda:np.ndarray, Sigma_U:np.ndarray, sample_Sigma_U:np.ndarray, K_min:float, do_sanity_check:bool=False, normalization:int=0):
    """
    Return the symmetrized, normalized inverse of the matrix selected by [name].

    name, N, Sigma_F, Lambda, Sigma_U, sample_Sigma_U
        Passed on to get_cov_matrix, which builds the (N, N) matrix to invert.

    K_min : float
        Only used when [name] is 'sample_cov_of_E' or 'sample_diag_of_cov_of_E':
        K_min times the smallest non-zero eigenvalue of the matrix is added to
        its diagonal before inversion.

    do_sanity_check : bool
        If True, assert that the resulting matrix V has rank N.

    normalization : int
        0 : normalize the covariance matrix first, then invert it.
        1 : invert the covariance matrix first, then normalize the inverse.

    Returns (V.T + V) / 2.
    Raises ValueError if [normalization] is neither 0 nor 1, and
    AssertionError if a shape or rank check fails.
    """
    if normalization not in (0, 1):
        # Without this check V would never be assigned below.
        raise ValueError(f"[normalization] is {normalization!r}. It must be 0 or 1.")

    covmat = get_cov_matrix(name, N, Sigma_F, Lambda, Sigma_U, sample_Sigma_U)
    np.testing.assert_array_equal((N,N), covmat.shape, name)

    if name in ['sample_cov_of_E', 'sample_diag_of_cov_of_E']:
        # This way, the nonzero smallest eigenvalue of [covmat] has now
        # the smallest eigenvalue of K_min*nonzero_eval_min, which is not 0, anymore.
        rank = np.linalg.matrix_rank(covmat)
        eigvals = np.linalg.eigvalsh(covmat)
        eigvals = np.sort(eigvals)[::-1] # The largest one comes first.
        nonzero_eval_min = eigvals[rank-1] # Choose the "rank"-th largest eigenvalue, i.e., the smallest non-zero eigenvalue(=[nonzero_eval_min]) of covmat.
        # Build a new array rather than modifying the one returned by get_cov_matrix in place.
        covmat = covmat + K_min * nonzero_eval_min * np.identity(N) # The smallest eigenvalue of this new [covmat] becomes [K_min*nonzero_eval_min].

    if normalization == 0:
        covmat = covmat * compute_normalzation_factor_of_covmat(covmat)
        V = np.linalg.inv(covmat)

    else:
        covmat_inv = np.linalg.inv(covmat)
        V = covmat_inv * compute_normalzation_factor_of_covmat(covmat_inv)

    if do_sanity_check:
        rank_of_V = np.linalg.matrix_rank(V)
        np.testing.assert_equal(rank_of_V, N, f"rank(V)(={rank_of_V}) must be {N} for the matrix P of regulpca to be nonsingular.")

    return (V.T + V) / 2 # guarantees symmetry

