from math import exp, pi, sqrt

import numpy as np
from scipy.special import polygamma
from scipy.stats import beta, ks_2samp, norm, ttest_ind, wasserstein_distance

def kolmogorov_smirnov_cdf(x):
    """Return the CDF of the Kolmogorov distribution evaluated at ``x``.

    Two equivalent series are used so that the sum always converges quickly:

    * for ``x >= 1`` the alternating series
      ``1 - 2 * sum_{k>=1} (-1)**(k-1) * exp(-2 k^2 x^2)``;
    * for ``0 < x < 1`` the theta-function form
      ``sqrt(2 pi) / x * sum_{k>=1} exp(-(2k-1)^2 pi^2 / (8 x^2))``,
      because the alternating series needs very many terms near zero and
      a truncated sum can leave the interval [0, 1].

    Args:
        x: Point at which to evaluate the CDF (a real scalar).

    Returns:
        The CDF value in [0, 1]. It is 0 for every ``x <= 0``.
    """
    if x <= 0:
        # The distribution lives on x > 0, so the CDF is 0 here (not 1).
        return 0.0
    if x < 0.04:
        # exp(-pi^2 / (8 x^2)) underflows double precision below this point.
        return 0.0

    if x < 1:
        scale = pi * pi / (8 * x * x)
        total = 0.0
        for k in range(1, 100):
            term = exp(-((2 * k - 1) ** 2) * scale)
            total += term
            if term <= 1e-17 * total:
                break
        return sqrt(2 * pi) / x * total

    # ``x * x`` overflows to inf for huge inputs, whereas ``x ** 2`` would
    # raise OverflowError for a Python float.
    x_squared = x * x
    total = 0.0
    for k in range(1, 100):
        term = exp(-2 * (k ** 2) * x_squared)
        total += (-1) ** (k - 1) * term
        if term < 1e-17:
            break
    return 1 - 2 * total
        
def ks_2samp_with_pvalue(sample1, sample2):
    """Two-sample Kolmogorov-Smirnov test with an asymptotic p-value.

    The statistic is the largest absolute difference between the two
    empirical CDFs, evaluated at every observed value. The p-value uses the
    limiting Kolmogorov distribution at
    ``lambda = sqrt(n1 * n2 / (n1 + n2)) * D``.

    Args:
        sample1: First sample (1-D array-like of numbers).
        sample2: Second sample (1-D array-like of numbers).

    Returns:
        Tuple ``(ks_statistic, p_value)``.

    Raises:
        ValueError: If either sample is empty.
    """
    sample1_sorted = np.sort(sample1)
    sample2_sorted = np.sort(sample2)
    n1 = len(sample1_sorted)
    n2 = len(sample2_sorted)
    if n1 == 0 or n2 == 0:
        raise ValueError("Both samples must contain at least one observation")

    # The maximum does not depend on evaluation order, so the pooled values
    # do not need to be sorted again.
    pooled = np.concatenate((sample1_sorted, sample2_sorted))

    # ECDF of each sample at every pooled value (right-continuous).
    ecdf1 = np.searchsorted(sample1_sorted, pooled, side="right") / n1
    ecdf2 = np.searchsorted(sample2_sorted, pooled, side="right") / n2

    ks_statistic = np.max(np.abs(ecdf1 - ecdf2))

    if ks_statistic == 0:
        # Identical ECDFs: the Kolmogorov CDF is 0 at lambda = 0, so p = 1.
        return ks_statistic, 1.0

    # n1 * n2 / (n1 + n2) is the effective sample size; its square root
    # (taken once) scales the statistic.
    lambda_value = sqrt(n1 * n2 / (n1 + n2)) * ks_statistic

    p_value = 1 - kolmogorov_smirnov_cdf(lambda_value)
    # Guard against series round-off pushing the value outside [0, 1].
    p_value = float(np.clip(p_value, 0.0, 1.0))

    return ks_statistic, p_value

def ks_test_against_beta(sample, alpha=0.05):
    """One-sample KS test of ``sample`` against a Beta distribution fitted to it.

    The Beta shape parameters are estimated from ``sample`` with location 0
    and scale 1 held fixed, so the values are expected to lie strictly
    inside (0, 1). The p-value comes from the asymptotic Kolmogorov
    distribution at ``sqrt(n) * D``.

    Note: because the parameters are estimated from the same data, the
    Kolmogorov distribution is only an approximation to the true null
    distribution, and the p-value tends to be too large.

    Args:
        sample: 1-D array-like of observations.
        alpha: Significance level used for the textual verdict.

    Returns:
        Tuple ``(ks_statistic, p_value, result)`` where ``result`` is a
        human-readable verdict string.
    """
    # Step 1: fit the Beta distribution with support fixed to [0, 1].
    a, b, loc, scale = beta.fit(sample, floc=0, fscale=1)

    # Step 2: fitted CDF at the order statistics.
    sample_sorted = np.sort(sample)
    n = len(sample_sorted)
    beta_cdf = beta.cdf(sample_sorted, a, b, loc=loc, scale=scale)

    # Step 3: the ECDF jumps from (i - 1) / n to i / n at the i-th order
    # statistic, so the supremum distance must be checked on both sides of
    # every jump.
    ranks = np.arange(1, n + 1)
    d_plus = np.max(ranks / n - beta_cdf)
    d_minus = np.max(beta_cdf - (ranks - 1) / n)
    ks_statistic = max(d_plus, d_minus)

    # Step 4: asymptotic p-value, clipped against series round-off.
    lambda_value = sqrt(n) * ks_statistic
    p_value = 1 - kolmogorov_smirnov_cdf(lambda_value)
    p_value = float(np.clip(p_value, 0.0, 1.0))

    # Step 5: compare the p-value with alpha.
    if p_value < alpha:
        result = "Reject the null hypothesis (sample differs from Beta distribution)"
    else:
        result = "Fail to reject the null hypothesis (sample follows Beta distribution)"

    return ks_statistic, p_value, result

def bootstrap_beta_params(sample, n_bootstrap=1000):
    """Bootstrap the Beta shape parameters of ``sample``.

    Each replicate resamples ``sample`` with replacement (same size, drawn
    through ``np.random.choice``) and refits a Beta distribution with
    location 0 and scale 1 held fixed.

    Args:
        sample: 1-D array-like of observations.
        n_bootstrap: Number of bootstrap replicates.

    Returns:
        Tuple ``(mean_a, std_a, mean_b, std_b)`` over the replicates.
    """
    size = len(sample)
    shape_pairs = [
        beta.fit(
            np.random.choice(sample, size=size, replace=True),
            floc=0,
            fscale=1,
        )[:2]  # keep only (a, b); loc and scale are fixed
        for _ in range(n_bootstrap)
    ]
    a_draws = [pair[0] for pair in shape_pairs]
    b_draws = [pair[1] for pair in shape_pairs]

    return np.mean(a_draws), np.std(a_draws), np.mean(b_draws), np.std(b_draws)

def _beta_mle_std_errors(a, b, n):
    """Asymptotic standard errors of Beta MLEs from the Fisher information."""
    tri_a, tri_b, tri_ab = polygamma(1, [a, b, a + b])
    # Per-observation information matrix is
    # [[tri_a - tri_ab, -tri_ab], [-tri_ab, tri_b - tri_ab]].
    det = (tri_a - tri_ab) * (tri_b - tri_ab) - tri_ab ** 2
    var_a = (tri_b - tri_ab) / (n * det)
    var_b = (tri_a - tri_ab) / (n * det)
    return float(np.sqrt(var_a)), float(np.sqrt(var_b))


def _two_sided_z_pvalue(est1, se1, est2, se2):
    """Two-sided normal p-value for equality of two independent estimates."""
    diff = est1 - est2
    if diff == 0:
        return 1.0
    pooled_se = sqrt(se1 ** 2 + se2 ** 2)
    if pooled_se == 0:
        # A non-zero difference with no sampling variability at all.
        return 0.0
    return float(2 * norm.sf(abs(diff) / pooled_se))


def compare_beta_params(sample1, sample2, flag_bootstrap=False):
    """Compare the fitted Beta shape parameters of two samples.

    Each parameter (``a`` and ``b``) is compared with a two-sided z-test.
    A test on the two point estimates alone has no degrees of freedom and
    always yields NaN, so standard errors are required:

    * with ``flag_bootstrap=True`` the bootstrap mean and standard deviation
      of each parameter serve as estimate and standard error;
    * otherwise the maximum-likelihood estimates are used with asymptotic
      standard errors from the Fisher information.

    Without bootstrapping: Sensitive to MLE bias, doesn't quantify
    distributional difference.

    Args:
        sample1: First sample, values inside (0, 1).
        sample2: Second sample, values inside (0, 1).
        flag_bootstrap: Use bootstrap estimates instead of a single MLE fit.

    Returns:
        Tuple ``((p_value_a, p_value_b), result)``. The null hypothesis is
        rejected only when both p-values are below 0.05. The verdict strings
        are kept unchanged for backward compatibility.
    """
    if flag_bootstrap:
        a1, a1_se, b1, b1_se = bootstrap_beta_params(sample1)
        a2, a2_se, b2, b2_se = bootstrap_beta_params(sample2)
    else:
        a1, b1, _, _ = beta.fit(sample1, floc=0, fscale=1)
        a2, b2, _, _ = beta.fit(sample2, floc=0, fscale=1)
        a1_se, b1_se = _beta_mle_std_errors(a1, b1, len(sample1))
        a2_se, b2_se = _beta_mle_std_errors(a2, b2, len(sample2))

    p_value_a = _two_sided_z_pvalue(a1, a1_se, a2, a2_se)
    p_value_b = _two_sided_z_pvalue(b1, b1_se, b2, b2_se)

    if (p_value_a < 0.05) and (p_value_b < 0.05):
        result = "Reject the null hypothesis (sample differs from Beta distribution)"
    else:
        result = "Fail to reject the null hypothesis (sample follows Beta distribution)"
    return (p_value_a, p_value_b), result

def beta_wasserstein_distance(sample1, sample2):
    """Distance between the Beta distributions fitted to two samples.

    Both samples are fitted with location 0 and scale 1 held fixed, and the
    fitted CDFs are evaluated on a 1000-point grid over [0, 1]. The two CDF
    arrays are then passed to ``wasserstein_distance``. Because they are
    equal-length and non-decreasing, the result is the mean absolute CDF
    difference over the grid, which approximates the integral of
    ``|F1 - F2|`` on [0, 1].

    Args:
        sample1: First sample, values inside (0, 1).
        sample2: Second sample, values inside (0, 1).

    Returns:
        The distance as a non-negative float.
    """
    grid = np.linspace(0, 1, 1000)

    fitted_cdfs = []
    for data in (sample1, sample2):
        shape_a, shape_b, _loc, _scale = beta.fit(data, floc=0, fscale=1)
        fitted_cdfs.append(beta.cdf(grid, shape_a, shape_b))

    first_cdf, second_cdf = fitted_cdfs
    return wasserstein_distance(first_cdf, second_cdf)

def beta_ks_test(sample1, sample2):
    """KS comparison of the Beta distributions fitted to two samples.

    Each sample is fitted with location 0 and scale 1 held fixed. Both
    fitted CDFs are evaluated at the pooled observations, and the two arrays
    of CDF values are compared with SciPy's two-sample KS test.

    Caveat: poor tail sensitivity, loses shape information; the Wasserstein
    distance is an alternative.

    Args:
        sample1: First sample, values inside (0, 1).
        sample2: Second sample, values inside (0, 1).

    Returns:
        Tuple ``(ks_statistic, p_value)``.
    """
    # Fit Beta distributions
    a1, b1, _, _ = beta.fit(sample1, floc=0, fscale=1)
    a2, b2, _, _ = beta.fit(sample2, floc=0, fscale=1)

    # Evaluate both fitted CDFs at the pooled, sorted observations
    sample_sorted = np.sort(np.concatenate((sample1, sample2)))
    cdf1 = beta.cdf(sample_sorted, a1, b1)
    cdf2 = beta.cdf(sample_sorted, a2, b2)

    # scipy.stats.ks_2samp: the name was previously used without being
    # imported, which raised NameError at call time.
    ks_statistic, p_value = ks_2samp(cdf1, cdf2)

    return ks_statistic, p_value

def compute_log_likelihood(sample, a, b):
    """Return the total Beta(a, b) log-likelihood of ``sample``.

    Args:
        sample: Array-like of observations.
        a: First Beta shape parameter.
        b: Second Beta shape parameter.

    Returns:
        Sum of the pointwise log-densities.
    """
    pointwise_log_density = beta.logpdf(sample, a, b)
    return np.sum(pointwise_log_density)


def likelihood_ratio_test(sample1, sample2):
    """Likelihood-ratio statistic for "one shared Beta" vs "two separate Betas".

    The null model fits a single Beta distribution to the pooled data. The
    alternative fits one Beta per sample. All fits hold location 0 and
    scale 1 fixed.

    Caveat: requires large samples and has p-value interpretation issues;
    the Bayes factor is an alternative.

    Args:
        sample1: First sample, values inside (0, 1).
        sample2: Second sample, values inside (0, 1).

    Returns:
        The statistic ``-2 * (loglik_pooled - loglik_separate)``. No p-value
        is computed here.
    """
    pooled = np.concatenate((sample1, sample2))

    def fitted_log_likelihood(data):
        # Maximised Beta log-likelihood of ``data`` under its own fit.
        shape_a, shape_b, _loc, _scale = beta.fit(data, floc=0, fscale=1)
        return compute_log_likelihood(data, shape_a, shape_b)

    null_log_likelihood = fitted_log_likelihood(pooled)
    alt_log_likelihood = (
        fitted_log_likelihood(sample1) + fitted_log_likelihood(sample2)
    )

    return -2 * (null_log_likelihood - alt_log_likelihood)


def bayes_factor(sample1, sample2):
    """Approximate Bayes factor for "separate Betas" over "one shared Beta".

    Both models are fitted by maximum likelihood (location 0 and scale 1
    fixed) and compared through the BIC approximation
    ``BF = exp((BIC_shared - BIC_separate) / 2)``. Values above 10 are
    labelled "Different", everything else "Similar".

    Args:
        sample1: First sample, values inside (0, 1).
        sample2: Second sample, values inside (0, 1).

    Returns:
        Tuple ``(bf, interpretation)``. ``bf`` can be ``inf`` when the
        exponent overflows for very strongly separated samples.
    """
    combined_sample = np.concatenate((sample1, sample2))

    # Fit Beta to combined and separate samples
    a_comb, b_comb, _, _ = beta.fit(combined_sample, floc=0, fscale=1)
    a1, b1, _, _ = beta.fit(sample1, floc=0, fscale=1)
    a2, b2, _, _ = beta.fit(sample2, floc=0, fscale=1)

    # Maximised log-likelihoods of the shared and the separate model
    log_likelihood_comb = compute_log_likelihood(combined_sample, a_comb, b_comb)
    log_likelihood_sep = (
        compute_log_likelihood(sample1, a1, b1)
        + compute_log_likelihood(sample2, a2, b2)
    )

    # Both models are scored on the same observations, so one count suffices.
    n_obs = len(combined_sample)
    k_comb, k_sep = 2, 4  # free parameters: shared model vs separate models
    bf_threshold = 10  # above this the separate model is labelled "Different"

    bic_comb = k_comb * np.log(n_obs) - 2 * log_likelihood_comb
    bic_sep = k_sep * np.log(n_obs) - 2 * log_likelihood_sep

    bf = np.exp((bic_comb - bic_sep) / 2)  # Bayes Factor approximation

    interpretation = "Similar" if bf <= bf_threshold else "Different"

    return bf, interpretation

def bootstrap_bayes_factor(sample1, sample2, n_bootstrap=1000):
    """Bootstrap the approximate Bayes factor between two samples.

    Each replicate resamples both inputs with replacement (same sizes, drawn
    through ``np.random.choice``) and recomputes ``bayes_factor`` on the pair.

    Args:
        sample1: First sample.
        sample2: Second sample.
        n_bootstrap: Number of bootstrap replicates.

    Returns:
        Tuple ``(bf_mean, bf_ci)`` where ``bf_ci`` holds the 2.5th and 97.5th
        percentiles of the replicated Bayes factors.
    """
    size1, size2 = len(sample1), len(sample2)
    replicated_bf = []

    for _ in range(n_bootstrap):
        boot1 = np.random.choice(sample1, size=size1, replace=True)
        boot2 = np.random.choice(sample2, size=size2, replace=True)
        # Only the numeric Bayes factor is kept; the label is discarded.
        replicated_bf.append(bayes_factor(boot1, boot2)[0])

    percentile_interval = np.percentile(replicated_bf, [2.5, 97.5])
    return np.mean(replicated_bf), percentile_interval



