import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVR
from sklearn.linear_model import Ridge
import statsmodels.api as sm
from numpy.linalg import svd
from numpy import eye





def tightSVD(Mat, tol=1e-6):
    """
    Compute a rank-truncated SVD of 'Mat'.

    Singular values that do not exceed 'tol' are treated as zero and dropped.

    Parameters:
    Mat (ndarray): Input matrix for Singular Value Decomposition (SVD).
    tol (float): Singular values must be strictly greater than this to be kept.
        Default is 1e-6.

    Returns:
    lb (ndarray): (r, r) diagonal matrix holding the retained singular values.
    move (int): Number r of retained singular values.
    P (ndarray): The first r left singular vectors, as columns.
    Q (ndarray): The first r right singular vectors, as columns.
    Q_perp (ndarray or None): The remaining right singular vectors, as columns,
        or None when r equals the number of columns of Mat.
    """
    from numpy.linalg import svd

    # Full (not reduced) SVD: the trailing right singular vectors are needed
    # for Q_perp.
    P_r, Lamb, Q_Tr = svd(Mat)

    # Count the singular values above the tolerance.
    move = int(np.sum(Lamb > tol))

    # Keep only the leading components.
    P = P_r[:, :move]
    Q = Q_Tr.T[:, :move]

    Q_perp = Q_Tr.T[:, move:] if move < Mat.shape[1] else None

    # np.diag puts the values on the diagonal. Assigning through
    # ``eye(move).flat[:move]`` would overwrite the first row instead and
    # leave ones on the rest of the diagonal.
    lb = np.diag(Lamb[:move])

    return lb, move, P, Q, Q_perp

def CalThHatstar1(X_train, y_train, Xnew, rho, an):
    """
    Build the matrix ThHatstar used by the DeCRR interval computation.

    With (P, L, Q) the truncated SVD of Xnew from tightSVD, the result is
    Xnew @ M, where
        M = Q (L^2 + 2*rho*I) (L^2 + rho*I)^-2 (P L)^T
    and the rows of M belonging to coefficients not selected by calset are
    set to zero.

    Parameters:
    - X_train: Training data, passed to calset
    - y_train: Training labels, passed to calset
    - Xnew: Matrix that is decomposed and that left-multiplies the result
    - rho: Regularization parameter
    - an: Threshold forwarded to calset to select coefficients

    Returns:
    - ThHatstar: The computed ThHatstar matrix
    """
    lb, r, P, Q, _ = tightSVD(Xnew)

    identity = np.eye(r)
    gram = lb @ lb
    LambAdj1 = np.linalg.inv(gram + rho * identity)
    plamb = P @ lb

    # Boolean mask of coefficients to keep; the others are zeroed below.
    condition = calset(X_train, y_train, rho, an)

    LambAdj3 = gram + 2 * rho * identity
    LambAdj4 = Q @ LambAdj3
    LambAdj5 = LambAdj1 @ LambAdj1
    LambAdj6 = LambAdj4 @ LambAdj5 @ plamb.T

    LambAdj6[~condition, :] = 0

    ThHatstar = Xnew @ LambAdj6
    return ThHatstar

def CalThHat1(X_train, y_train, Xnew, rho, an):
    """
    Build the matrix ThHat used by the CRR interval computation.

    With (P, L, Q) the truncated SVD of Xnew from tightSVD, the result is
    Xnew @ M, where M = Q (L^2 + rho*I)^-1 (P L)^T and the rows of M belonging
    to coefficients not selected by calset are set to zero.

    Parameters:
    - X_train: Training features, passed to calset.
    - y_train: Training labels, passed to calset.
    - Xnew: Matrix that is decomposed and that left-multiplies the result.
    - rho: Regularization parameter.
    - an: Threshold forwarded to calset to select coefficients.

    Returns:
    - ThHat: The product Xnew @ M described above.
    """
    sing_mat, rank, left_vecs, right_vecs, _ = tightSVD(Xnew)

    ridge_inverse = np.linalg.inv(sing_mat @ sing_mat + rho * np.eye(rank))
    scaled_left = left_vecs @ sing_mat
    coef_map = right_vecs @ ridge_inverse @ scaled_left.T

    # Zero the rows of coefficients that calset does not select.
    keep = calset(X_train, y_train, rho, an)
    coef_map[~keep, :] = 0

    return Xnew @ coef_map


def calloQuantile(quans, alpha):
    """
    Return the lower empirical alpha-quantile of 'quans'.

    The values are sorted in ascending order and the element at zero-based
    index floor(alpha * n) - 1 is returned, with the index clamped at 0.

    Parameters:
    quans (ndarray): Input array.
    alpha (float): Quantile level.

    Returns:
    The selected element, or 0.0 when 'quans' is empty.
    """
    import math
    n = quans.shape[0]
    if n == 0:
        return 0.0
    rets = np.sort(quans)  # Small to big
    # floor rounds down; subtract 1 because indices start at 0.
    size = math.floor(alpha * n) - 1
    # When alpha * n < 1 the index is -1, which Python would wrap around to
    # the LARGEST value. Clamp to the smallest value instead.
    size = max(0, size)
    return rets[size]
def calupQuantile(quans, alpha):
    """
    Return the upper empirical alpha-quantile of 'quans'.

    The values are sorted in ascending order and the element at zero-based
    index ceil(alpha * n) - 1 is returned, with the index clamped at 0.

    Parameters:
    quans (ndarray): Input array.
    alpha (float): Quantile level.

    Returns:
    The selected element, or 0.0 when 'quans' is empty.
    """
    import math
    n = quans.shape[0]
    if n == 0:
        return 0.0
    rets = np.sort(quans)  # Small to big
    # ceil rounds up; subtract 1 because indices start at 0.
    size = math.ceil(alpha * n) - 1
    # alpha == 0 gives index -1, which Python would wrap around to the largest
    # value. Clamp to the smallest value, as computeQuantile does.
    size = max(0, size)
    return rets[size]



def cross_validation(X, y, rhos, bs, kfold):
    """
    Select rho and b by K-fold cross-validation.

    For every (rho, b) pair the debiased ridge coefficients are fitted on the
    training folds, truncated at threshold b, and scored by the Euclidean norm
    of the residuals on the held-out fold. Scores are summed over folds and the
    pair with the smallest total is returned.

    Parameters:
    X (ndarray): Feature matrix, indexed by row.
    y (ndarray): Target vector.
    rhos (sequence): Candidate regularization parameters.
    bs (sequence): Candidate truncation thresholds.
    kfold (int): Number of folds.

    Returns:
    tuple: (best rho, best b).
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=kfold)
    theDeRidRes = np.zeros((len(rhos), len(bs)))

    for train_idx, test_idx in splitter.split(X):
        X_train_cv, X_test_cv = X[train_idx], X[test_idx]
        y_train_cv, y_test_cv = y[train_idx], y[test_idx]

        for i, rho in enumerate(rhos):
            # The fit depends only on rho, so do it once per rho rather than
            # once per (rho, b) pair. truncate() returns a new array, so theta
            # is safe to reuse.
            theta = DeRIdge(X_train_cv, y_train_cv, rho)

            for j, b in enumerate(bs):
                theta_truncated, _ = truncate(b, theta)

                # Make the coefficients a column vector for the product below.
                if theta_truncated.ndim == 1:
                    theta_truncated = theta_truncated.reshape(-1, 1)

                predicted = np.matmul(X_test_cv, theta_truncated)
                residuals = y_test_cv - predicted.flatten()

                theDeRidRes[i, j] += np.linalg.norm(residuals)

    # Smallest accumulated residual norm wins.
    opt_idx = np.unravel_index(np.argmin(theDeRidRes), theDeRidRes.shape)
    return rhos[opt_idx[0]], bs[opt_idx[1]]



def calSD(X, y, coef):
    """
    Root-mean-square residual of the linear fit X @ coef.

    Parameters:
    X (ndarray): Feature matrix.
    y (ndarray): Target vector.
    coef (ndarray): Coefficient vector.

    Returns:
    float: sqrt(sum of squared residuals / len(y)).
    """
    residual = y - np.matmul(X, coef)
    mean_square = np.dot(residual, residual) / len(y)
    return np.sqrt(mean_square)

def calset(X, y, rho, an):
    """
    Flag the ridge coefficients whose magnitude exceeds a threshold.

    Parameters:
    X (ndarray): Input feature matrix.
    y (ndarray): Target vector.
    rho (float): Regularization parameter for Ridge regression.
    an (float): Threshold; a coefficient is selected when |coef| > an.

    Returns:
    ndarray: Boolean array, True where the ridge coefficient is selected.
    """
    # Only the ridge fit is needed here; no SVD of X is used for the mask.
    thetahat = RIdge(X, y, rho)
    return np.abs(thetahat) > an


def computeQuantile(quans, alpha, lower=True):
    """
    Pick an order statistic of 'quans' as a lower or upper empirical quantile.

    Parameters:
    quans (ndarray): Input array.
    alpha (float): Quantile level.
    lower (bool): If True use index int(alpha * n - 1); if False use
        index ceil(alpha * n) - 1. Negative indices are clamped to 0.

    Returns:
    float: The selected element of the sorted array, or 0.0 for empty input.

    Example:
        lower_quantile = computeQuantile(quans, alpha, lower=True)
        upper_quantile = computeQuantile(quans, alpha, lower=False)
    """
    if quans.size == 0:
        return 0.0

    ordered = np.sort(quans)
    scaled = alpha * len(quans)
    if lower:
        position = int(scaled - 1)
    else:
        position = int(np.ceil(scaled) - 1)

    # Never index from the end of the array.
    if position < 0:
        position = 0
    return ordered[position]

def RIdge(X, y, rho):
    """
    Fit scikit-learn's Ridge without an intercept and return its coefficients.

    Parameters:
    X (ndarray): Feature matrix.
    y (ndarray): Target vector.
    rho (float): Regularization parameter, passed as Ridge's ``alpha``.

    Returns:
    ndarray: Flattened copy of the fitted coefficients.
    """
    from sklearn.linear_model import Ridge

    fitted = Ridge(alpha=rho, fit_intercept=False).fit(X, y)
    return fitted.coef_.flatten()

def DeRIdge(X, y, rho):
    """
    Ridge coefficients plus a correction term built from the SVD of X.

    With (L, Q) the singular values and right singular vectors from tightSVD,
    the returned vector is
        theta + rho * Q (L^2 + rho*I)^-1 Q^T theta
    where theta is the ridge estimate from RIdge.

    Parameters:
    X (ndarray): Feature matrix.
    y (ndarray): Target vector.
    rho (float): Regularization parameter.

    Returns:
    ndarray: Debiased Ridge regression coefficients.
    """
    sing_mat, rank, _, right_vecs, _ = tightSVD(X)
    ridge_coef = RIdge(X, y, rho)

    shrink_inverse = np.linalg.inv(sing_mat @ sing_mat + rho * np.eye(rank))
    # Evaluated left to right, exactly as a single chained expression would be.
    correction = rho * right_vecs @ shrink_inverse @ right_vecs.T @ ridge_coef
    return ridge_coef + correction

    
def truncate(thre, a):
    """
    Zero out the entries of 'a' whose magnitude does not exceed 'thre'.

    Parameters:
    thre (float): Threshold; entries with |a| > thre are kept.
    a (ndarray): Input array (not modified).

    Returns:
    tuple: (new array with small entries replaced by 0, boolean mask of the
        entries that were kept).
    """
    keep = np.abs(a) > thre
    return np.where(keep, a, 0), keep


# Objective function for quantile regression
def quantile_loss(beta, X, y, tau, alpha):
    """
    Pinball loss at level 'tau' plus a squared-coefficient penalty.

    Parameters:
    beta (ndarray): Coefficient vector; beta[0] is excluded from the penalty.
    X (ndarray): Design matrix.
    y (ndarray): Target vector.
    tau (float): Quantile level of the pinball loss.
    alpha (float): Weight of the penalty sum(beta[1:] ** 2).

    Returns:
    float: Total loss.
    """
    residuals = y - np.dot(X, beta)
    # Non-negative residuals are weighted by tau, negative ones by (tau - 1).
    pinball = np.where(residuals >= 0, tau * residuals, (tau - 1) * residuals)
    penalty = alpha * np.sum(beta[1:] ** 2)
    return np.sum(pinball) + penalty

# Fitting routine for the penalized quantile regression
def fit_quantile_regression(X, y, tau, alpha):
    """
    Minimize quantile_loss over the coefficients with scipy's BFGS.

    Parameters:
    X (ndarray): Design matrix; its column count sets the coefficient length.
    y (ndarray): Target vector.
    tau (float): Quantile level forwarded to quantile_loss.
    alpha (float): Penalty weight forwarded to quantile_loss.

    Returns:
    ndarray: Coefficients found by the optimizer, started from all zeros.
    """
    from scipy.optimize import minimize

    start = np.zeros(X.shape[1])
    solution = minimize(quantile_loss, start, args=(X, y, tau, alpha), method='BFGS')
    return solution.x




def fit_and_predict_qr(X_train, y_train, X_test, alpha):
    """
    Split-conformal intervals built from two penalized quantile regressions.

    The training data are split in half at random. One half fits quantile
    regressions at levels 0.05 and 0.95; the other half calibrates a margin
    that widens the band between the two fitted quantiles.

    Parameters:
    - X_train: Training features.
    - y_train: Training targets.
    - X_test: Test features.
    - alpha: Forwarded to fit_quantile_regression as the penalty weight.
      NOTE(review): the quantile levels (0.05 / 0.95) and the calibration
      level (0.9) are hard-coded and do not depend on this argument.

    Returns:
    - lower_bounds5, upper_bounds5: Lower and upper bounds for X_test.
    """
    X_train1, X_cali, y_train1, y_cali = train_test_split(X_train, y_train, test_size=0.5)

    # Add a constant column to every design matrix
    X_train1 = sm.add_constant(X_train1, has_constant='add')
    X_cali = sm.add_constant(X_cali, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    # Report the shapes so mismatches are visible
    print("X_train1 shape:", X_train1.shape)
    print("X_cali shape:", X_cali.shape)
    print("X_test shape:", X_test.shape)

    beta1 = fit_quantile_regression(X_train1, y_train1, 0.05, alpha)
    beta2 = fit_quantile_regression(X_train1, y_train1, 0.95, alpha)

    # The coefficient length must match the number of columns
    print("beta1 shape:", beta1.shape)
    print("beta2 shape:", beta2.shape)

    y_cali_pred_qr1 = np.dot(X_cali, beta1)
    y_test_pred_qr1 = np.dot(X_test, beta1)
    y_cali_pred_qr2 = np.dot(X_cali, beta2)
    y_test_pred_qr2 = np.dot(X_test, beta2)

    # Conformity score: how far y falls outside [lower, upper] (negative when
    # inside). The upper part is y - upper; using upper - y would score
    # points BELOW the upper quantile as violations.
    E1 = y_cali_pred_qr1 - y_cali
    E2 = y_cali - y_cali_pred_qr2
    E = np.max([E1, E2], axis=0)

    Q_qr = np.quantile(E, 0.9)

    lower_bounds5 = y_test_pred_qr1 - Q_qr
    upper_bounds5 = y_test_pred_qr2 + Q_qr

    return lower_bounds5, upper_bounds5

def fit_and_predict_svr(X_train, y_train, X_test, alpha):
    """
    Split-conformal intervals around a linear-kernel NuSVR prediction.

    Half of the training data (random split) fits the model; the 0.9 quantile
    of the absolute residuals on the other half is used as the half-width.

    Parameters:
    - X_train, y_train: Training features and targets.
    - X_test: Test features.
    - alpha: Sets the SVR cost as C = 1 / (2 * alpha).

    Returns:
    - Lower and upper bounds for X_test.
    """
    X_fit, X_cal, y_fit, y_cal = train_test_split(X_train, y_train, test_size=0.5)

    svr = NuSVR(nu=0.1, C=1 / (2 * alpha), kernel='linear')
    svr.fit(X_fit, y_fit)

    abs_errors = np.abs(y_cal - svr.predict(X_cal))
    half_width = np.quantile(abs_errors, 0.9)

    centre = svr.predict(X_test)
    return centre - half_width, centre + half_width

def fit_and_predict_ridge(X_train, y_train, X_test, alpha):
    """
    Split-conformal intervals around a Ridge prediction.

    Half of the training data (random split) fits the model; the 0.9 quantile
    of the absolute residuals on the other half is used as the half-width.

    Parameters:
    - X_train, y_train: Training features and targets.
    - X_test: Test features.
    - alpha: Ridge regularization strength.

    Returns:
    - Lower and upper bounds for X_test.
    """
    X_fit, X_cal, y_fit, y_cal = train_test_split(X_train, y_train, test_size=0.5)

    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_fit, y_fit)

    abs_errors = np.abs(y_cal - ridge_model.predict(X_cal))
    half_width = np.quantile(abs_errors, 0.9)

    centre = ridge_model.predict(X_test)
    return centre - half_width, centre + half_width

def compute_crr_bounds(X_train, y_train, X_test, y_test, hn,b_opt, alpha):
    """
    Compute lower and upper bounds for the CRR method.

    For each test row, the row is appended to X_train, the matrix
    I - CalThHat1(...) is applied to (y_train, 0) and to (0, ..., 0, 1), and a
    ratio is formed per row from the two results. The bounds are the lower
    alpha/2 and upper 1 - alpha/2 quantiles of those ratios.

    Parameters:
    X_train (array): Training features
    y_train (array): Training targets
    X_test (array): Test features
    y_test (array): Test targets; only its length is used
    hn (float): Regularization parameter passed to CalThHat1
    b_opt (float): Threshold passed to CalThHat1
    alpha (float): Level used for the two quantiles

    Returns:
    lower_bounds, upper_bounds (arrays): Lower and upper prediction bounds
    """
    n_train = len(y_train)
    n_test = len(y_test)
    y_padded = np.concatenate((y_train, [0]))
    last_unit = np.concatenate((np.zeros(n_train), [1]))
    lower_bounds = np.zeros(n_test)
    upper_bounds = np.zeros(n_test)

    for i in range(n_test):
        X_new = np.vstack((X_train, X_test[i, :]))
        residual_op = np.eye(n_train + 1) - CalThHat1(X_train, y_train, X_new, hn, b_opt)
        A1 = np.matmul(residual_op, y_padded)
        B1 = np.matmul(residual_op, last_unit)

        ratios = []
        for j in range(n_train + 1):
            # Rows whose B value is not below the last one contribute 0.
            if B1[-1] > B1[j]:
                ratios.append((A1[j] - A1[-1]) / (B1[-1] - B1[j]))
            else:
                ratios.append(0)
        ratios = np.array(ratios)

        lower_bounds[i] = calloQuantile(ratios, alpha/2)
        upper_bounds[i] = calupQuantile(ratios, 1-alpha/2)

    return lower_bounds, upper_bounds
     
def compute_decrr_bounds(X_train, y_train, X_test, y_test, alpha,h,b):
    """
    Compute lower and upper bounds for the DeCRR method.

    For each test row, the row is appended to X_train, the matrix
    I - CalThHatstar1(...) is applied to (y_train, 0) and to (0, ..., 0, 1),
    and a ratio is formed per row from the two results. The bounds are the
    lower alpha/2 and upper 1 - alpha/2 quantiles of those ratios.

    Parameters:
    - X_train: Training data
    - y_train: Training labels
    - X_test: Test data, one interval per row
    - y_test: Test labels; only its length is used
    - alpha: Level used for the two quantiles
    - h: Regularization parameter passed to CalThHatstar1
    - b: Threshold passed to CalThHatstar1

    Returns:
    - lower_bounds: Lower bounds for the prediction
    - upper_bounds: Upper bounds for the prediction
    """
    n_train = len(y_train)
    n_test = len(y_test)
    y_padded = np.concatenate((y_train, [0]))
    last_unit = np.concatenate((np.zeros(n_train), [1]))
    lower_bounds = np.zeros(n_test)
    upper_bounds = np.zeros(n_test)

    for i in range(n_test):
        X_new = np.vstack((X_train, X_test[i, :]))
        ThHatstar = CalThHatstar1(X_train, y_train, X_new, h, b)

        residual_op = np.eye(n_train + 1) - ThHatstar
        A2 = np.matmul(residual_op, y_padded)
        B2 = np.matmul(residual_op, last_unit)
        a_last, b_last = A2[-1], B2[-1]

        ratios = []
        for a_j, b_j in zip(A2, B2):
            # Rows whose B value is not below the last one contribute 0.
            if b_last > b_j:
                ratios.append((a_j - a_last) / (b_last - b_j))
            else:
                ratios.append(0)
        ratios = np.array(ratios)

        lower_bounds[i] = calloQuantile(ratios, alpha/2)
        upper_bounds[i] = calupQuantile(ratios, 1-alpha/2)

    return lower_bounds, upper_bounds
'''
def bootstrap_bias(X_train, y_train, X_test, B, alpha, hn, b):
    """
    Function to compute bounds for Method 4: Bootstrap(bias).

    Parameters:
    - X_train: Training features.
    - y_train: Training labels.
    - X_test: Test features.
    - Q_perp: Matrix for perpendicular component (if available).
    - B: Number of bootstrap samples.
    - alpha: Quantile for the bootstrap procedure.
    - hn: Hyperparameter (may be related to regularization).
    - b: Threshold for condition checking.

    Returns:
    - lower_bounds: Lower bound for predictions.
    - upper_bounds: Upper bound for predictions.
    """
    import numpy as np
    # Step 1: Compute initial Ridge estimate
    theTilde = RIdge(X_train, y_train, hn)  # Ridge regression
    condition = np.abs(theTilde) > b  # Apply condition based on threshold b
    thetahat = np.copy(theTilde.reshape(-1))
    thetahat[~condition] = 0  # Apply condition to set irrelevant elements to 0
    
    # Step 2: Compute sigma and residuals
    sigmahat = calSD(X_train, y_train, thetahat)  # Standard deviation calculation
    eps_hat = (y_train - np.matmul(X_train, thetahat)).reshape(-1)
    eps_haMeant = np.mean(eps_hat)
    epshat = eps_hat - eps_haMeant  # Center the residuals
    
    # Step 4: Initial predictions for the test set
    yhat_test = np.matmul(X_test, thetahat)
    lower_bounds1 = np.zeros(len(X_test))
    upper_bounds1 = np.zeros(len(X_test))


    # Step 5: Bootstrap sampling loop
    for i in range(len(X_test)):
        bootSamp = []
        for j in range(B):
            # Generate bootstrap samples
            epsfstar = np.random.choice(epshat, size=1)
            epsstar = np.random.normal(loc=0, scale=sigmahat, size=len(y_train))
            
            # Generate new bootstrap dataset
            ystar = np.matmul(X_train, thetahat) + epsstar
            
            # Refit the model with the bootstrap sample
            thetahatstar_1 = DeRIdge(X_train, ystar, hn)
            
            # Apply condition to the new theta estimate
            condition = np.abs(thetahatstar_1) > b
            thetahatstar = np.copy(thetahatstar_1.reshape(-1))
            thetahatstar[~condition] = 0
            
            # Compute the bootstrap prediction
            yfstar = np.matmul(X_test, thetahat) + epsfstar
            yfhatstar = np.matmul(X_test, thetahatstar)
            
            # Add the maximum absolute difference to the bootstrap sample
            bootSamp.append(np.amax(np.absolute(yfstar - yfhatstar)))
        
        # Step 6: Quantile computation for the bounds
        bootQuan = np.quantile(bootSamp, q=1 - alpha)
        
        # Step 7: Calculate final lower and upper bounds
        lower_bounds[i] = yhat_test[i] - bootQuan
        upper_bounds[i] = yhat_test[i] + bootQuan
    
    return lower_bounds, upper_bounds

def bootstrap_debias(X_train, y_train, X_test, hn, b, alpha, B):
    """
    Perform Bootstrap Debias method to compute lower and upper bounds.
    
    Parameters:
    X_train (array-like): Training data.
    y_train (array-like): Training target values.
    X_test (array-like): Test data.
    hn (float): Tuning parameter for DeRIdge.
    b (float): Threshold for shrinkage.
    alpha (float): Quantile for coverage (1-alpha is the confidence level).
    B (int): Number of bootstrap samples.
    Q_perp (array-like, optional): Perpendicular matrix if available.

    Returns:
    lower_bounds (array-like): Lower bounds of prediction intervals.
    upper_bounds (array-like): Upper bounds of prediction intervals.
    """
    import numpy as np
    # Compute the initial ridge estimator (Debias)
    theTilde = DeRIdge(X_train, y_train, hn)
    condition = np.abs(theTilde) > b
    thetahat = np.copy(theTilde.reshape(-1))
    thetahat[~condition] = 0  # Apply shrinkage
    sigmahat = calSD(X_train, y_train, thetahat)
    
    # Compute residuals and adjust
    eps_hat = (y_train - np.matmul(X_train, thetahat)).reshape(-1)
    eps_haMeant = np.mean(eps_hat)
    epshat = eps_hat - eps_haMeant
    
    # Initialize lists to store the bounds
    lower_bounds = np.zeros(len(X_test))
    upper_bounds = np.zeros(len(X_test))
    yhat_test = np.matmul(X_test, thetahat)
    
    # Iterate over each test instance
    for i in range(len(X_test)):
        bootSamp = []
        
        # Perform Bootstrap resampling
        for j in range(B):
            epsfstar = np.random.choice(epshat, size=1)  # Resample residuals
            epsstar = np.random.normal(loc=0, scale=sigmahat, size=len(y_train))  # Resample noise
            ystar = np.matmul(X_train, thetahat) + epsstar  # Adjusted training labels
            
            # Recompute theta using DeRIdge
            thetahatstar_1 = DeRIdge(X_train, ystar, hn)
            condition = np.abs(thetahatstar_1) > b
            thetahatstar = np.copy(thetahatstar_1.reshape(-1))
            thetahatstar[~condition] = 0  # Apply shrinkage to the new theta
            
            # Compute prediction and the difference
            yfstar = np.matmul(X_test[i], thetahat) + epsfstar
            yfhatstar = np.matmul(X_test[i], thetahatstar)
            bootSamp.append(np.amax(np.abs(yfstar - yfhatstar)))  # Capture the maximum absolute error
        
        # Quantile for the bootstrap sample
        bootQuan = np.quantile(bootSamp, q=1 - alpha)
        
        # Store the bounds
        lower_bounds[i] = yhat_test[i] - bootQuan
        upper_bounds[i] = yhat_test[i] + bootQuan
    
    return lower_bounds, upper_bounds
'''
import numpy as np

def compute_initial_estimator(X_train, y_train, hn, b, DeRIdge_flag=False):
    """
    Fit the thresholded initial estimator and summarize its residuals.

    Parameters:
    X_train, y_train: Training features and targets.
    hn: Regularization parameter passed to the estimator.
    b: Threshold; coefficients with |coef| <= b are set to zero.
    DeRIdge_flag (bool): Use DeRIdge when True, RIdge otherwise.

    Returns:
    tuple: (thresholded coefficients, calSD of the fit, mean-centered
        residuals).
    """
    estimator = DeRIdge if DeRIdge_flag else RIdge
    raw_coef = estimator(X_train, y_train, hn)

    # Work on a copy so the estimator's output is left untouched.
    keep = np.abs(raw_coef) > b
    thetahat = np.copy(raw_coef.reshape(-1))
    thetahat[~keep] = 0

    sigmahat = calSD(X_train, y_train, thetahat)

    residuals = (y_train - np.matmul(X_train, thetahat)).reshape(-1)
    centered = residuals - np.mean(residuals)

    return thetahat, sigmahat, centered

def bootstrap_bounds(X_train, y_train, X_test, B, alpha, hn, b, DeRIdge_flag=False):
    """
    Perform bootstrap method to compute lower and upper bounds for predictions.

    For every test row, B bootstrap draws are made. Each draw simulates
    training responses as fitted values plus Gaussian noise, refits the
    thresholded estimator on them, and records the absolute gap between a
    simulated test response (fitted value plus one resampled residual) and the
    refitted prediction. The 1 - alpha quantile of those gaps is the
    half-width of the interval around the initial prediction.

    Parameters:
    X_train, y_train: Training features and targets.
    X_test: Test features, one interval per row.
    B (int): Number of bootstrap draws per test row.
    alpha (float): The half-width is the 1 - alpha quantile of the gaps.
    hn: Regularization parameter for the estimator.
    b: Threshold applied to the coefficients.
    DeRIdge_flag (bool): Use DeRIdge when True, RIdge otherwise.

    Returns:
    tuple: (lower_bounds, upper_bounds), one entry per test row.
    """
    # Step 1: Compute the initial estimator (Debias or Ridge)
    thetahat, sigmahat, epshat = compute_initial_estimator(X_train, y_train, hn, b, DeRIdge_flag)

    # Step 2: Initial predictions for the test set
    yhat_test = np.matmul(X_test, thetahat)

    # Fitted training responses are identical for every draw.
    fitted_train = np.matmul(X_train, thetahat)
    refit = DeRIdge if DeRIdge_flag else RIdge

    # Step 3: Initialize the bounds
    lower_bounds = np.zeros(len(X_test))
    upper_bounds = np.zeros(len(X_test))

    # Step 4: Bootstrap resampling loop
    boot_samples = np.zeros((len(X_test), B))
    for i in range(len(X_test)):
        for j in range(B):
            epsfstar = np.random.choice(epshat, size=1)  # Resample one residual
            epsstar = np.random.normal(loc=0, scale=sigmahat, size=len(y_train))  # Simulated noise
            ystar = fitted_train + epsstar  # Simulated training responses

            # Refit on the bootstrap responses. Refitting on y_train would
            # return the same coefficients in every draw and ignore ystar.
            thetahatstar = refit(X_train, ystar, hn)
            condition = np.abs(thetahatstar) > b
            thetahatstar[~condition] = 0  # Apply shrinkage to the new theta

            # Compute bootstrap prediction and the difference
            yfstar = np.matmul(X_test[i], thetahat) + epsfstar
            yfhatstar = np.matmul(X_test[i], thetahatstar)
            boot_samples[i, j] = np.amax(np.abs(yfstar - yfhatstar))

        # Step 5: Compute the quantile for the bootstrap sample
        bootQuan = np.quantile(boot_samples[i], q=1 - alpha)

        # Step 6: Calculate final lower and upper bounds
        lower_bounds[i] = yhat_test[i] - bootQuan
        upper_bounds[i] = yhat_test[i] + bootQuan

    return lower_bounds, upper_bounds

def bootstrap_bias(X_train, y_train, X_test, B, alpha, hn, b):
    """
    Method 4, Bootstrap(Bias): bootstrap_bounds with the plain ridge estimator.
    """
    return bootstrap_bounds(
        X_train, y_train, X_test,
        B=B, alpha=alpha, hn=hn, b=b,
        DeRIdge_flag=False,
    )

def bootstrap_debias(X_train, y_train, X_test, hn, b, alpha, B):
    """
    Method 5, Bootstrap(Debias): bootstrap_bounds with the debiased ridge
    estimator. Note the argument order differs from bootstrap_bias.
    """
    return bootstrap_bounds(
        X_train, y_train, X_test,
        B=B, alpha=alpha, hn=hn, b=b,
        DeRIdge_flag=True,
    )


def compute_conditional_bins(X_test, feature_index=5):
    """
    Assign each test row to a quartile bin of one feature.

    Parameters:
    X_test (ndarray): Test features; a 1-D input is treated as one column.
    feature_index (int): Column used for binning. Default is 5.

    Returns:
    ndarray: Bin index per row from np.digitize against the 25th, 50th and
        75th percentiles of the chosen column.
    """
    features = X_test.reshape(-1, 1) if X_test.ndim == 1 else X_test
    column = features[:, feature_index].flatten()

    quartile_edges = np.percentile(column, [25, 50, 75])
    return np.digitize(column, bins=quartile_edges)


def compute_conditional_coverage(bins, y_test, bounds):
    """
    Compute conditional coverage for a given set of bounds.

    Parameters:
    bins (ndarray): Bin assignments for each sample in y_test.
    y_test (ndarray): True test values.
    bounds (ndarray): Array of shape (n_samples, 2) containing lower and upper bounds.

    Returns:
    list: Fraction of samples inside their bounds for each bin label that
        occurs in 'bins', in ascending label order.
    """
    lower_bounds, upper_bounds = bounds[:, 0], bounds[:, 1]
    covered = (y_test >= lower_bounds) & (y_test <= upper_bounds)
    # Iterate over the labels that actually occur. Counting labels and using
    # range(count) assumes they are exactly 0..count-1: a missing label then
    # yields NaN and the highest label is silently dropped.
    return [np.mean(covered[bins == label]) for label in np.unique(bins)]

def compute_optimal_cov_diff(conditional_coverages1, conditional_coverages2, cov_lists, conditional_coverages):
    """
    Record every method's coverage in the bin where method 2 most exceeds method 1.

    Parameters:
    conditional_coverages1, conditional_coverages2 (list): Per-bin coverage of
        the two methods being compared.
    cov_lists (list of lists): One list per method; each gets one value
        appended in place.
    conditional_coverages (list of lists): Per-bin coverage for all methods.

    Returns:
    None. 'cov_lists' is modified in place.
    """
    # Bin with the largest (method 2 - method 1) coverage difference.
    best_bin = np.argmax(np.subtract(conditional_coverages2, conditional_coverages1))

    for method_idx in range(len(cov_lists)):
        cov_lists[method_idx].append(conditional_coverages[method_idx][best_bin])

import threading
def Getresult(X, y, alpha, testtime, B,hn,b):
    """
    Compute conditional and marginal coverage, and interval lengths for 7 methods.

    Each repetition draws a random 80/20 train/test split, computes prediction
    bounds with seven methods (CRR, DeCRR, bootstrap with ridge, bootstrap
    with debiased ridge, quantile regression, NuSVR, ridge) and records three
    metrics per method. A repetition runs in a worker thread with a
    1800-second budget. Its metrics are recorded only if it finishes within
    the budget and without raising, so the result lists never receive partial
    or late entries.

    Parameters:
    - X, y: Features and targets; converted to float32 arrays via ``astype``.
    - alpha: Passed to every method.
    - testtime: Number of repetitions.
    - B: Number of bootstrap draws for the two bootstrap methods.
    - hn, b: Regularization parameter and threshold for the CRR, DeCRR and
      bootstrap methods.

    Returns:
    - cov_con, cov_marginal, interval_lengths: Three lists of 7 lists (one per
      method), each holding one value per recorded repetition. cov_con holds
      the coverage in the bin where method 1 most exceeds method 0.
    """
    import time
    X = np.array(X.astype(np.float32))
    y = np.array(y.astype(np.float32))

    n_methods = 7
    cov_con = [[] for _ in range(n_methods)]
    cov_marginal = [[] for _ in range(n_methods)]
    interval_lengths = [[] for _ in range(n_methods)]
    # Per-repetition time budget, in seconds (30 minutes)
    timeout_seconds = 1800

    def single_iteration(outcome):
        """Run one repetition and store its metrics in outcome["metrics"]."""
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
        bins = compute_conditional_bins(X_test, feature_index=5)

        # Initialize bounds for all methods
        bounds = [np.zeros((len(y_test), 2)) for _ in range(n_methods)]

        # Compute bounds for each method, reporting the run time of each
        start= time.time()
        bounds[0][:, 0], bounds[0][:, 1]= compute_crr_bounds(X_train, y_train, X_test, y_test, hn, b, alpha)
        print(f"crr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[1][:, 0], bounds[1][:, 1] = compute_decrr_bounds(X_train, y_train, X_test, y_test, alpha,hn,b)
        print(f"decerr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[2][:, 0], bounds[2][:, 1] = bootstrap_bias(X_train, y_train, X_test, B, alpha, hn, b)
        print(f"bootrr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[3][:, 0], bounds[3][:, 1] = bootstrap_debias(X_train, y_train, X_test, hn, b, alpha, B)
        print(f"bootdrr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[4][:, 0], bounds[4][:, 1] = fit_and_predict_qr(X_train, y_train, X_test, alpha)
        print(f"qr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[5][:, 0], bounds[5][:, 1] = fit_and_predict_svr(X_train, y_train, X_test, alpha)
        print(f"svr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[6][:, 0], bounds[6][:, 1] = fit_and_predict_ridge(X_train, y_train, X_test, alpha)
        print(f"r运行时间: {time.time() - start:.4f} 秒")

        # Compute conditional coverage for each method
        conditional_coverages = [
            compute_conditional_coverage(bins, y_test, bounds[i]) for i in range(n_methods)
        ]

        # Collect into repetition-local lists; the shared lists are updated by
        # the caller only after the repetition is known to have completed.
        iter_cov_con = [[] for _ in range(n_methods)]
        compute_optimal_cov_diff(conditional_coverages[0],conditional_coverages[1],iter_cov_con,conditional_coverages)

        iter_cov_marginal = []
        iter_lengths = []
        for method_idx in range(n_methods):
            lower_bounds, upper_bounds = bounds[method_idx][:, 0], bounds[method_idx][:, 1]
            iter_cov_marginal.append(np.mean((y_test >= lower_bounds) & (y_test <= upper_bounds)))
            iter_lengths.append(np.mean(upper_bounds - lower_bounds))

        # Single assignment, made last: its presence means "finished cleanly".
        outcome["metrics"] = (iter_cov_con, iter_cov_marginal, iter_lengths)

    for k in range(testtime):
        print(f"Iteration: {k+1}/{testtime}")

        outcome = {}
        task_thread = threading.Thread(target=single_iteration, args=(outcome,))
        task_thread.start()
        task_thread.join(timeout=timeout_seconds)  # Wait at most the budget

        if task_thread.is_alive():
            # The thread cannot be killed and keeps running, but it only
            # writes to its own `outcome`, which is discarded here.
            print(f"Iteration {k+1} 超时，跳过此循环")
            continue
        if "metrics" not in outcome:
            # The worker raised before finishing; record nothing for it.
            print(f"Iteration {k+1} failed with an exception; skipping its results")
            continue

        print(f"Iteration {k+1} 正常完成")
        iter_cov_con, iter_cov_marginal, iter_lengths = outcome["metrics"]
        for method_idx in range(n_methods):
            cov_con[method_idx].extend(iter_cov_con[method_idx])
            cov_marginal[method_idx].append(iter_cov_marginal[method_idx])
            interval_lengths[method_idx].append(iter_lengths[method_idx])

    return cov_con, cov_marginal, interval_lengths
'''
def Getresult(X, y, alpha, testtime, B,hn,b):
    """
    Compute conditional and marginal coverage, and interval lengths for 7 methods.
    """
    import time
    X = np.array(X.astype(np.float32))
    y = np.array(y.astype(np.float32))
    
    cov_con, cov_marginal, interval_lengths = [[] for _ in range(7)], [[] for _ in range(7)], [[] for _ in range(7)]
     # 超时时间设置
    timeout_seconds = 1800  # 每次循环最多允许 10 秒

    for k in range(testtime):
        print(f"Iteration: {k+1}/{testtime}")

        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
        bins = compute_conditional_bins(X_test, feature_index=5)
        
        
        # Initialize bounds for all methods
        bounds = [np.zeros((len(y_test), 2)) for _ in range(7)]

        # Compute bounds for each method
        start= time.time()
        bounds[0][:, 0], bounds[0][:, 1]= compute_crr_bounds(X_train, y_train, X_test, y_test, hn, b, alpha)
        print(f"crr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[1][:, 0], bounds[1][:, 1] = compute_decrr_bounds(X_train, y_train, X_test, y_test, alpha,hn,b)
        print(f"decerr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[2][:, 0], bounds[2][:, 1] = bootstrap_bias(X_train, y_train, X_test, B, alpha, hn, b)
        print(f"bootrr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[3][:, 0], bounds[3][:, 1] = bootstrap_debias(X_train, y_train, X_test, hn, b, alpha, B)
        print(f"bootdrr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[4][:, 0], bounds[4][:, 1] = fit_and_predict_qr(X_train, y_train, X_test, alpha)
        print(f"qr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[5][:, 0], bounds[5][:, 1] = fit_and_predict_svr(X_train, y_train, X_test, alpha)
        print(f"svr 运行时间: {time.time() - start:.4f} 秒")
        start= time.time()
        bounds[6][:, 0], bounds[6][:, 1] = fit_and_predict_ridge(X_train, y_train, X_test, alpha)
        print(f"r运行时间: {time.time() - start:.4f} 秒")




        # Compute conditional coverage for each method
        conditional_coverages = [
            compute_conditional_coverage(bins, y_test, bounds[i]) for i in range(7)
        ]
        
        # Update optimal conditional coverage differences
        compute_optimal_cov_diff(conditional_coverages[0],conditional_coverages[1],cov_con,conditional_coverages)


        # Compute coverage and lengths
        for method_idx in range(7):
            lower_bounds, upper_bounds = bounds[method_idx][:, 0], bounds[method_idx][:, 1]
            cov_marginal[method_idx].append(np.mean((y_test >= lower_bounds) & (y_test <= upper_bounds)))
            interval_lengths[method_idx].append(np.mean(upper_bounds - lower_bounds)
                                                )
    # 启动线程



    return cov_con, cov_marginal, interval_lengths
    '''
import numpy as np

def compute_statistics(cov_con, cov_marginal, interval_lengths):
    """
    Compute the mean and variance of each method's results.

    Parameters:
    - cov_con, cov_marginal, interval_lengths (list of lists):
        One inner sequence of per-iteration results for each method. The
        number of methods is taken from each argument's own length instead of
        being fixed at 7, so any number of methods is supported.

    Returns:
    - statistics (dict): For every metric prefix ("cov_con", "cov_marginal",
        "interval_lengths") the keys "<prefix>_mean" and "<prefix>_variance"
        each map to a list with one value per method, in input order.
        Variances are population variances (np.var with its default ddof).
    """
    statistics = {}

    metric_groups = (
        ("cov_con", cov_con),
        ("cov_marginal", cov_marginal),
        ("interval_lengths", interval_lengths),
    )
    for prefix, per_method in metric_groups:
        # Index positionally (rather than iterating the container directly) so
        # any container addressable by 0..n-1 keeps working as it did before.
        method_indices = range(len(per_method))
        statistics[f"{prefix}_mean"] = [np.mean(per_method[i]) for i in method_indices]
        statistics[f"{prefix}_variance"] = [np.var(per_method[i]) for i in method_indices]

    return statistics



import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_boxplot(data, ylabel, title):
    """
    Draw a single box plot and display it.

    Parameters:
    - data (pd.DataFrame): Frame with a "Method" column (plotted on the y-axis)
      and a "Metric" column (plotted on the x-axis).
    - ylabel (str): Label for the y-axis.
    - title (str): Title of the figure.
    """
    sns.set_theme(style="whitegrid", font="Times New Roman", font_scale=1.5)
    plt.figure(figsize=(10, 7))

    box_style = dict(palette="Set2", width=0.6, linewidth=2.5)
    axes = sns.boxplot(data=data, x="Metric", y="Method", **box_style)

    # The x-axis label is deliberately not overridden here.
    axes.set_title(title, fontsize=20, weight='bold')
    axes.set_ylabel(ylabel, fontsize=18, weight='bold')

    plt.tight_layout()
    plt.show()

def _to_long_frame(values_per_method, methods):
    """Flatten per-method result lists into a long "Metric"/"Method" DataFrame."""
    if len(values_per_method) != len(methods):
        raise ValueError(
            f"Expected results for {len(methods)} methods, got {len(values_per_method)}"
        )
    # Linear-time flatten; sum(list_of_lists, []) re-copies the accumulator
    # on every addition.
    metric = [value for values in values_per_method for value in values]
    # Label every value with the method it came from, using that method's own
    # number of results so lists of different lengths stay aligned.
    labels = [
        method
        for method, values in zip(methods, values_per_method)
        for _ in values
    ]
    return pd.DataFrame({"Metric": metric, "Method": labels})


def prepare_and_plot(cov_con, cov_marginal, interval_lengths):
    """
    Build and display three box plots: conditional coverage, marginal
    coverage and interval length.

    Parameters:
    - cov_con, cov_marginal, interval_lengths (list of lists):
      Per-method conditional coverage, marginal coverage and interval length
      results. Each argument must hold one sequence per method, in the order
      CRR, DeCRR, Bootstrap(RR), Bootstrap(DeRR), CQR, NuSVR, Ridge. The
      sequences of different methods may have different lengths.

    Raises:
    - ValueError: If an argument does not contain exactly one sequence per
      method.
    """
    methods = ["CRR", "DeCRR", "Bootstrap(RR)", "Bootstrap(DeRR)", "CQR", "NuSVR", "Ridge"]

    panels = (
        (cov_con, "Conditional Coverage"),
        (cov_marginal, "Marginal Coverage"),
        (interval_lengths, "Interval Length"),
    )
    for values_per_method, title in panels:
        frame = _to_long_frame(values_per_method, methods)
        plot_boxplot(frame, ylabel="Method", title=title)


# Example call to the plotting function:
# prepare_and_plot(cov_con, cov_marginal, interval_lengths)

def _meps_feature_columns(weight_column):
    """Return the ordered MEPS feature column names, with `weight_column` as the person-weight column."""
    return ['AGE', 'PCS42', 'MCS42', 'K6SUM42', weight_column, 'REGION=1',
            'REGION=2', 'REGION=3', 'REGION=4', 'SEX=1', 'SEX=2', 'MARRY=1',
            'MARRY=2', 'MARRY=3', 'MARRY=4', 'MARRY=5', 'MARRY=6', 'MARRY=7',
            'MARRY=8', 'MARRY=9', 'MARRY=10', 'FTSTU=-1', 'FTSTU=1', 'FTSTU=2',
            'FTSTU=3', 'ACTDTY=1', 'ACTDTY=2', 'ACTDTY=3', 'ACTDTY=4',
            'HONRDC=1', 'HONRDC=2', 'HONRDC=3', 'HONRDC=4', 'RTHLTH=-1',
            'RTHLTH=1', 'RTHLTH=2', 'RTHLTH=3', 'RTHLTH=4', 'RTHLTH=5',
            'MNHLTH=-1', 'MNHLTH=1', 'MNHLTH=2', 'MNHLTH=3', 'MNHLTH=4',
            'MNHLTH=5', 'HIBPDX=-1', 'HIBPDX=1', 'HIBPDX=2', 'CHDDX=-1',
            'CHDDX=1', 'CHDDX=2', 'ANGIDX=-1', 'ANGIDX=1', 'ANGIDX=2',
            'MIDX=-1', 'MIDX=1', 'MIDX=2', 'OHRTDX=-1', 'OHRTDX=1', 'OHRTDX=2',
            'STRKDX=-1', 'STRKDX=1', 'STRKDX=2', 'EMPHDX=-1', 'EMPHDX=1',
            'EMPHDX=2', 'CHBRON=-1', 'CHBRON=1', 'CHBRON=2', 'CHOLDX=-1',
            'CHOLDX=1', 'CHOLDX=2', 'CANCERDX=-1', 'CANCERDX=1', 'CANCERDX=2',
            'DIABDX=-1', 'DIABDX=1', 'DIABDX=2', 'JTPAIN=-1', 'JTPAIN=1',
            'JTPAIN=2', 'ARTHDX=-1', 'ARTHDX=1', 'ARTHDX=2', 'ARTHTYPE=-1',
            'ARTHTYPE=1', 'ARTHTYPE=2', 'ARTHTYPE=3', 'ASTHDX=1', 'ASTHDX=2',
            'ADHDADDX=-1', 'ADHDADDX=1', 'ADHDADDX=2', 'PREGNT=-1', 'PREGNT=1',
            'PREGNT=2', 'WLKLIM=-1', 'WLKLIM=1', 'WLKLIM=2', 'ACTLIM=-1',
            'ACTLIM=1', 'ACTLIM=2', 'SOCLIM=-1', 'SOCLIM=1', 'SOCLIM=2',
            'COGLIM=-1', 'COGLIM=1', 'COGLIM=2', 'DFHEAR42=-1', 'DFHEAR42=1',
            'DFHEAR42=2', 'DFSEE42=-1', 'DFSEE42=1', 'DFSEE42=2',
            'ADSMOK42=-1', 'ADSMOK42=1', 'ADSMOK42=2', 'PHQ242=-1', 'PHQ242=0',
            'PHQ242=1', 'PHQ242=2', 'PHQ242=3', 'PHQ242=4', 'PHQ242=5',
            'PHQ242=6', 'EMPST=-1', 'EMPST=1', 'EMPST=2', 'EMPST=3', 'EMPST=4',
            'POVCAT=1', 'POVCAT=2', 'POVCAT=3', 'POVCAT=4', 'POVCAT=5',
            'INSCOV=1', 'INSCOV=2', 'INSCOV=3', 'RACE']


# Integer codes for the categorical STAR columns, as (label, code) pairs.
_STAR_CLASS_TYPE_CODES = (('regular', 0), ('small', 1), ('regular+aide', 2))
_STAR_LUNCH_CODES = (('free', 0), ('non-free', 1))
_STAR_SCHOOL_CODES = (('inner-city', 0), ('suburban', 1), ('rural', 2), ('urban', 3))
_STAR_DEGREE_CODES = (('bachelor', 0), ('master', 1), ('specialist', 2), ('phd', 3))
_STAR_LADDER_CODES = (('level1', 0), ('level2', 1), ('level3', 2), ('apprentice', 3),
                      ('probation', 4), ('noladder', 5), ('notladder', 6))
_STAR_TEACHER_ETHNICITY_CODES = (('cauc', 0), ('afam', 1))

# (column, codes) in the order the columns are encoded. The kindergarten
# columns 'degreek' and 'ladderk' use a slightly different label set from
# their grade 1-3 counterparts, and 'tethnicity3' has one extra label.
_STAR_CATEGORY_CODES = (
    ('gender', (('female', 0), ('male', 1))),
    ('ethnicity', (('cauc', 0), ('afam', 1), ('asian', 2), ('hispanic', 3),
                   ('amindian', 4), ('other', 5))),
    ('stark', _STAR_CLASS_TYPE_CODES),
    ('star1', _STAR_CLASS_TYPE_CODES),
    ('star2', _STAR_CLASS_TYPE_CODES),
    ('star3', _STAR_CLASS_TYPE_CODES),
    ('lunchk', _STAR_LUNCH_CODES),
    ('lunch1', _STAR_LUNCH_CODES),
    ('lunch2', _STAR_LUNCH_CODES),
    ('lunch3', _STAR_LUNCH_CODES),
    ('schoolk', _STAR_SCHOOL_CODES),
    ('school1', _STAR_SCHOOL_CODES),
    ('school2', _STAR_SCHOOL_CODES),
    ('school3', _STAR_SCHOOL_CODES),
    ('degreek', (('bachelor', 0), ('master', 1), ('specialist', 2), ('master+', 3))),
    ('degree1', _STAR_DEGREE_CODES),
    ('degree2', _STAR_DEGREE_CODES),
    ('degree3', _STAR_DEGREE_CODES),
    ('ladderk', (('level1', 0), ('level2', 1), ('level3', 2), ('apprentice', 3),
                 ('probation', 4), ('pending', 5), ('notladder', 6))),
    ('ladder1', _STAR_LADDER_CODES),
    ('ladder2', _STAR_LADDER_CODES),
    ('ladder3', _STAR_LADDER_CODES),
    ('tethnicityk', _STAR_TEACHER_ETHNICITY_CODES),
    ('tethnicity1', _STAR_TEACHER_ETHNICITY_CODES),
    ('tethnicity2', _STAR_TEACHER_ETHNICITY_CODES),
    ('tethnicity3', _STAR_TEACHER_ETHNICITY_CODES + (('asian', 2),)),
)


def _encode_star_categories(df):
    """Replace the categorical labels of the STAR frame with integer codes, in place."""
    for column, codes in _STAR_CATEGORY_CODES:
        for label, code in codes:
            df.loc[df[column] == label, column] = code


def GetDataset(name, base_path):
    """ Load a dataset

    Parameters
    ----------
    name : string, dataset name. One of "meps_19", "meps_20", "meps_21",
        "star", "facebook_1", "facebook_2", "bio", "blog_data", "concrete",
        "bike", "community", "boston".
    base_path : string, e.g. "path/to/datasets/directory/". File names are
        appended to it directly, so it must end with a path separator.

    Returns
    -------
    X : features (nXp)
    y : labels (n)

    Notes
    -----
    * "meps_19" seeds NumPy's global RNG with 222 and returns only the 20%
      hold-out part of a fixed train/test split.
    * "concrete" returns y with shape (n, 1) rather than (n,).
    * "boston" returns the California housing data as pandas objects
      (DataFrame / Series), not NumPy arrays.

    Raises
    ------
    ValueError
        If `name` is not one of the supported dataset names.
    """
    import numpy as np
    import pandas as pd

    if name == "meps_19":
        df = pd.read_csv(base_path + 'meps_19_reg.csv')
        y = df["UTILIZATION_reg"].values
        X = df[_meps_feature_columns('PERWT15F')].values
        seed = 222
        np.random.seed(seed)
        # Only the 20% "test" part of the split is kept as the dataset.
        _, X, _, y = train_test_split(X, y, test_size=0.2, random_state=seed)

    elif name == "meps_20":
        df = pd.read_csv(base_path + 'meps_20_reg.csv')
        y = df["UTILIZATION_reg"].values
        X = df[_meps_feature_columns('PERWT15F')].values

    elif name == "meps_21":
        df = pd.read_csv(base_path + 'meps_21_reg.csv')
        y = df["UTILIZATION_reg"].values
        X = df[_meps_feature_columns('PERWT16F')].values

    elif name == "star":
        df = pd.read_csv(base_path + 'STAR.csv')
        _encode_star_categories(df)
        df = df.dropna()

        # The response is the sum of the eight reading and math scores.
        grade = df["readk"] + df["read1"] + df["read2"] + df["read3"]
        grade += df["mathk"] + df["math1"] + df["math2"] + df["math3"]

        names = df.columns
        # Columns 8-15 are left out of the features, and so is column 16.
        # NOTE(review): presumably 8-15 are the score columns summed above and
        # skipping 16 as well may be unintended - confirm against STAR.csv.
        data_names = np.concatenate((names[0:8], names[17:]))
        X = df.loc[:, data_names].values
        y = grade.values

    elif name == "facebook_1":
        df = pd.read_csv(base_path + 'facebook/Features_Variant_1.csv')
        y = df.iloc[:, 53].values
        X = df.iloc[:, 0:53].values

    elif name == "facebook_2":
        df = pd.read_csv(base_path + 'facebook/Features_Variant_2.csv')
        y = df.iloc[:, 53].values
        X = df.iloc[:, 0:53].values

    elif name == "bio":
        # https://github.com/joefavergel/TertiaryPhysicochemicalProperties/blob/master/RMSD-ProteinTertiaryStructures.ipynb
        df = pd.read_csv(base_path + 'CASP.csv')
        y = df.iloc[:, 0].values
        X = df.iloc[:, 1:].values

    elif name == 'blog_data':
        # https://github.com/xinbinhuang/feature-selection_blogfeedback
        df = pd.read_csv(base_path + 'blogData_train.csv', header=None)
        X = df.iloc[:, 0:280].values
        y = df.iloc[:, -1].values

    elif name == "concrete":
        # Close the file deterministically instead of leaking the handle.
        with open(base_path + 'Concrete_Data.csv', "rb") as handle:
            dataset = np.loadtxt(handle, delimiter=",", skiprows=1)
        X = dataset[:, :-1]
        # NOTE(review): the -1: slice keeps y two-dimensional, shape (n, 1),
        # unlike the other datasets; kept as-is for existing callers.
        y = dataset[:, -1:]

    elif name == "bike":
        # https://www.kaggle.com/rajmehra03/bike-sharing-demand-rmsle-0-3194
        df = pd.read_csv(base_path + 'bike_train.csv')

        # One-hot encode season and weather, then drop the raw columns.
        season = pd.get_dummies(df['season'], prefix='season')
        df = pd.concat([df, season], axis=1)
        weather = pd.get_dummies(df['weather'], prefix='weather')
        df = pd.concat([df, weather], axis=1)
        df.drop(['season', 'weather'], inplace=True, axis=1)

        # Parse the timestamps once and derive the calendar features from them.
        timestamps = pd.DatetimeIndex(df.datetime)
        df["hour"] = [t.hour for t in timestamps]
        df["day"] = [t.dayofweek for t in timestamps]
        df["month"] = [t.month for t in timestamps]
        df['year'] = [t.year for t in timestamps]
        df['year'] = df['year'].map({2011: 0, 2012: 1})

        df.drop('datetime', axis=1, inplace=True)
        df.drop(['casual', 'registered'], axis=1, inplace=True)
        X = df.drop('count', axis=1).values
        y = df['count'].values

    elif name == "community":
        # https://github.com/vbordalo/Communities-Crime/blob/master/Crime_v1.ipynb
        from sklearn.impute import SimpleImputer

        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        attrib = pd.read_csv(base_path + 'communities_attributes.csv', sep=r'\s+')
        data = pd.read_csv(base_path + 'communities.data', names=attrib['attributes'])
        data = data.drop(columns=['state', 'county',
                                  'community', 'communityname',
                                  'fold'])

        data = data.replace('?', np.nan)

        # Mean-impute 'OtherPerCap'; every other column that still has
        # missing values is dropped.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        data[['OtherPerCap']] = imputer.fit_transform(data[['OtherPerCap']])
        data = data.dropna(axis=1)
        X = data.iloc[:, 0:100].values
        y = data.iloc[:, 100].values

    elif name == "boston":
        # Despite the name, this loads scikit-learn's California housing data.
        from sklearn.datasets import fetch_california_housing

        data = fetch_california_housing(as_frame=True)
        # NOTE(review): these are pandas objects, unlike the NumPy arrays
        # returned for the other datasets; kept as-is for existing callers.
        X = data.data
        y = data.target

    else:
        raise ValueError(f"Unknown dataset name: {name!r}")

    return X, y