# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import openai
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
import statsmodels.api as sm
from scipy.special import expit
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.semi_supervised import LabelSpreading
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import t

# OpenAI credentials: replace the placeholder with a real key before running.
openai.api_key = '[API KEY]'  # insert your api key

# Source data sets read from the working directory.
pitt_labeled = pd.read_csv('pitt_labeled_with_nlp.csv')
hopkins_unlabeled = pd.read_csv('hopkins_unlabeled_with_nlp.csv')

# Columns removed from each data set. errors='ignore' below tolerates columns
# that are missing or named slightly differently.
_hopkins_columns_to_drop = [
    'File',
    'Date.of.Audio..taped.Language.Sample',
    'X1Date.of.Testing',
]
_pitt_columns_to_drop = ['id', 'idate', 'dx1', 'dx2', 'dx3']

hopkins_unlabeled_cleaned = hopkins_unlabeled.drop(
    columns=_hopkins_columns_to_drop, errors='ignore'
)
pitt_labeled_cleaned = pitt_labeled.drop(
    columns=_pitt_columns_to_drop, errors='ignore'
)

# List-based result containers.
beta_hats, labeled_only_params = [], []

# Seed NumPy's global random number generator.
np.random.seed(32)

# How many subjects to generate per iteration, and how many iterations to run.
n_subjects = 300
n_iter = 300

# Per-method coefficient tables (proposed, labeled-only, semi-supervised).
beta_hats_proposed = pd.DataFrame()
beta_hats_naive = pd.DataFrame()
beta_hats_ssl = pd.DataFrame()

"""## U: test score"""

# Define categorization function for Hamilton Depression Scale (HAM-D)
def categorize_hamilton(score):
    """
    Maps a Hamilton Depression Scale (HAM-D) score to a severity category.

    Parameters:
    score (number or missing value): HAM-D score; None/NaN are accepted.

    Returns:
    str: 'Unknown' for a missing score, 'Invalid Score' for a negative score,
         otherwise 'Normal' (0-7), 'Mild Depression' (up to 13),
         'Moderate Depression' (up to 18), 'Severe Depression' (up to 22)
         or 'Very Severe Depression' (above 22).

    Each band is delimited by its inclusive upper cut-off, so a fractional
    score such as 7.5 falls into the next band up rather than into a gap
    between bands.
    """
    if pd.isnull(score):
        return 'Unknown'
    if score < 0:
        return 'Invalid Score'

    # (inclusive upper cut-off, label) pairs, checked in ascending order.
    bands = (
        (7, 'Normal'),
        (13, 'Mild Depression'),
        (18, 'Moderate Depression'),
        (22, 'Severe Depression'),
    )
    for upper_cutoff, label in bands:
        if score <= upper_cutoff:
            return label
    return 'Very Severe Depression'

# Derive the categorical HAM-D column from the raw 'hamilton' scores.
_hamilton_scores = pitt_labeled['hamilton']
pitt_labeled['hamilton_category'] = _hamilton_scores.apply(categorize_hamilton)

# Show how many rows fall into each category.
_hamilton_category_counts = pitt_labeled['hamilton_category'].value_counts()
print(_hamilton_category_counts)

"""## (1) LLM prediction"""

# Define helper function to generate prediction
def zero_shot_predict(Sex, Race, Education, ttr_category, filler_ratio_category, mean_utterance_length, Score, Transcript):
    """
    Asks the chat model for a zero-shot binary Alzheimer's Disease prediction.

    Parameters:
    Sex, Race, Education: Demographic values inserted into the prompt.
    ttr_category: Lexical diversity category inserted into the prompt.
    filler_ratio_category: Filler word usage category inserted into the prompt.
    mean_utterance_length: Accepted for interface compatibility; it is not
        inserted into the prompt.
    Score: Value inserted on the prompt's "Hamilton score" line.
    Transcript (str): Speech transcript; only the first 8000 characters are sent.

    Returns:
    int or None: 0 or 1 when the reply reduces to exactly one of those digits.
        None when the reply is anything else, or when the request or the
        parsing raises (the problem is printed in both cases).
    """
    # Cap the transcript length before building the prompt.
    max_transcript_chars = 8000
    if len(Transcript) > max_transcript_chars:
        Transcript = Transcript[:max_transcript_chars]

    prompt = f"""You are a medical assistant trained to assess Alzheimer's Disease (AD) risk using both structured patient data and natural language transcripts of picture description tasks. Given a patient’s demographic information, two NLP-derived categories (lexical diversity and filler word usage), and their transcript, decide whether they are likely to have Alzheimer’s Disease (label as 1) or not (label as 0). Your prediction should rely primarily on semantic coherence, richness of description, sentence structure, and topic maintenance in the transcripts, supported by the NLP-derived categories.

Focus especially on:

- Semantic richness: Are objects and actions well described?
- Syntactic fluency: Are sentences grammatically structured?
- Discourse organization: Is the description cohesive, progressing logically?
- Error patterns: Do you see word-finding difficulty, repetition, or vagueness?

Interpretation of NLP-derived categories:
- Lexical Diversity Category (Type-Token Ratio):
  - High lexical diversity: Typical of cognitively healthy individuals.
  - Moderate lexical diversity: Mildly reduced; possibly normal aging.
  - Low-moderate lexical diversity: Suggestive of mild cognitive impairment.
  - Low lexical diversity: Often indicative of Alzheimer's or dementia.

- Filler Word Usage Category:
  - Normal: Typical cognitive function.
  - Mildly elevated: Possible mild cognitive changes.
  - Elevated: Indicative of cognitive impairment or Alzheimer's.

Patient Information:
- Sex: {Sex}
- Race: {Race}
- Education (years): {Education}
- Lexical Diversity Category: {ttr_category}
- Filler Word Usage Category: {filler_ratio_category}
- Speech excerpt: {Transcript}
- Hamilton score: {Score}


Return your answer as 0 (unlikely AD) or 1 (likely AD). No explanation is needed.

Answer:"""

    try:
        # A single output token is requested because the answer is one digit.
        completion = openai.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=1,
        )
        prediction_raw = completion.choices[0].message.content.strip()

        # Discard every character that is not a binary digit.
        binary_digits = ''.join(ch for ch in prediction_raw if ch in '01')
        if binary_digits not in ('0', '1'):
            print(f"Unexpected model output: {prediction_raw}")
            return None
        return int(binary_digits)

    except Exception as e:
        # Best effort: a failed request yields a missing prediction.
        print(f"Error: {e}")
        return None

"""## (2) Weight computation"""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def estimate_cov_and_var(X, Y_star_xz, Y, X_new=None, n_estimators=100, random_state=0):
    """
    Estimates a conditional covariance, a conditional variance, a conditional
    mean and their covariance/variance ratio with Random Forest regressions.

    Every forest is trained on X. Residuals of Y_star_xz and Y are taken from
    in-sample predictions on X; the residual product and the squared
    Y_star_xz residual are then regressed on X to obtain gamma and nu.

    Parameters:
    X (DataFrame or ndarray): Training covariates (n_samples x n_features).
    Y_star_xz (array-like): Training outcome Y_star_xz.
    Y (array-like): Training outcome Y.
    X_new (DataFrame or ndarray, optional): Covariates to predict at. If None, X is used.
    n_estimators (int): Number of trees in each forest.
    random_state (int): Random state shared by all forests.

    Returns:
    DataFrame: The prediction covariates (columns X1, X2, ...) together with
        gamma_xz_hat, nu_xz_hat, m_x_hat and omega_xz_hat = gamma_xz_hat / nu_xz_hat,
        where infinite or undefined ratios are set to 0.
    """
    features = np.asarray(X)
    surrogate = np.asarray(Y_star_xz)
    outcome = np.asarray(Y)
    eval_points = features if X_new is None else np.asarray(X_new)

    def _fit_forest(target):
        # All forests share the same hyper-parameters and training covariates.
        forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        forest.fit(features, target)
        return forest

    # Conditional means E[Y_star_xz | X] and E[Y | X], and their residuals.
    surrogate_forest = _fit_forest(surrogate)
    outcome_forest = _fit_forest(outcome)
    surrogate_resid = surrogate - surrogate_forest.predict(features)
    outcome_resid = outcome - outcome_forest.predict(features)

    # gamma_xz(x) = Cov(Y_star_xz, Y | X=x): regress the residual product on X.
    gamma_xz_hat = _fit_forest(surrogate_resid * outcome_resid).predict(eval_points)

    # nu_xz(x) = Var(Y_star_xz | X=x): regress the squared residual on X.
    nu_xz_hat = _fit_forest(surrogate_resid ** 2).predict(eval_points)

    # m(x) = E[Y | X=x] at the prediction points.
    m_x_hat = outcome_forest.predict(eval_points)

    column_names = [f'X{j}' for j in range(1, eval_points.shape[1] + 1)]
    results_df = pd.DataFrame(eval_points, columns=column_names)
    results_df['gamma_xz_hat'] = gamma_xz_hat
    results_df['nu_xz_hat'] = nu_xz_hat
    results_df['m_x_hat'] = m_x_hat

    # A zero variance estimate makes the ratio infinite or NaN; use 0 instead.
    ratio = results_df['gamma_xz_hat'] / results_df['nu_xz_hat']
    results_df['omega_xz_hat'] = ratio.replace([np.inf, -np.inf], 0).fillna(0)

    return results_df

"""## (3) Prediction invariant region"""

def compute_honest_region(Y_star_XZ, Y_star_XZU, X, alpha=0.05, depth_d=3, seed=44):
    """
    Tests whether two prediction vectors agree on average, first over all rows
    and, if that is rejected, within the leaves of a regression tree.

    The global check is a two-sided t-test on the paired differences
    Y_star_XZ - Y_star_XZU. If it rejects, the rows are split at random into
    two halves: a tree of depth `depth_d` is fit to the differences on the
    first half, and each leaf reached by the second half is t-tested on the
    second half only, with alpha divided by the number of such leaves.

    Parameters:
    Y_star_XZ (array-like): First prediction per row.
    Y_star_XZU (array-like): Second prediction per row (same length).
    X (DataFrame): Covariates, one row per prediction.
    alpha (float): Two-sided significance level.
    depth_d (int): Maximum depth of the regression tree.
    seed (int): Seed for the random split and for the tree.

    Returns:
    tuple: (honest_all, A_hat, message) where
        honest_all (bool) is True only when the global test does not reject;
        A_hat (DataFrame) holds the rows of X in the accepted region (a copy
            of X, the rows falling in accepted leaves, or an empty frame).
            Rows keep the index labels they have in X so callers can match
            them back to their data;
        message (str) describes the outcome.
    """
    # Work on plain float arrays so the positional indexing below never turns
    # into label-based lookups on a pandas Series, and so lists are accepted.
    diff = np.asarray(Y_star_XZ, dtype=float) - np.asarray(Y_star_XZU, dtype=float)
    n = len(diff)
    s = np.std(diff, ddof=1)

    if s == 0:
        return False, pd.DataFrame(), 'No variation in outcome differences.'

    T_global = np.mean(diff) / (s / np.sqrt(n))
    t_crit_global = t.ppf(1 - alpha / 2, df=n - 1)

    if np.abs(T_global) <= t_crit_global:
        return True, X.copy(), 'Whole region is honest.'

    # Use a private generator: reseeding NumPy's global state here would
    # reset the random stream of whatever code calls this function.
    rng = np.random.RandomState(seed)
    indices = np.arange(n)
    rng.shuffle(indices)
    I_1, I_2 = indices[:n // 2], indices[n // 2:]

    # Grow the partition on the first half only.
    cart = DecisionTreeRegressor(max_depth=depth_d, random_state=seed)
    cart.fit(X.iloc[I_1, :], diff[I_1])

    # Evaluate every leaf on the held-out second half.
    leaf_ids_I2 = cart.apply(X.iloc[I_2, :])
    unique_leaves = np.unique(leaf_ids_I2)

    honest_leaves = []
    for leaf in unique_leaves:
        diff_leaf = diff[I_2[leaf_ids_I2 == leaf]]
        n_leaf = len(diff_leaf)

        # A sample standard deviation needs at least two observations.
        if n_leaf <= 1:
            continue
        s_leaf = np.std(diff_leaf, ddof=1)
        if s_leaf == 0:
            continue

        T_leaf = np.mean(diff_leaf) / (s_leaf / np.sqrt(n_leaf))
        t_crit_leaf = t.ppf(1 - alpha / (2 * len(unique_leaves)), df=n_leaf - 1)

        if np.abs(T_leaf) <= t_crit_leaf:
            honest_leaves.append(leaf)

    if honest_leaves:
        # Keep X's index labels; resetting them would make the returned rows
        # impossible to match back to the rows of X.
        A_hat = X.iloc[np.isin(cart.apply(X), honest_leaves), :].copy()
        return False, A_hat, 'Partial honest regions identified.'
    else:
        return False, pd.DataFrame(), 'No honest regions identified.'

# Y_star_XZ = labeled['Y_star_XZ']
# Y_star_XZU = labeled['Y_star_XZU']

# honest_all, A_hat, message = compute_honest_region(Y_star_XZ, Y_star_XZU, X)

# print("Honest All:", honest_all)
# print("Message:", message)
# if not A_hat.empty:
#     print("Identified Honest Region(s):")
#     print(A_hat.head())
# else:
#     print("No regions returned.")

"""## (4) Pseudo Label"""

def pseudo_label_generation(result_df_xz, result_df_xzu, labeled, unlabeled, honest_all, A_hat):
    """
    Builds pseudo-labels for the labeled rows followed by the unlabeled rows.

    Each row first gets a fitted value f_hat = m + omega * (Y_star - m). Which
    weights are used depends on the region flags:
      - honest_all is True: the XZU weights everywhere;
      - A_hat is empty: the XZ weights everywhere;
      - otherwise: the XZU weights for rows whose index label appears in
        A_hat.index and the XZ weights for the remaining rows.
    For unlabeled rows m and Y_star always come from the XZ quantities.

    Parameters:
    result_df_xz (DataFrame): Columns 'm_x_hat' and 'omega_xz_hat'; the first
        n rows belong to the labeled data, the rest to the unlabeled data.
    result_df_xzu (DataFrame): Columns 'm_x_hat' and 'omega_xzu_hat', same row layout.
    labeled (DataFrame): n rows with columns 'Y', 'Y_star_XZ' and 'Y_star_XZU'.
    unlabeled (DataFrame): N rows with column 'Y_star_XZ'.
    honest_all (bool): Whether the whole region was accepted.
    A_hat (DataFrame): Accepted region; only its emptiness and index are used.

    Returns:
    ndarray: Length n + N. The first n entries are Y + (N/n) * (Y - f_hat)
        for the labeled rows; the remaining N entries are f_hat for the
        unlabeled rows.
    """
    n = labeled.shape[0]
    N = unlabeled.shape[0]

    def _split(frame):
        # Labeled rows come first, unlabeled rows follow.
        head = frame.iloc[:n].reset_index(drop=True)
        tail = frame.iloc[n:].reset_index(drop=True)
        return head, tail

    def _shrink(center, weight, prediction):
        # Moves `center` towards `prediction` by the fraction `weight`.
        return center + weight * (prediction - center)

    xz_lab, xz_unl = _split(result_df_xz)
    xzu_lab, xzu_unl = _split(result_df_xzu)

    m_lab_xz = xz_lab['m_x_hat'].values
    m_lab_xzu = xzu_lab['m_x_hat'].values
    w_lab_xz = xz_lab['omega_xz_hat'].values
    w_lab_xzu = xzu_lab['omega_xzu_hat'].values

    m_unl_xz = xz_unl['m_x_hat'].values
    w_unl_xz = xz_unl['omega_xz_hat'].values
    w_unl_xzu = xzu_unl['omega_xzu_hat'].values

    ystar_lab_xz = labeled['Y_star_XZ'].values
    ystar_lab_xzu = labeled['Y_star_XZU'].values
    ystar_unl_xz = unlabeled['Y_star_XZ'].values

    if honest_all:
        # Entire region accepted: XZU weights for every row.
        f_hat_unlabeled = _shrink(m_unl_xz, w_unl_xzu, ystar_unl_xz)
        f_hat_labeled = _shrink(m_lab_xzu, w_lab_xzu, ystar_lab_xzu)
    elif A_hat.empty:
        # Nothing accepted: XZ weights for every row.
        f_hat_unlabeled = _shrink(m_unl_xz, w_unl_xz, ystar_unl_xz)
        f_hat_labeled = _shrink(m_lab_xz, w_lab_xz, ystar_lab_xz)
    else:
        # Partially accepted: choose row by row via index membership in A_hat.
        lab_in_region = labeled.index.isin(A_hat.index)
        unl_in_region = unlabeled.index.isin(A_hat.index)
        f_hat_unlabeled = np.where(
            unl_in_region,
            _shrink(m_unl_xz, w_unl_xzu, ystar_unl_xz),
            _shrink(m_unl_xz, w_unl_xz, ystar_unl_xz),
        )
        f_hat_labeled = np.where(
            lab_in_region,
            _shrink(m_lab_xzu, w_lab_xzu, ystar_lab_xzu),
            _shrink(m_lab_xz, w_lab_xz, ystar_lab_xz),
        )

    pseudo_labels = np.zeros(N + n)
    # Labeled rows: observed outcome plus a scaled residual correction.
    pseudo_labels[:n] = labeled['Y'] + (N / n) * (labeled['Y'] - f_hat_labeled)
    # Unlabeled rows: the fitted value itself.
    pseudo_labels[n:] = f_hat_unlabeled

    return pseudo_labels

# pesudo_labels = pseudo_label_generation(result_df_xz, result_df_xzu, labeled, unlabeled, False, A_hat)

"""## Simulation"""

def _predict_with_and_without_score(frame, sex_col, race_col, educ_col, require_education):
    """
    Queries the LLM twice per row: once with the score given as "NA" and once
    with the row's 'hamilton_category'.

    Parameters:
    frame (DataFrame): One row per subject.
    sex_col, race_col, educ_col (str): Names of the columns holding sex, race
        and education in `frame`.
    require_education (bool): If True, a missing education value also causes
        the row to be skipped.

    Returns:
    tuple(list, list): Predictions without and with the score, in row order.
        A row with a missing required field gets None in both lists and
        triggers no request.
    """
    preds_without_score, preds_with_score = [], []
    for _, row in frame.iterrows():
        sex = row.get(sex_col, 'Unknown')
        race = row.get(race_col, 'Unknown')
        education = row.get(educ_col, 'Unknown')
        ttr_category = row.get('ttr_category', 'Unknown')
        filler_ratio_category = row.get('filler_ratio_category', 'Unknown')
        mean_utterance_length = row.get('mean_utterance_length', 'Unknown')
        score = row.get('hamilton_category', '')
        transcript = row.get('Transcripts', '')

        required = [transcript, sex, race, ttr_category, filler_ratio_category, mean_utterance_length]
        if require_education:
            required.append(education)

        if any(pd.isna(value) for value in required):
            preds_without_score.append(None)
            preds_with_score.append(None)
            continue

        pred_xz = zero_shot_predict(sex, race, education, ttr_category, filler_ratio_category,
                                    mean_utterance_length, "NA", transcript)
        pred_xzu = zero_shot_predict(sex, race, education, ttr_category, filler_ratio_category,
                                     mean_utterance_length, score, transcript)
        preds_without_score.append(pred_xz)
        preds_with_score.append(pred_xzu)
        # Pause between subjects to space out the requests.
        time.sleep(1)

    return preds_without_score, preds_with_score


for iteration in range(n_iter):
    print(f"\n--- Simulation Iteration: {iteration + 1} ---")

    # Step 1: Draw the simulated cohort --------------------------------------
    labeled_samples = []
    unlabeled_samples = []

    # Each subject is labeled with probability 0.311 and is drawn as a single
    # row from the matching source data; the same source row may be drawn for
    # several subjects. With no random_state given, the draws rely on NumPy's
    # global random state seeded at the top of the script.
    for _ in range(n_subjects):
        S_i = np.random.binomial(1, 0.311)

        if S_i == 1:
            sample = pitt_labeled.sample(1).copy().reset_index(drop=True)
            labeled_samples.append(sample)
        else:
            sample = hopkins_unlabeled.sample(1).copy().reset_index(drop=True)
            unlabeled_samples.append(sample)

    labeled_data = (
        pd.concat(labeled_samples, ignore_index=True)
        if labeled_samples else pd.DataFrame()
    )
    unlabeled_data = (
        pd.concat(unlabeled_samples, ignore_index=True)
        if unlabeled_samples else pd.DataFrame()
    )

    # The samples are written out and read back below, so later steps see
    # the data as parsed from CSV.
    labeled_data.to_csv('labeled_sim.csv', index=False)
    unlabeled_data.to_csv('unlabeled_sim.csv', index=False)

    print("Files saved: labeled_sim.csv and unlabeled_sim.csv")

    # Step 2: Generate Y_star using GPT models --------------------------------
    hopkins_unlabeled_cleaned = pd.read_csv('unlabeled_sim.csv')

    # NOTE(review): a missing education value does not skip an unlabeled row,
    # whereas it does skip a labeled row below -- confirm this is intended.
    predicted_Y_XZ, predicted_Y_XZU = _predict_with_and_without_score(
        hopkins_unlabeled_cleaned, 'Sex', 'Race', 'Education', require_education=False
    )
    hopkins_unlabeled_cleaned['Y_star_XZ'] = predicted_Y_XZ
    hopkins_unlabeled_cleaned['Y_star_XZU'] = predicted_Y_XZU

    hopkins_unlabeled_cleaned.to_csv('unlabeled_sim_with_predictions_mismatch.csv', index=False)
    print("Predictions saved to unlabeled_sim_with_predictions_mismatch.csv")

    # Labeled sample: recode sex and race to text before prompting.
    pitt_labeled_cleaned = pd.read_csv('labeled_sim.csv')
    pitt_labeled_cleaned['sex'] = pitt_labeled_cleaned['sex'].replace({0: 'female', 1: 'male'})
    pitt_labeled_cleaned['race'] = pitt_labeled_cleaned['race'].replace({1: 'white', 2: 'black'})

    predicted_Y_XZ, predicted_Y_XZU = _predict_with_and_without_score(
        pitt_labeled_cleaned, 'sex', 'race', 'educ', require_education=True
    )
    pitt_labeled_cleaned['Y_star_XZ'] = predicted_Y_XZ
    pitt_labeled_cleaned['Y_star_XZU'] = predicted_Y_XZU

    pitt_labeled_cleaned.to_csv('labeled_sim_with_predictions_mismatch.csv', index=False)
    print("Predictions saved to labeled_sim_with_predictions_mismatch.csv")

    # Step 3: Compute weights --------------------------------------------------
    labeled = pd.read_csv('labeled_sim_with_predictions_mismatch.csv')
    unlabeled = pd.read_csv('unlabeled_sim_with_predictions_mismatch.csv')

    # Missing labeled predictions are replaced by 1.
    # NOTE(review): missing unlabeled predictions are left as NaN here and
    # end up as a pseudo-label of 0 in Step 5 -- confirm this is intended.
    labeled['Y_star_XZ'] = labeled['Y_star_XZ'].fillna(1)
    labeled['Y_star_XZU'] = labeled['Y_star_XZU'].fillna(1)

    # Align column names between the two data sets.
    labeled = labeled.rename(columns={'basedx': 'Y', 'X1': 'Sex'})
    unlabeled = unlabeled.rename(columns={'Sex': 'sex', "Education": "educ"})

    # Encode sex as 0/1 in both data sets.
    labeled['sex'] = labeled['sex'].map({'female': 0, 'male': 1}).astype(int)
    unlabeled['sex'] = unlabeled['sex'].map({'female': 0, 'male': 1}).astype(int)

    covariate_columns = ['sex', 'educ', 'ttr_category', 'filler_ratio_category', 'mean_utterance_length']
    # Explicit copy: X is assigned into below and must not alias `labeled`.
    X = labeled[covariate_columns].copy()
    X_un = unlabeled[covariate_columns]
    # X_new stacks labeled rows first, then unlabeled rows.
    X_new = pd.concat([X, X_un], axis=0, ignore_index=True)
    Y = labeled['Y']
    Y_star_XZ = labeled['Y_star_XZ']
    Y_star_XZU = labeled['Y_star_XZU']

    # Give 'ttr_category' the same category set in X and X_new so that both
    # frames produce the same dummy columns for it.
    cat_var = 'ttr_category'
    combined_categories = pd.concat([X[cat_var], X_new[cat_var]]).dropna().unique()
    categorical_dtype = pd.CategoricalDtype(categories=combined_categories, ordered=True)
    X[cat_var] = X[cat_var].astype(categorical_dtype)
    X_new[cat_var] = X_new[cat_var].astype(categorical_dtype)
    X = pd.get_dummies(X, drop_first=True)
    X_new = pd.get_dummies(X_new, drop_first=True)

    try:
        result_df_xz = estimate_cov_and_var(X, Y_star_XZ, Y, X_new, n_estimators=100, random_state=0)
        result_df_xzu = estimate_cov_and_var(X, Y_star_XZU, Y, X_new, n_estimators=100, random_state=0)
    except Exception as e:
        # Best effort: an iteration whose weights cannot be estimated
        # contributes no row to any of the result tables.
        print(f"Error encountered: {e}. Skipping to next iteration.")
        continue

    result_df_xzu = result_df_xzu.rename(columns={'omega_xz_hat': 'omega_xzu_hat'})

    print('weights calculated for XZ')
    print(result_df_xz.head())
    print('weights calculated for XZU')
    print(result_df_xzu.head())

    # Step 3.5: identify region A ----------------------------------------------
    Y_star_XZ = labeled['Y_star_XZ']
    Y_star_XZU = labeled['Y_star_XZU']
    honest_all, A_hat, message = compute_honest_region(Y_star_XZ, Y_star_XZU, X)
    print("Honest All:", honest_all)
    print("Message:", message)
    if not A_hat.empty:
        print("Identified Honest Region(s):")
        print(A_hat.head())
    else:
        print("No regions returned.")

    # Step 4: Generate pseudo outcomes based on weights -------------------------
    pseudo_labels = pseudo_label_generation(result_df_xz, result_df_xzu, labeled, unlabeled, honest_all, A_hat)

    # pseudo_labels and X_new are both ordered labeled-first, so they can be
    # joined side by side by position.
    Y_tilde_column = pd.Series(pseudo_labels, name='Y_tilde')
    final_df = pd.concat([Y_tilde_column, X_new.reset_index(drop=True)], axis=1)

    print("--- Pseudo-labels created and combined with X ---")
    print(final_df.head())

    # Step 5: Estimate beta -----------------------------------------------------
    # Proposed method: OLS of the pseudo-labels on the covariates.
    predictors = final_df.drop(columns=['Y_tilde'])
    predictors_numeric = pd.get_dummies(predictors, drop_first=True)
    if predictors_numeric.isnull().any().any():
        predictors_numeric = predictors_numeric.fillna(0)
    predictors_numeric = predictors_numeric.astype(float)
    Y_tilde = final_df['Y_tilde'].fillna(0).astype(float)
    ols_model = sm.OLS(Y_tilde, sm.add_constant(predictors_numeric)).fit()
    print(ols_model.summary())

    covariate_coefs = ols_model.params.drop('const')
    beta_hats_proposed = pd.concat([beta_hats_proposed, covariate_coefs.to_frame().T], ignore_index=True)

    # Method: labeled data only (logistic regression).
    # NOTE(review): this model is fit without an intercept column, unlike the
    # other two methods -- confirm this is intended.
    X_labeled_numeric = pd.get_dummies(X, drop_first=True)
    X_labeled_numeric = X_labeled_numeric.apply(pd.to_numeric, errors='coerce').fillna(0)
    # Drop columns that take a single value in this sample.
    X_labeled_numeric = X_labeled_numeric.loc[:, X_labeled_numeric.nunique() > 1]
    X_labeled_numeric = X_labeled_numeric.astype(float)
    Y_labeled_numeric = pd.to_numeric(labeled['Y'], errors='coerce').fillna(0)
    Y_labeled_numeric = Y_labeled_numeric.astype(int).values.ravel()
    try:
        logit_model = sm.Logit(Y_labeled_numeric, X_labeled_numeric).fit(disp=0)
        print(logit_model.summary())
        logit_coefs_naive = logit_model.params
        beta_hats_naive = pd.concat([beta_hats_naive, logit_coefs_naive.to_frame().T], ignore_index=True)

        marginal_effects = logit_model.get_margeff(at='mean').summary()
        print(marginal_effects)

    except Exception as e:
        # Record a row of NaN so the failed iteration stays visible in the output.
        print(f"Model fitting failed: {e}")
        logit_model = np.nan
        marginal_effects = np.nan
        logit_coefs_nan = pd.Series(np.nan, index=X_labeled_numeric.columns)
        beta_hats_naive = pd.concat([beta_hats_naive, logit_coefs_nan.to_frame().T], ignore_index=True)

    # Method: SSL (label spreading, then logistic regression on its labels).
    try:
        imputer = SimpleImputer(strategy='mean')
        combined_X_encoded = pd.DataFrame(imputer.fit_transform(X_new),
                                          columns=X_new.columns)
        n_unlabeled_rows = len(combined_X_encoded) - len(labeled)
        Y_labeled = labeled['Y']

        # Unlabeled rows are marked with -1.
        Y_semi = np.concatenate([Y_labeled, -1 * np.ones(n_unlabeled_rows)])
        ssl_model = LabelSpreading(kernel='rbf', alpha=0.2)
        ssl_model.fit(combined_X_encoded, Y_semi)

        # Labels assigned to every row by the semi-supervised model.
        Y_ssl_pred = ssl_model.transduction_
        X_ssl = sm.add_constant(combined_X_encoded)
        logit_model_SSL = sm.Logit(Y_ssl_pred, X_ssl).fit(disp=0)

        print("Estimated beta coefficients from semi-supervised logistic regression:")
        print(logit_model_SSL.summary())

        ssl_coefs = logit_model_SSL.params.drop('const')
        beta_hats_ssl = pd.concat([beta_hats_ssl, ssl_coefs.to_frame().T], ignore_index=True)

    except Exception as e:
        # Same policy as the labeled-only fit: one failed fit must not abort
        # the run and discard the estimates collected so far.
        print(f"Semi-supervised model fitting failed: {e}")
        logit_model_SSL = np.nan
        ssl_coefs_nan = pd.Series(np.nan, index=X_new.columns)
        beta_hats_ssl = pd.concat([beta_hats_ssl, ssl_coefs_nan.to_frame().T], ignore_index=True)




# Write each method's coefficient table to its own CSV file.
_result_tables = {
    'beta_hats_simulation_proposed.csv': beta_hats_proposed,
    'beta_hats_simulation_naive.csv': beta_hats_naive,
    'beta_hats_simulation_ssl.csv': beta_hats_ssl,
}
for _csv_name, _coef_table in _result_tables.items():
    _coef_table.to_csv(_csv_name, index=False)

# Echo the three tables in the same order.
print("\nSimulations complete. Results saved:")
for _coef_table in _result_tables.values():
    print(_coef_table)
