# -*- coding: utf-8 -*-


import numpy as np
import pandas as pd
import openai
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize
import statsmodels.api as sm
from scipy.special import expit
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.semi_supervised import LabelSpreading
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# --- Script setup: credentials, input data, RNG seed and result containers ---

# Set your OpenAI API key.
# The key is taken from the OPENAI_API_KEY environment variable when it is set,
# so the secret does not have to be written into this file. The original
# placeholder is kept as the fallback for anyone who edits the script by hand.
import os  # only needed here for the environment lookup

openai.api_key = os.environ.get('OPENAI_API_KEY', '[API KEY]')  # insert your API key

# Source datasets: the labeled cohort (Pitt) and the unlabeled cohort (Hopkins).
pitt_labeled = pd.read_csv('pitt_labeled_with_nlp.csv')
hopkins_unlabeled = pd.read_csv('hopkins_unlabeled_with_nlp.csv')

# Drop specific columns from Hopkins data.
# errors='ignore' tolerates a column that is missing or named slightly differently.
hopkins_unlabeled_cleaned = hopkins_unlabeled.drop(
    columns=[
        'File',
        'Date.of.Audio..taped.Language.Sample',
        'X1Date.of.Testing',
    ],
    errors='ignore',
)

# Drop specific columns from Pitt data
pitt_labeled_cleaned = pitt_labeled.drop(
    columns=[
        'id',
        'idate',
        'dx1',
        'dx2',
        'dx3',
    ],
    errors='ignore',
)

# Generic result containers (not referenced again in the visible part of this file).
beta_hats = []
labeled_only_params = []

# Fix the NumPy global RNG so the simulated draws are reproducible.
np.random.seed(32)

# How many subjects to generate per iteration, and how many iterations to run
n_subjects = 300
n_iter = 100

# One row of coefficient estimates is appended per iteration and per method.
beta_hats_proposed = pd.DataFrame()
beta_hats_naive = pd.DataFrame()
beta_hats_ssl = pd.DataFrame()

"""## (1) LLM prediction"""

# Define helper function to generate prediction
def zero_shot_predict(Sex, Race, Education, ttr_category, filler_ratio_category, mean_utterance_length, Transcript,
                      model="gpt-4", max_chars=8000):
    """Ask the chat model for a 0/1 Alzheimer's Disease label for one subject.

    Parameters:
    Sex, Race, Education: Demographic values interpolated into the prompt as-is.
    ttr_category: Lexical diversity category interpolated into the prompt.
    filler_ratio_category: Filler word usage category interpolated into the prompt.
    mean_utterance_length: Accepted for call compatibility; it is not used in the prompt.
    Transcript: Picture-description transcript. Non-string values are converted
        with str() (None becomes an empty string) before truncation.
    model (str): Chat model name passed to the API.
    max_chars (int): Maximum number of transcript characters sent to the model.

    Returns:
    int or None: 0 or 1 when the reply contains exactly one of those digits;
    None when the reply is anything else or when the request raises.
    """
    # Coerce before measuring: a numeric or None transcript would otherwise raise
    # TypeError here, outside the try block, and abort the caller's loop.
    Transcript = "" if Transcript is None else str(Transcript)

    # Truncate transcript if too long
    Transcript = Transcript[:max_chars]

    prompt = f"""You are a medical assistant trained to assess Alzheimer's Disease (AD) risk using both structured patient data and natural language transcripts of picture description tasks. Given a patient’s demographic information, two NLP-derived categories (lexical diversity and filler word usage), and their transcript, decide whether they are likely to have Alzheimer’s Disease (label as 1) or not (label as 0). Your prediction should rely primarily on semantic coherence, richness of description, sentence structure, and topic maintenance in the transcripts, supported by the NLP-derived categories.

Focus especially on:

- Semantic richness: Are objects and actions well described?
- Syntactic fluency: Are sentences grammatically structured?
- Discourse organization: Is the description cohesive, progressing logically?
- Error patterns: Do you see word-finding difficulty, repetition, or vagueness?

Interpretation of NLP-derived categories:
- Lexical Diversity Category (Type-Token Ratio):
  - High lexical diversity: Typical of cognitively healthy individuals.
  - Moderate lexical diversity: Mildly reduced; possibly normal aging.
  - Low-moderate lexical diversity: Suggestive of mild cognitive impairment.
  - Low lexical diversity: Often indicative of Alzheimer's or dementia.

- Filler Word Usage Category:
  - Normal: Typical cognitive function.
  - Mildly elevated: Possible mild cognitive changes.
  - Elevated: Indicative of cognitive impairment or Alzheimer's.

Patient Information:
- Sex: {Sex}
- Race: {Race}
- Education (years): {Education}
- Lexical Diversity Category: {ttr_category}
- Filler Word Usage Category: {filler_ratio_category}
- Speech excerpt: {Transcript}

Return your answer as 0 (unlikely AD) or 1 (likely AD). No explanation is needed.

Answer:"""

    # Best-effort boundary: any failure of the request or of reading the reply
    # is reported and turned into a missing prediction, so one bad row does not
    # stop a long batch.
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=1
        )
        prediction_raw = response.choices[0].message.content.strip()

        # Only keep 0 or 1
        prediction_clean = ''.join(c for c in prediction_raw if c in ('0', '1'))
        if prediction_clean in ('0', '1'):
            return int(prediction_clean)

        print(f"Unexpected model output: {prediction_raw}")
        return None

    except Exception as e:
        print(f"Error: {e}")
        return None

"""## (2) Weight computation"""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def estimate_cov_and_var(X, Y_star_xz, Y, X_new=None, n_estimators=100, random_state=0):
    """
    Random-forest estimates of the conditional mean of Y, the conditional
    covariance of (Y_star_xz, Y), the conditional variance of Y_star_xz, and
    their ratio (the weighting function).

    All forests are trained on X. Residuals are taken in-sample (predictions on
    the training X); the returned estimates are evaluated on X_new.

    Parameters:
    X (DataFrame or ndarray): Training covariates (n_samples x n_features).
    Y_star_xz (array-like): Training values of Y_star_xz.
    Y (array-like): Training values of Y.
    X_new (DataFrame or ndarray, optional): Covariates to evaluate on; X is used when None.
    n_estimators (int): Number of trees in every forest.
    random_state (int): Seed given to every forest.

    Returns:
    DataFrame: The evaluation covariates as columns X1..Xp, followed by
    gamma_xz_hat, nu_xz_hat, m_x_hat and omega_xz_hat.
    """
    train_X = np.asarray(X)
    surrogate = np.asarray(Y_star_xz)
    outcome = np.asarray(Y)
    eval_X = train_X if X_new is None else np.asarray(X_new)

    def _fit_forest(target):
        # Every regression uses the same forest settings and the same training covariates.
        forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        forest.fit(train_X, target)
        return forest

    # Conditional means E[Y_star_xz | X] and E[Y | X]
    surrogate_forest = _fit_forest(surrogate)
    outcome_forest = _fit_forest(outcome)

    # In-sample residuals around the two conditional means
    surrogate_resid = surrogate - surrogate_forest.predict(train_X)
    outcome_resid = outcome - outcome_forest.predict(train_X)

    # gamma_xz(x) = Cov(Y_star_xz, Y | X=x): regress the residual product on X
    covariance_forest = _fit_forest(surrogate_resid * outcome_resid)
    # nu_xz(x) = Var(Y_star_xz | X=x): regress the squared residual on X
    variance_forest = _fit_forest(surrogate_resid ** 2)

    feature_names = [f'X{j}' for j in range(1, eval_X.shape[1] + 1)]
    results_df = pd.DataFrame(eval_X, columns=feature_names)
    results_df['gamma_xz_hat'] = covariance_forest.predict(eval_X)
    results_df['nu_xz_hat'] = variance_forest.predict(eval_X)
    # m(x) = E[Y | X=x] evaluated on the prediction covariates
    results_df['m_x_hat'] = outcome_forest.predict(eval_X)

    # omega = gamma / nu; undefined ratios (division by zero) are set to 0
    ratio = results_df['gamma_xz_hat'] / results_df['nu_xz_hat']
    results_df['omega_xz_hat'] = ratio.replace([np.inf, -np.inf], 0).fillna(0)

    return results_df

# # test
# labeled = pd.read_csv('labeled_sim_with_predictions.csv')
# labeled['Y_star'] = labeled['Y_star'].fillna(1)

# labeled = labeled.rename(columns={'basedx': 'Y', 'X1': 'Sex', 'Y_star': 'Y_star'})
# labeled['sex'] = labeled['sex'].map({'female': 0, 'male': 1}).astype(int)

# unlabeled = pd.read_csv('unlabeled_sim_with_predictions.csv')
# unlabeled['Sex'] = unlabeled['Sex'].map({'female': 0, 'male': 1}).astype(int)
# unlabeled = unlabeled.rename(columns={'Sex':'sex', 'Education': 'educ'})


# X = labeled[['sex', 'educ','ttr_category','filler_ratio_category']]
# Y = labeled['Y']
# Y_star_xz = labeled['Y_star']
# X_un = unlabeled[['sex', 'educ','ttr_category','filler_ratio_category']]
# X_new = pd.concat([X, X_un], axis=0, ignore_index=True)

# ## format categorical variables
# cat_var = 'ttr_category'  # for your case
# # Combine categories from X and X_new, excluding NaN explicitly
# combined_categories = pd.concat([X[cat_var], X_new[cat_var]]).dropna().unique()
# # Create a categorical type without NaN
# categorical_dtype = pd.CategoricalDtype(categories=combined_categories, ordered=True)
# # Set the categorical type explicitly, handling NaNs gracefully
# X[cat_var] = X[cat_var].astype(categorical_dtype)
# X_new[cat_var] = X_new[cat_var].astype(categorical_dtype)


# X = pd.get_dummies(X, drop_first=True)
# X_new = pd.get_dummies(X_new, drop_first=True)

# result_df = estimate_cov_and_var(X, Y_star_xz, Y, X_new, n_estimators=100, random_state=0)

"""## Simulation"""

# Column names read from each dataset, keyed by the zero_shot_predict argument
# they are passed as.
_UNLABELED_COLUMNS = {
    'Sex': 'Sex',
    'Race': 'Race',
    'Education': 'Education',
    'ttr_category': 'ttr_category',
    'filler_ratio_category': 'filler_ratio_category',
    'mean_utterance_length': 'mean_utterance_length',
    'Transcript': 'Transcripts',
}
_LABELED_COLUMNS = {**_UNLABELED_COLUMNS, 'Sex': 'sex', 'Race': 'race', 'Education': 'educ'}

# Arguments that must be non-missing before the model is queried. Education is
# only required for the labeled data, as in the original per-dataset checks.
_REQUIRED_UNLABELED = ('Transcript', 'Sex', 'Race', 'ttr_category',
                       'filler_ratio_category', 'mean_utterance_length')
_REQUIRED_LABELED = _REQUIRED_UNLABELED + ('Education',)

# Covariates used for the weights and for all three estimators.
_FEATURE_COLUMNS = ['sex', 'educ', 'ttr_category', 'filler_ratio_category', 'mean_utterance_length']


def _predict_rows(frame, column_map, required):
    """Return one zero_shot_predict result per row of frame (None where a required value is missing)."""
    predictions = []
    for _, row in frame.iterrows():
        # Absent columns fall back to '' for the transcript and 'Unknown' otherwise.
        args = {
            arg: row.get(col, '' if arg == 'Transcript' else 'Unknown')
            for arg, col in column_map.items()
        }
        if any(pd.isna(args[arg]) for arg in required):
            predictions.append(None)
            continue
        predictions.append(zero_shot_predict(**args))
        time.sleep(1)  # pause after every API call (presumably rate limiting)
    return predictions


def _append_row(frame, coefs):
    """Return frame with the coefficient Series coefs appended as one new row."""
    return pd.concat([frame, coefs.to_frame().T], ignore_index=True)


for iteration in range(n_iter):
    print(f"\n--- Simulation Iteration: {iteration + 1} ---")

    # Step 1: Draw subjects ----------------------------------------------------
    # Each subject is labeled with probability 0.311 and is drawn (one row at a
    # time, so with replacement) from the matching source dataset.
    labeled_samples = []
    unlabeled_samples = []
    for _ in range(n_subjects):
        if np.random.binomial(1, 0.311) == 1:
            labeled_samples.append(pitt_labeled.sample(1))
        else:
            unlabeled_samples.append(hopkins_unlabeled.sample(1))

    # An empty group cannot be written and re-read as CSV, and none of the
    # estimators below is defined for it.
    if not labeled_samples or not unlabeled_samples:
        print("Empty labeled or unlabeled sample drawn. Skipping to next iteration.")
        continue

    labeled_data = pd.concat(labeled_samples, ignore_index=True)
    unlabeled_data = pd.concat(unlabeled_samples, ignore_index=True)

    # Save the datasets
    labeled_data.to_csv('labeled_sim.csv', index=False)
    unlabeled_data.to_csv('unlabeled_sim.csv', index=False)
    print("Files saved: labeled_sim.csv and unlabeled_sim.csv")

    # Step 2: Generate Y_star using GPT models ---------------------------------
    hopkins_unlabeled_cleaned = pd.read_csv('unlabeled_sim.csv')
    hopkins_unlabeled_cleaned['basedx'] = _predict_rows(
        hopkins_unlabeled_cleaned, _UNLABELED_COLUMNS, _REQUIRED_UNLABELED
    )
    hopkins_unlabeled_cleaned.to_csv('unlabeled_sim_with_predictions.csv', index=False)
    print("Predictions saved to unlabeled_sim_with_predictions.csv")

    pitt_labeled_cleaned = pd.read_csv('labeled_sim.csv')
    # The prompt is given words rather than numeric codes for sex and race.
    pitt_labeled_cleaned['sex'] = pitt_labeled_cleaned['sex'].replace({0: 'female', 1: 'male'})
    pitt_labeled_cleaned['race'] = pitt_labeled_cleaned['race'].replace({1: 'white', 2: 'black'})
    pitt_labeled_cleaned['Y_star'] = _predict_rows(
        pitt_labeled_cleaned, _LABELED_COLUMNS, _REQUIRED_LABELED
    )
    pitt_labeled_cleaned.to_csv('labeled_sim_with_predictions.csv', index=False)
    print("Predictions saved to labeled_sim_with_predictions.csv")

    # Step 3: Compute weights --------------------------------------------------
    labeled = pd.read_csv('labeled_sim_with_predictions.csv')
    unlabeled = pd.read_csv('unlabeled_sim_with_predictions.csv')

    # Missing model predictions in the labeled data are treated as 1.
    # NOTE(review): missing predictions in the unlabeled data are NOT filled here;
    # they propagate as NaN into the pseudo-outcome and become 0 in Step 5.
    # Confirm that this asymmetry is intended.
    labeled['Y_star'] = labeled['Y_star'].fillna(1)

    # Rename columns so both frames share 'sex', 'educ' and 'Y_star'
    labeled = labeled.rename(columns={'basedx': 'Y', 'X1': 'Sex'})
    unlabeled = unlabeled.rename(columns={'basedx': 'Y_star', 'Sex': 'sex', 'Education': 'educ'})

    # Clean types
    labeled['sex'] = labeled['sex'].map({'female': 0, 'male': 1}).astype(int)
    unlabeled['sex'] = unlabeled['sex'].map({'female': 0, 'male': 1}).astype(int)

    n = labeled.shape[0]
    N = unlabeled.shape[0]

    # Stack labeled rows first, then unlabeled rows; every later step relies on
    # this order.
    X_new = pd.concat(
        [labeled[_FEATURE_COLUMNS], unlabeled[_FEATURE_COLUMNS]],
        axis=0, ignore_index=True,
    )

    # Format categorical variables. Categories are taken from the combined data
    # and sorted, so the level dropped by drop_first (the reference level) does
    # not depend on row order and the coefficient columns stay comparable across
    # iterations. filler_ratio_category is only converted when it is non-numeric,
    # so a numerically coded column keeps being used as a number.
    categorical_cols = ['ttr_category']
    if not pd.api.types.is_numeric_dtype(X_new['filler_ratio_category']):
        categorical_cols.append('filler_ratio_category')
    for cat_var in categorical_cols:
        levels = sorted(X_new[cat_var].dropna().unique(), key=str)
        X_new[cat_var] = X_new[cat_var].astype(
            pd.CategoricalDtype(categories=levels, ordered=True)
        )
    X_new = pd.get_dummies(X_new, drop_first=True)

    # Slice the labeled design from the jointly encoded frame so that X and
    # X_new always have identical columns.
    X = X_new.iloc[:n].reset_index(drop=True)

    Y = labeled['Y']
    Y_star_xz = labeled['Y_star']

    try:
        result_df = estimate_cov_and_var(X, Y_star_xz, Y, X_new, n_estimators=100, random_state=0)
    except Exception as e:
        print(f"Error encountered: {e}. Skipping to next iteration.")
        continue

    print('weights calculated')
    print(result_df.head())

    # Step 4: Generate pseudo outcomes based on weights ------------------------
    # Split result_df back into its labeled (first n rows) and unlabeled parts.
    result_df_labeled = result_df.iloc[:n].reset_index(drop=True)
    result_df_unlabeled = result_df.iloc[n:].reset_index(drop=True)

    m_x_hat_labeled = result_df_labeled['m_x_hat'].values
    omega_xz_hat_labeled = result_df_labeled['omega_xz_hat'].values
    m_x_hat_unlabeled = result_df_unlabeled['m_x_hat'].values
    omega_xz_hat_unlabeled = result_df_unlabeled['omega_xz_hat'].values

    Y_star_labeled = labeled['Y_star'].values
    Y_star_unlabeled = unlabeled['Y_star'].values
    Y_labeled_values = labeled['Y'].values

    # f_hat = m(x) + omega(x) * (Y_star - m(x)) for both groups
    f_hat_unlabeled = m_x_hat_unlabeled + omega_xz_hat_unlabeled * (Y_star_unlabeled - m_x_hat_unlabeled)
    f_hat_labeled = m_x_hat_labeled + omega_xz_hat_labeled * (Y_star_labeled - m_x_hat_labeled)

    # Pseudo-labels in the same row order as X_new: labeled first, then unlabeled.
    pseudo_labels = np.zeros(N + n)
    pseudo_labels[:n] = Y_labeled_values + (N / n) * (Y_labeled_values - f_hat_labeled)
    pseudo_labels[n:] = f_hat_unlabeled

    final_df = pd.concat(
        [pd.Series(pseudo_labels, name='Y_tilde'), X_new.reset_index(drop=True)],
        axis=1,
    )

    # Check the results
    print("--- Pseudo-labels created and combined with X ---")
    print(final_df.head())

    # Step 5: Estimate beta ----------------------------------------------------
    # Proposed method: OLS of the pseudo-outcome on the covariates.
    predictors = final_df.drop(columns=['Y_tilde'])
    predictors_numeric = pd.get_dummies(predictors, drop_first=True)
    if predictors_numeric.isnull().any().any():
        predictors_numeric = predictors_numeric.fillna(0)
    predictors_numeric = predictors_numeric.astype(float)
    Y_tilde = final_df['Y_tilde'].fillna(0).astype(float)
    ols_model = sm.OLS(Y_tilde, sm.add_constant(predictors_numeric)).fit()
    print(ols_model.summary())

    covariate_coefs = ols_model.params.drop('const')
    beta_hats_proposed = _append_row(beta_hats_proposed, covariate_coefs)

    # Method: using labeled data only (logistic regression).
    X_labeled_numeric = pd.get_dummies(X, drop_first=True)
    X_labeled_numeric = X_labeled_numeric.apply(pd.to_numeric, errors='coerce').fillna(0)
    # Columns that are constant within the labeled draw are dropped.
    X_labeled_numeric = X_labeled_numeric.loc[:, X_labeled_numeric.nunique() > 1]
    # NOTE(review): this model is fitted WITHOUT an intercept, unlike the other
    # two methods; the original add_constant call is kept below, commented out.
    # X_labeled_numeric = sm.add_constant(X_labeled_numeric, has_constant='add')
    X_labeled_numeric = X_labeled_numeric.astype(float)
    Y_labeled_numeric = pd.to_numeric(labeled['Y'], errors='coerce').fillna(0)
    Y_labeled_numeric = Y_labeled_numeric.astype(int).values.ravel()

    try:
        logit_model = sm.Logit(Y_labeled_numeric, X_labeled_numeric).fit(disp=0)
        logit_coefs_naive = logit_model.params
    except Exception as e:
        print(f"Naive logit fit failed: {e}")
        logit_model = np.nan  # Assign NA equivalent to the model
        marginal_effects = np.nan  # Marginal effects NA
        logit_coefs_naive = pd.Series(np.nan, index=X_labeled_numeric.columns)
    else:
        # Reporting is kept apart from the fit: a failure while printing the
        # summary or the marginal effects must not add a second row for this
        # iteration.
        try:
            print(logit_model.summary())
            marginal_effects = logit_model.get_margeff(at='mean').summary()
            print(marginal_effects)
        except Exception as e:
            print(f"Naive logit reporting failed: {e}")
            marginal_effects = np.nan
    # Exactly one row per iteration, so rows line up across the three result tables.
    beta_hats_naive = _append_row(beta_hats_naive, logit_coefs_naive)

    # Method: SSL (label spreading followed by logistic regression).
    # The whole step is guarded so that one failed fit records a NaN row
    # instead of aborting the run before any results are written.
    try:
        imputer = SimpleImputer(strategy='mean')
        combined_X_encoded = pd.DataFrame(imputer.fit_transform(X_new), columns=X_new.columns)

        # Prepare labels explicitly (unlabeled data labeled as -1)
        Y_semi = np.concatenate([labeled['Y'], -1 * np.ones(N)])
        ssl_model = LabelSpreading(kernel='rbf', alpha=0.2)
        ssl_model.fit(combined_X_encoded, Y_semi)

        # Labels assigned to every row by the semi-supervised method
        Y_ssl_pred = ssl_model.transduction_
        X_ssl = sm.add_constant(combined_X_encoded)
        logit_model_SSL = sm.Logit(Y_ssl_pred, X_ssl).fit(disp=0)

        print("Estimated beta coefficients from semi-supervised logistic regression:")
        print(logit_model_SSL.summary())

        ssl_coefs = logit_model_SSL.params.drop('const')
    except Exception as e:
        print(f"SSL estimation failed: {e}")
        ssl_coefs = pd.Series(np.nan, index=X_new.columns)
    beta_hats_ssl = _append_row(beta_hats_ssl, ssl_coefs)


# Save results
beta_hats_proposed.to_csv('beta_hats_simulation_proposed.csv', index=False)
beta_hats_naive.to_csv('beta_hats_simulation_naive.csv', index=False)
beta_hats_ssl.to_csv('beta_hats_simulation_ssl.csv', index=False)

print("\nSimulations complete. Results saved:")
print(beta_hats_proposed)
print(beta_hats_naive)
print(beta_hats_ssl)