import numpy as np

from other_algo import run_experiment
from FJRnew import run_FJR_mixed
#from appr_FJR import run_appr_FJR
import os
import pandas as pd
import requests
import sklearn.cluster
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn_extra.cluster import KMedoids
from statistics import mean
from scipy.spatial.distance import pdist, squareform
from FGC_center_included import run_FGC_CR
from core_mixed_cost import run_core_mixed
from measures import within_cluster
from measures import kmeans_objective
from measures import kmedoids_objective
from Semi_ball_growing import refine_FGC_assignments
import math
import random

import os
import gurobipy as gp





# ---------------------------------------------------------------------------
# Experiment selection
# ---------------------------------------------------------------------------
# `data_set` picks which branch of the script runs: 'Adults', 'Iris' or
# 'Diabetes'.  `dist_function` is the metric name handed to scipy's `pdist`.
dist_function = 'euclidean'
data_set = 'Iris'

print(data_set)

# Iris scenarios, each a (scenario_name, k_list, lambda_list) tuple:
#   * 'k_sweep'      -- k = 5..25 at a fixed lambda of 0.5
#   * 'lambda_sweep' -- lambda = 0.1..0.9 at a fixed k of 15
iris_settings = [
    ('k_sweep', [*range(5, 26)], [0.5]),
    ('lambda_sweep', [15], [tenth / 10 for tenth in range(1, 10)]),
]

# For a quick single-configuration run, replace the list above with:
#iris_settings = [
#    ('trial', [5], [0.5])
#]

if data_set == 'Iris':
    # Iris experiment.
    #
    # For every (k, lambda) pair listed in `iris_settings` this branch scores
    # four clusterings -- Greedy Capture, its Semi-Ball-Growing refinement, and
    # `it` repetitions each of the k-means and k-medoids baselines -- and
    # writes one CSV per pair.  Each CSV has 8 rows (mean row followed by a
    # 1.96 * std / sqrt(count) row, for GC, SB, k-means, k-medoids in that
    # order) and 6 columns: k, FJR score, core score, within-cluster value,
    # k-means objective, k-medoids objective.

    # Imported once for the whole branch instead of on every baseline
    # repetition.
    from scipy.spatial.distance import cdist

    file_path = os.path.join('iris', 'iris.data')
    # The file has no header row; blank lines and '#' comments are skipped.
    df = pd.read_csv(file_path, header=None, comment='#', skip_blank_lines=True)
    # Every column except the last one (the label) is used as a feature.
    data = df.iloc[:, :-1].values.astype(float)
    n = data.shape[0]

    # Condensed pairwise distances and the equivalent square matrix.
    distance_vector = pdist(data, metric=dist_function)
    distance_matrix = squareform(distance_vector)
    r = 0       # row used in the GC / SB result tables (they hold one trial)
    gl_it = 1   # number of trials (rows) in the GC / SB result tables
    it = 20     # number of baseline repetitions per k
    # Forwarded unchanged to run_FJR_mixed / run_core_mixed; its meaning is
    # defined by those functions.
    theta_ = 4

    # The output directory is the same for every file, so create it once.
    os.makedirs('Iris_Results_1', exist_ok=True)

    for scenario_name, k_list, lambda_list in iris_settings:
        for k in k_list:
            # Greedy Capture is run once per k.
            # NOTE(review): GC_M / GC_C / GC_R look like assignment, centers
            # and radii -- confirm against FGC_center_included.
            GC_M, GC_C, GC_R = run_FGC_CR(n,k,distance_matrix)

            # Metrics independent of lambda for Greedy Capture
            GC_within = within_cluster(distance_vector, GC_M)
            GC_kmeans = kmeans_objective(distance_vector, GC_M)
            GC_kmedoids = kmedoids_objective(distance_vector, GC_M)

            # Baseline clusterings and their lambda-independent quality
            # metrics are computed once per k (as the Diabetes and Adults
            # branches do) rather than once per lambda.
            baseline_pairs = []
            baseline_quality = []
            for _ in range(it):
                km_labels, med_labels, km_centers, medoids_centers = run_experiment(data, k, distance_matrix, dist_function)
                # Map each returned center to the index of its nearest data
                # point (cdist's default Euclidean metric).
                km_C = [np.argmin(cdist([c], data).flatten()) for c in km_centers]
                med_C = [np.argmin(cdist([c], data).flatten()) for c in medoids_centers]
                baseline_pairs.append((km_labels, km_C, med_labels, med_C))
                baseline_quality.append((
                    (within_cluster(distance_vector, km_labels),
                     kmeans_objective(distance_vector, km_labels),
                     kmedoids_objective(distance_vector, km_labels)),
                    (within_cluster(distance_vector, med_labels),
                     kmeans_objective(distance_vector, med_labels),
                     kmedoids_objective(distance_vector, med_labels)),
                ))

            for lambda_ in lambda_list:
                print(f"Running experiment with k={k}, lambda={lambda_}")
                # Recompute c0 for each lambda
                q = math.sqrt(-11 * lambda_**2 + 2 * lambda_ + 13) / 6 + (5 * lambda_ - 1) / 6
                c0 = q / lambda_
                print(f"c0: {c0}")

                # Greedy Capture row.
                GC_results=np.zeros((gl_it,6))
                GC_results[r,0]=k
                GC_results[r,1]=run_FJR_mixed(n, k, distance_matrix, GC_M ,theta_, lambda_, GC_C)
                GC_results[r,2]=run_core_mixed(n, k, distance_matrix, GC_M ,theta_, lambda_, GC_C)
                GC_results[r,3] = GC_within
                GC_results[r,4] = GC_kmeans
                GC_results[r,5] = GC_kmedoids

                # Semi-Ball-Growing row: refine the Greedy Capture assignment
                # and score it against the same centers.
                SB_results = np.zeros((gl_it, 6))
                new_GC_M = refine_FGC_assignments(n, k, distance_matrix, GC_M, GC_C, GC_R, lambda_, c0)
                SB_results[r, 0] = k
                SB_results[r, 1] = run_FJR_mixed(n, k, distance_matrix, new_GC_M, theta_, lambda_, GC_C)
                SB_results[r, 2] = run_core_mixed(n, k, distance_matrix, new_GC_M, theta_, lambda_, GC_C)
                SB_results[r, 3] = within_cluster(distance_vector, new_GC_M)
                SB_results[r, 4] = kmeans_objective(distance_vector, new_GC_M)
                SB_results[r, 5] = kmedoids_objective(distance_vector, new_GC_M)

                # One row per baseline repetition.
                temp_means=np.zeros((it,6))
                temp_medoids=np.zeros((it,6))

                for rep, (km_labels, km_C, med_labels, med_C) in enumerate(baseline_pairs):
                    km_quality, med_quality = baseline_quality[rep]

                    temp_means[rep,0] = k
                    temp_means[rep,1] = run_FJR_mixed(n, k, distance_matrix, km_labels,theta_, lambda_, km_C)
                    temp_means[rep,2] = run_core_mixed(n,k, distance_matrix, km_labels,theta_, lambda_, km_C)
                    temp_means[rep,3], temp_means[rep,4], temp_means[rep,5] = km_quality

                    temp_medoids[rep,0] = k
                    temp_medoids[rep,1] = run_FJR_mixed(n, k, distance_matrix, med_labels,  theta_, lambda_, med_C)
                    temp_medoids[rep,2] = run_core_mixed(n,k,distance_matrix, med_labels,theta_, lambda_, med_C)
                    temp_medoids[rep,3], temp_medoids[rep,4], temp_medoids[rep,5] = med_quality

                # For each method: a row of column means followed by a row of
                # 1.96 * (population std) / sqrt(number of rows).
                results = []
                for table, count in ((GC_results, gl_it), (SB_results, gl_it),
                                     (temp_means, it), (temp_medoids, it)):
                    results.append(table.mean(axis=0))
                    results.append(1.96 * table.std(axis=0) / math.sqrt(count))

                np.savetxt(f'Iris_Results_1/{scenario_name}_k={k}-lambda={lambda_}-{data_set}max.csv',
                           results, delimiter=',', fmt='%f')

elif data_set == 'Diabetes':
    # Load Pima Diabetes dataset from local CSV
    script_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(script_dir, 'pimaDiabetes.csv')
    df = pd.read_csv(file_path)
    # All features are numeric; no one-hot encoding needed
    data = df.values.astype(float)
    # Save full data for repeated sampling
    full_data = data
    n_total = full_data.shape[0]


    theta_ = 4
    it = 3  # number of baseline repetitions
    trials = 10

    # Define experiment scenarios: sweep k at lambda=0.5, and sweep lambda at k=15
    diabetes_settings = [
        ('k_sweep',      list(range(5, 26)),      [0.5]),
        ('lambda_sweep', [15],                    [i/10 for i in range(1, 10)])
    ]

    # Dictionary to store all results for expectation and variance computation
    diabetes_metrics = {}

    for scenario_name, k_list, lambda_list in diabetes_settings:
        for r in range(trials):
            print(f"Diabetes trial {r+1}/{trials}")
            # Sample 100 points for this trial
            sample_size = 50
            indices = np.random.choice(n_total, sample_size, replace=False)
            data = full_data[indices, :]
            n = sample_size
            # Optionally save the sampled data for inspection
            np.savetxt(
                os.path.join('pima_Diabetes_Result', f'sampled_trial{r+1}_100_points.txt'),
                data,
                delimiter=',',
                fmt='%f'
            )
            distance_vector = pdist(data, metric=dist_function)
            distance_matrix = squareform(distance_vector)


            for k in k_list:
                # Greedy Capture (only once per k per trial)
                GC_M, GC_C, GC_R = run_FGC_CR(n, k, distance_matrix)
                # Precompute baseline clustering pairs for this trial and k
                baseline_pairs = []
                for _ in range(it):
                    km_labels, med_labels, km_centers, medoids_centers = run_experiment(
                        data, k, distance_matrix, dist_function
                    )
                    from scipy.spatial.distance import cdist
                    km_C = [np.argmin(cdist([c], data).flatten()) for c in km_centers]
                    med_C = [np.argmin(cdist([c], data).flatten()) for c in medoids_centers]
                    baseline_pairs.append((km_labels, km_C, med_labels, med_C))

                # Compute Greedy Capture metrics (independent of lambda)
                w_j   = within_cluster(distance_vector, GC_M)
                km_gc = kmeans_objective(distance_vector, GC_M)
                md_gc = kmedoids_objective(distance_vector, GC_M)

                # Precompute baseline within/obj metrics (independent of lambda)
                base_vals_km_w = []
                base_vals_md_w = []
                for km_labels, km_C, med_labels, med_C in baseline_pairs:
                    bw_km  = within_cluster(distance_vector, km_labels)
                    bkm_km = kmeans_objective(distance_vector, km_labels)
                    bmd_km = kmedoids_objective(distance_vector, km_labels)
                    bw_md  = within_cluster(distance_vector, med_labels)
                    bkm_md = kmeans_objective(distance_vector, med_labels)
                    bmd_md = kmedoids_objective(distance_vector, med_labels)
                    base_vals_km_w.append([bw_km, bkm_km, bmd_km])
                    base_vals_md_w.append([bw_md, bkm_md, bmd_md])
                base_vals_km_w = np.array(base_vals_km_w)
                base_vals_md_w = np.array(base_vals_md_w)
                base_mean_km_w = base_vals_km_w.mean(axis=0)
                base_mean_md_w = base_vals_md_w.mean(axis=0)

                for lambda_ in lambda_list:
                    # Progress indicator
                    print(f"[Diabetes:{scenario_name}] k={k}, λ={lambda_}")
                    # Compute convexity parameter c0
                    q = math.sqrt(-11 * lambda_**2 + 2 * lambda_ + 13) / 6 + (5 * lambda_ - 1) / 6
                    c0 = q / lambda_

                    # Greedy Capture metrics
                    f_j   = run_FJR_mixed(n, k, distance_matrix, GC_M, theta_, lambda_, GC_C)
                    c_j   = run_core_mixed(n, k, distance_matrix, GC_M, theta_, lambda_, GC_C)
                    # w_j, km_gc, md_gc already computed above

                    # Semi-Ball Growing metrics
                    new_M   = refine_FGC_assignments(n, k, distance_matrix, GC_M, GC_C, GC_R, lambda_, c0)
                    sf_j    = run_FJR_mixed(n, k, distance_matrix, new_M, theta_, lambda_, GC_C)
                    sc_j    = run_core_mixed(n, k, distance_matrix, new_M, theta_, lambda_, GC_C)
                    sw_j    = within_cluster(distance_vector, new_M)
                    km_sb   = kmeans_objective(distance_vector, new_M)
                    md_sb   = kmedoids_objective(distance_vector, new_M)

                    # Baseline means over repetitions (dependent on lambda for FJR/core, but NOT for within/obj)
                    base_vals_km = []
                    base_vals_md = []
                    for km_labels, km_C, med_labels, med_C in baseline_pairs:
                        # KMeans++ baseline
                        bfj_km = run_FJR_mixed(n, k, distance_matrix, km_labels, theta_, lambda_, km_C)
                        bc_km  = run_core_mixed(n, k, distance_matrix, km_labels, theta_, lambda_, km_C)
                        # Remove: bw_km  = within_cluster(distance_vector, km_labels)
                        # Remove: bkm_km = kmeans_objective(distance_vector, km_labels)
                        # Remove: bmd_km = kmedoids_objective(distance_vector, km_labels)
                        base_vals_km.append([bfj_km, bc_km])

                        # KMedoids baseline
                        bfj_md = run_FJR_mixed(n, k, distance_matrix, med_labels, theta_, lambda_, med_C)
                        bc_md  = run_core_mixed(n, k, distance_matrix, med_labels, theta_, lambda_, med_C)
                        # Remove: bw_md  = within_cluster(distance_vector, med_labels)
                        # Remove: bkm_md = kmeans_objective(distance_vector, med_labels)
                        # Remove: bmd_md = kmedoids_objective(distance_vector, med_labels)
                        base_vals_md.append([bfj_md, bc_md])

                    base_vals_km = np.array(base_vals_km)
                    base_vals_md = np.array(base_vals_md)
                    # base_mean_km: [FJR, core]
                    base_mean_km = base_vals_km.mean(axis=0)
                    base_mean_md = base_vals_md.mean(axis=0)

                    # Stack metrics: GC, SB, KMeans, KMedoids
                    # [f_j, c_j, w_j, km_gc, md_gc, sf_j, sc_j, sw_j, km_sb, md_sb,
                    #  bfj_km, bc_km, bw_km, bkm_km, bmd_km, bfj_md, bc_md, bw_md, bkm_md, bmd_md]
                    metrics = [
                        f_j, c_j, w_j, km_gc, md_gc,
                        sf_j, sc_j, sw_j, km_sb, md_sb
                    ] + base_mean_km.tolist() + base_mean_km_w.tolist() + base_mean_md.tolist() + base_mean_md_w.tolist()
                    # Store metrics for summary computation
                    key = (scenario_name, k, lambda_)
                    if key not in diabetes_metrics:
                        diabetes_metrics[key] = []
                    diabetes_metrics[key].append(metrics)

    # Aggregate and save summary statistics
    for (scenario_name, k, lambda_), values in diabetes_metrics.items():
        arr = np.array(values)
        mean_vals = arr.mean(axis=0)
        ci_vals = 1.96 * arr.std(axis=0) / math.sqrt(arr.shape[0])
        summary = np.vstack([mean_vals, ci_vals])
        filename = f'pima_{scenario_name}_k={k}-lambda={lambda_}-Diabetes_summary_1.csv'
        out_path = os.path.join('pima_Diabetes_Result', filename)
        np.savetxt(out_path, summary, delimiter=',', fmt='%f')



elif data_set == 'Adults':
    # Load and preprocess Adult census data
    file_path = os.path.join('census+income', 'adult.data')
    df = pd.read_csv(file_path, header=None, sep=',', skip_blank_lines=True)
    # Include all columns (including the original label) as features
    data_all = pd.get_dummies(df, dummy_na=False).values.astype(float)
    n_total = data_all.shape[0]
    sample_size = min(100, n_total)
    if sample_size <= 0:
        raise ValueError(f"No data available for Adults sampling: n_total={n_total}")
    trials = 10
    it = 3  # baseline repetitions per trial

    # Adults experiment scenarios: (scenario_name, k_list, lambda_list)
    adults_settings = [
        ('k_sweep',      list(range(5, 26)), [0.5]),
        ('lambda_sweep', [15], [i/10 for i in range(1, 10)])
    ]

    for scenario_name, k_list, lambda_list in adults_settings:
        # initialize per-scenario metrics dictionary
        metrics_dict = {(k, lam): [] for k in k_list for lam in lambda_list}

        for r in range(trials):
            print(f"Adults trial {r+1}/{trials}")
            # Sample data for this trial
            indices = np.random.choice(n_total, sample_size, replace=False)
            data = data_all[indices, :]
            n = sample_size
            distance_vector = pdist(data, metric=dist_function)
            distance_matrix = squareform(distance_vector)

            for k in k_list:
                theta_ = 4
                # Greedy Capture clustering (once per k per trial)
                GC_M, GC_C, GC_R = run_FGC_CR(n, k, distance_matrix)

                # Precompute baseline pairs (kmeans++ and kmedoids) for this k/trial
                baseline_pairs = []
                for _ in range(it):
                    km_labels, med_labels, km_centers, medoids_centers = run_experiment(
                        data, k, distance_matrix, dist_function
                    )
                    from scipy.spatial.distance import cdist
                    km_C = [np.argmin(cdist([c], data).flatten()) for c in km_centers]
                    med_C = [np.argmin(cdist([c], data).flatten()) for c in medoids_centers]
                    baseline_pairs.append((km_labels, km_C, med_labels, med_C))

                # GC metrics (independent of lambda)
                w_j   = within_cluster(distance_vector, GC_M)
                km_gc = kmeans_objective(distance_vector, GC_M)
                md_gc = kmedoids_objective(distance_vector, GC_M)

                base_vals_km_w = []
                base_vals_md_w = []
                for km_labels, _, med_labels, _ in baseline_pairs:
                    bw_km  = within_cluster(distance_vector, km_labels)
                    bkm_km = kmeans_objective(distance_vector, km_labels)
                    bmd_km = kmedoids_objective(distance_vector, km_labels)
                    bw_md  = within_cluster(distance_vector, med_labels)
                    bkm_md = kmeans_objective(distance_vector, med_labels)
                    bmd_md = kmedoids_objective(distance_vector, med_labels)
                    base_vals_km_w.append([bw_km, bkm_km, bmd_km])
                    base_vals_md_w.append([bw_md, bkm_md, bmd_md])
                base_vals_km_w = np.array(base_vals_km_w)
                base_vals_md_w = np.array(base_vals_md_w)
                base_mean_km_w = base_vals_km_w.mean(axis=0)
                base_mean_md_w = base_vals_md_w.mean(axis=0)

                for lambda_ in lambda_list:
                    print(f"  k={k}, lambda={lambda_}")
                    # Compute threshold c0
                    q = math.sqrt(-11 * lambda_**2 + 2 * lambda_ + 13) / 6 + (5 * lambda_ - 1) / 6
                    c0 = q / lambda_

                    # Greedy Capture metrics
                    f_j   = run_FJR_mixed(n, k, distance_matrix, GC_M,  theta_, lambda_, GC_C)
                    c_j   = run_core_mixed(n, k, distance_matrix, GC_M, theta_, lambda_, GC_C)
                    # w_j, km_gc, md_gc already computed above

                    # Semi-Ball Growing metrics
                    new_M   = refine_FGC_assignments(n, k, distance_matrix, GC_M, GC_C, GC_R, lambda_, c0)
                    sf_j    = run_FJR_mixed(n, k, distance_matrix, new_M, theta_, lambda_, GC_C)
                    sc_j    = run_core_mixed(n, k, distance_matrix, new_M, theta_, lambda_, GC_C)
                    sw_j    = within_cluster(distance_vector, new_M)
                    km_sb   = kmeans_objective(distance_vector, new_M)
                    md_sb   = kmedoids_objective(distance_vector, new_M)

                    # Baseline metrics: separate 2 measures for KMeans++ and KMedoids (FJR, Core)
                    vals_km = []
                    vals_md = []
                    for km_labels, km_C, med_labels, med_C in baseline_pairs:
                        # KMeans++ baseline
                        bfj_km = run_FJR_mixed(n, k, distance_matrix, km_labels, theta_, lambda_, km_C)
                        bc_km  = run_core_mixed(n, k, distance_matrix, km_labels, theta_, lambda_, km_C)
                        vals_km.append([bfj_km, bc_km])

                        # KMedoids baseline
                        bfj_md = run_FJR_mixed(n, k, distance_matrix, med_labels, theta_, lambda_, med_C)
                        bc_md  = run_core_mixed(n, k, distance_matrix, med_labels, theta_, lambda_, med_C)
                        vals_md.append([bfj_md, bc_md])

                    base_mean_km = np.array(vals_km).mean(axis=0)
                    base_mean_md = np.array(vals_md).mean(axis=0)

                    # Stack all metrics: GC, SB, KMeans++ baseline, KMedoids baseline
                    # [f_j, c_j, w_j, km_gc, md_gc, sf_j, sc_j, sw_j, km_sb, md_sb,
                    #  bfj_km, bc_km, bw_km, bkm_km, bmd_km, bfj_md, bc_md, bw_md, bkm_md, bmd_md]
                    metrics_dict[(k, lambda_)].append(
                        np.hstack([
                            [f_j,   c_j,   w_j,   km_gc,   md_gc,
                             sf_j,  sc_j,  sw_j,  km_sb,   md_sb],
                            base_mean_km,
                            base_mean_km_w,
                            base_mean_md,
                            base_mean_md_w
                        ])
                    )

        # After all trials, aggregate and save summary per (k, lambda_)
        os.makedirs('Adults_Result', exist_ok=True)
        for (k, lambda_), vals in metrics_dict.items():
            arr = np.array(vals)
            summary = [
                arr.mean(axis=0),
                1.96 * arr.std(axis=0) / math.sqrt(arr.shape[0])
            ]
            np.savetxt(
                os.path.join('Adults_Result',
                             f'{scenario_name}_k={k}-lambda={lambda_}-Adultsmax.csv'),
                summary,
                delimiter=',', fmt='%f'
            )



