import pandas as pd
import random
import numpy as np
import logging

def get_client_characteristic(data_map_file, output_file=None, output_file_normalization=None):
    """Generate random per-client characteristics and write them to two CSV files.

    The input CSV must have a header row containing a ``client_id`` column.
    One output row is produced per distinct client id, with six features:
    ``cpu``, ``memory``, ``gpu``, ``datasize``, ``batchsize``, ``learningrate``.
    ``datasize`` is the number of input rows for that client; the other
    features are drawn with the ``random`` module (seed it beforehand for
    reproducible output).

    Two files are written:
      * ``output_file``: the raw characteristics.
      * ``output_file_normalization``: the same table with every feature
        column divided by that column's maximum (``client_id`` untouched).

    Args:
        data_map_file: Path of the client/data mapping CSV to read.
        output_file: Destination of the raw table. Defaults to the
            google_speech path that used to be hard-coded here.
        output_file_normalization: Destination of the normalized table.
            Defaults to the google_speech path that used to be hard-coded here.
    """
    if output_file is None:
        output_file = "/nfs/dataset/google_speech/client_data_mapping/train_client_characteristic.csv"
    if output_file_normalization is None:
        output_file_normalization = "/nfs/dataset/google_speech/client_data_mapping/train_client_characteristic_normalization.csv"

    # Read the mapping; the first line is the header.
    df = pd.read_csv(data_map_file, delimiter=',', header=0)

    # Strip surrounding whitespace from the ids, then convert them to int.
    df['client_id'] = df['client_id'].astype(str).str.strip().astype(int)

    # Number of data entries per client id (most frequent client first).
    client_id_counts = df['client_id'].value_counts().to_dict()

    # Candidate values for the randomly assigned features.
    cpu_list = list(range(50, 151, 2))  # even core counts in [50, 150]
    memory_list = [128, 256, 512, 1024]
    gpu_list = [2, 4, 6, 8]
    learningrate_list = [0.0001, 0.001, 0.01, 0.1]

    columns = ['client_id', 'cpu', 'memory', 'gpu', 'datasize', 'batchsize', 'learningrate']

    # Collect the rows in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call in earlier versions.
    rows = []
    for client_id, client_id_count in client_id_counts.items():
        cpu = random.choice(cpu_list)
        memory = random.choice(memory_list)
        gpu = random.choice(gpu_list)
        # Batch size is drawn between 1/10 and 1/5 of the client's data size,
        # but never below 1 (small clients would otherwise get 0).
        batch_size = max(1, random.randint(client_id_count // 10, client_id_count // 5))
        learning_rate = random.choice(learningrate_list)

        rows.append({
            'client_id': int(client_id),
            'cpu': cpu,
            'memory': memory,
            'gpu': gpu,
            'datasize': client_id_count,
            'batchsize': batch_size,
            'learningrate': learning_rate,
        })

    df_output = pd.DataFrame(rows, columns=columns)
    df_output.to_csv(output_file, index=False)

    # Normalize every column except client_id by its column maximum. A new
    # frame is built so the float results are not forced into int columns.
    feature_columns = columns[1:]
    df_normalized = df_output.copy()
    df_normalized[feature_columns] = df_output[feature_columns].div(df_output[feature_columns].max())
    df_normalized.to_csv(output_file_normalization, index=False)
    
def get_client_id_characteristic(client_id, data_map_file):
    """Look up the raw characteristics of one client.

    Reads the CSV at ``data_map_file`` and selects the rows whose
    ``client_id`` (cast to int) equals ``client_id``.

    Returns:
        A dict mapping column name to value for the first matching row,
        with every value except ``learningrate`` converted to ``int``;
        an empty dict when no row matches.
    """
    table = pd.read_csv(data_map_file)
    is_target = table['client_id'].astype(int) == client_id
    records = table[is_target].to_dict(orient='records')

    # Guard clause: unknown client.
    if not records:
        return {}

    # Only the learning rate is fractional; everything else is cast to int.
    return {
        column: (value if column == 'learningrate' else int(value))
        for column, value in records[0].items()
    }
    
def get_client_id_characteristic_norm(client_id, data_map_file):
    """Look up one client's row in a characteristics CSV without casting values.

    Reads the CSV at ``data_map_file`` and selects the rows whose
    ``client_id`` (cast to int) equals ``client_id``.

    Returns:
        A dict mapping column name to value for the first matching row, with
        the values exactly as pandas parsed them; an empty dict when no row
        matches (the missing id is logged at INFO level).
    """
    table = pd.read_csv(data_map_file)
    is_target = table['client_id'].astype(int) == client_id
    records = table.loc[is_target].to_dict(orient='records')

    # Guard clause: unknown client, so record which id was requested.
    if not records:
        logging.info("client_id is {}".format(client_id))
        return {}

    return records[0]

def renumber_client_ids(input_file,output_file):
    """Rewrite a dataset CSV with client ids renumbered consecutively from 1.

    Distinct ``client_id`` values are numbered 1, 2, 3, ... in order of
    first appearance in ``input_file``; all other columns are left as they
    are. The result is written to ``output_file`` without the index column.
    """
    frame = pd.read_csv(input_file, delimiter=',')

    # Old id -> new id, following the order in which ids first occur.
    renumbering = {}
    for new_id, old_id in enumerate(frame['client_id'].unique(), start=1):
        renumbering[old_id] = new_id

    frame['client_id'] = frame['client_id'].map(renumbering)
    frame.to_csv(output_file, index=False)

def count_client_entries(file_path):
    """Print the largest and smallest number of data entries held by any client.

    Reads the comma-separated file at ``file_path``, counts the rows per
    ``client_id`` and prints the maximum and the minimum of those counts.
    Any exception raised along the way is caught and reported on stdout
    instead of propagating.
    """
    try:
        # Rows per client id.
        entries_per_client = pd.read_csv(file_path, delimiter=',')['client_id'].value_counts()

        largest, smallest = entries_per_client.max(), entries_per_client.min()

        # Report the extremes.
        print("最大值:", largest)
        print("最小值:", smallest)
    except Exception as err:
        print("An error occurred:", err)

def get_whether_valid_participant(client_i_characteristic, max_client_data_size, min_client_data_size): 
    """Randomly decide (0 or 1) whether a client is a valid participant.

    ``client_i_characteristic`` is indexed by position; the feature order is
    cpu, memory, gpu, datasize, batchsize, learningrate, and only the first
    four entries are read. The values are presumably the max-normalized
    features (the formula multiplies them back by 150 / 1024 / 8 and by
    ``max_client_data_size``) -- TODO confirm against the caller.

    A score is built as 0.1 * cpu + 0.1 * memory + 0.1 * gpu (each min-max
    rescaled) plus 0.7 * (1 - min-max rescaled data size), then mapped to
    0.3 + 0.7 * score, clipped to [0, 1] and used as the success probability
    of a single Bernoulli draw.

    Returns:
        0 immediately when the data-size entry is 0; otherwise the 0/1 result
        of ``np.random.binomial(1, probability)``.
    """
    datasize_share = client_i_characteristic[3]
    if datasize_share == 0:
        return 0

    # Each term: undo the divide-by-max scaling, then min-max rescale.
    cpu_term = 0.1 * ((client_i_characteristic[0] * 150 - 50) / 100)
    memory_term = 0.1 * ((client_i_characteristic[1] * 1024 - 128) / 896)
    gpu_term = 0.1 * ((client_i_characteristic[2] * 8 - 2) / 6)
    # More data lowers the score: the rescaled data size is subtracted from 0.7.
    data_size_span = max_client_data_size - min_client_data_size
    datasize_term = 0.7 - 0.7 * (datasize_share * max_client_data_size - min_client_data_size) / data_size_span

    # Shift into [0.3, 1] so that even the weakest client keeps a chance.
    valid_mu = 0.3 + 0.7 * (cpu_term + memory_term + gpu_term + datasize_term)

    # White-noise hook: mean 0 and standard deviation 0, so the noise is
    # currently always 0.0; raise the second argument to enable it.
    noise = np.random.normal(0, 0)

    # Keep the probability inside [0, 1].
    valid_mu_with_noise = np.clip(valid_mu + noise, 0, 1)

    # Log the probability together with the features it was derived from.
    logging.info(f'valid_mu: {valid_mu}, valid_mu_with_noise: {valid_mu_with_noise}, client_i_characteristic: {client_i_characteristic}')

    # Single Bernoulli trial with the computed probability.
    return np.random.binomial(1, valid_mu_with_noise)


if __name__ == '__main__':
    # Script entry point: generate the client-characteristic CSVs for the
    # google_speech client/data mapping.
    #
    # Inputs and helper calls used in earlier runs, kept for reference:
    # data_map_file = "/data/dataset/femnist/client_data_mapping/train.csv"
    # output_file = "/data/dataset/femnist/client_data_mapping/train_renumber_client_ids.csv"
    # data_map_file = "/nfs/dataset/google_speech/client_data_mapping/train.csv"
    # output_file = "/nfs/dataset/google_speech/client_data_mapping/train_renumber_client_ids.csv"
    # data_map_file = "/nfs/dataset/openImg/client_data_mapping/train.csv"
    # get_client_characteristic(data_map_file)
    # data_map_file = "/data/dataset/femnist/client_data_mapping/train_client_characteristic_normalization.csv"
    # renumber_client_ids(data_map_file, output_file)
    # get_client_characteristic(data_map_file)
    # renumber_client_ids(data_map_file, output_file)
    # count_client_entries(data_map_file)

    data_map_file = "/nfs/dataset/google_speech/client_data_mapping/train.csv"
    # Only needed by the (currently disabled) renumbering step below.
    output_file = "/nfs/dataset/google_speech/client_data_mapping/train_renumber_client_ids.csv"
    # renumber_client_ids(data_map_file, output_file)
    # NOTE: the output locations are determined inside
    # get_client_characteristic, not by output_file above.
    get_client_characteristic(data_map_file)
 