import d3rlpy
import torch
from denoising_diffusion_pytorch import Unet, GaussianDiffusion, Unet1D, GaussianDiffusion1D, Trainer1D, Dataset1D
import time
import training
import generation

# Number of environment steps collected per rollout; also used as the
# diffusion sequence length and as the divisor for per-step averages.
_EPISODE_LENGTH = 128

# Synthetic sample counts evaluated for each trained generator.
_SAMPLE_SIZES = (100, 500, 1000)


def _episode_returns(reward_batch):
    """Summarise a reward batch by its per-episode return.

    Args:
        reward_batch: 3-D tensor indexed as ``[episode, 0, step]``.

    Returns:
        A tuple ``(returns, max_element, scaled)`` where ``returns`` is
        all-zero except that ``returns[i, 0, 0]`` holds the sum of episode
        ``i``'s rewards, ``max_element`` is ``max(-returns)`` and
        ``scaled`` is ``-returns / max_element``.
    """
    returns = torch.zeros_like(reward_batch)
    returns[:, 0, 0] = reward_batch[:, 0, :].sum(dim=-1)
    # NOTE(review): because `returns` is mostly zeros, max_element is 0 when
    # every episode return is non-negative, making `scaled` NaN/inf. This
    # presumably relies on returns being negative -- confirm for new envs.
    max_element = torch.max(-returns)
    return returns, max_element, -returns / max_element


def _load_diffusion(checkpoint_path):
    """Build a fresh Unet1D/GaussianDiffusion1D pair and load a checkpoint.

    Each call creates its own ``Unet1D`` so that loading one checkpoint can
    never overwrite the weights of another diffusion object.
    """
    model = Unet1D(
        dim=64,
        dim_mults=(1, 2, 4, 8),
        channels=1,
    )
    diffusion = GaussianDiffusion1D(
        model,
        seq_length=_EPISODE_LENGTH,
        timesteps=1000,
        objective='pred_v',
    )
    diffusion.load_state_dict(torch.load(checkpoint_path))
    return diffusion


def _sample_and_log(diffusion, m, max_element, n, train_step, index, batch):
    """Draw ``m`` samples via ``generation.generation`` and log the timing.

    Appends one line to ``GenerationTimeUsage.txt`` and returns the samples.
    """
    samples, elapsed = generation.generation(diffusion, m, max_element)
    # NOTE(review): the value logged is the generation time, but the label
    # reads "TrainTimeUsage". Left unchanged so existing log parsers keep
    # working.
    time_res = 'n = {}, m = {}, steps = {}, index = {}, batch = {}, TrainTimeUsage = {}\n'.format(
        n, m, train_step, index, batch, elapsed)
    with open('GenerationTimeUsage.txt', 'a') as file:
        file.write(time_res)
    return samples


def _mean_step_reward(samples, real_returns):
    """Average per-step reward over synthetic samples plus real episodes.

    Concatenates along dim 0, reads the ``[:, 0, 0]`` entry of every row,
    divides by the episode length and returns the mean as a 0-d tensor.
    Every row is included, so no episode is dropped when the two real
    batches have different sizes.
    """
    combined = torch.cat([samples, real_returns], dim=0)
    return torch.mean(combined[:, 0, 0] / _EPISODE_LENGTH)


def run(env, policy, num, n, index, train_step):
    """Collect rollouts, train two diffusion generators and evaluate them.

    Steps:
      1. Collect ``n`` rollouts of 128 steps with ``policy`` in ``env``;
         each replay buffer is dumped to an ``.h5`` file and read back.
      2. Split the reward sequences into two batches and call
         ``training.Train`` once per batch on the scaled episode returns.
      3. Load the two checkpoints ``diffusion_{n}_{index}_{1|2}_{train_step}.pth``.
      4. Draw 100, 500 and 1000 samples from each generator, appending the
         timings to ``GenerationTimeUsage.txt``.
      5. Combine each generator's samples with the real returns of the
         *other* batch and average.

    Args:
        env: environment passed to d3rlpy for buffer creation and collection.
        policy: object exposing ``collect(env, buffer, n_steps=...)``.
        num: ignored; it is overwritten by ``n`` (kept for compatibility).
        n: number of rollouts to collect; the first ``n // 2`` form batch 1
            and the rest form batch 2.
        index: run identifier used in file names and log lines.
        train_step: training-step count forwarded to ``training.Train`` and
            used in file names and log lines.

    Returns:
        Tuple of eight 0-d tensors:
        ``(syn_1_100, syn_2_100, syn_1_500, syn_2_500, syn_1_1000,
        syn_2_1000, average_reward_1, average_reward_2)``.
    """
    # The original implementation always discarded the caller's `num`.
    num = n

    ### 1. Sample offline data
    # The file name does not depend on the loop index, so each rollout
    # overwrites the previous file; only the last one remains on disk.
    dataset_path = f"1Dtrained_policy_dataset_new_{num}_{n}_{train_step}_{index}.h5"
    collected_episodes = []
    for _ in range(num):
        buffer = d3rlpy.dataset.create_fifo_replay_buffer(
            limit=100000, env=env)
        # n_steps specifies how many environment steps are collected.
        policy.collect(env, buffer, n_steps=_EPISODE_LENGTH)
        with open(dataset_path, "w+b") as f:
            buffer.dump(f)
        with open(dataset_path, "rb") as f:
            reloaded = d3rlpy.dataset.ReplayBuffer.load(
                f, d3rlpy.dataset.InfiniteBuffer())
        collected_episodes.append(reloaded.episodes)

    # Only the first episode of every rollout is used.
    # Expected stacked shape: (num, 1, 128) -- assumes each first episode
    # spans the full 128 steps; torch.stack fails otherwise.
    new_reward = torch.stack(
        [torch.tensor(episodes[0].rewards).T for episodes in collected_episodes],
        dim=0)

    half = n // 2
    reward_batch_1 = new_reward[:half, :, :]
    reward_batch_2 = new_reward[half:n, :, :]

    returns_1, max_element_1, scaled_returns_1 = _episode_returns(reward_batch_1)
    returns_2, max_element_2, scaled_returns_2 = _episode_returns(reward_batch_2)

    ### 2. Training
    training.Train(scaled_returns_1, 1, n, index, train_step)
    training.Train(scaled_returns_2, 2, n, index, train_step)

    ### 3. Load the trained models (one independent network per checkpoint)
    diffusion1 = _load_diffusion(f'diffusion_{n}_{index}_1_{train_step}.pth')
    diffusion2 = _load_diffusion(f'diffusion_{n}_{index}_2_{train_step}.pth')

    ### 4./5. Sample from each generator, then cross-combine with real data
    synthetic_means = []
    for m in _SAMPLE_SIZES:
        samples_1 = _sample_and_log(
            diffusion1, m, max_element_1, n, train_step, index, 1)
        samples_2 = _sample_and_log(
            diffusion2, m, max_element_2, n, train_step, index, 2)
        # Generator k's samples are paired with the real returns of the
        # other batch.
        synthetic_means.append(_mean_step_reward(samples_1, returns_2))
        synthetic_means.append(_mean_step_reward(samples_2, returns_1))

    # Mean over every element of the mostly-zero return tensors, i.e. the
    # total reward divided by (episodes * 128).
    average_reward_1 = torch.mean(returns_1)
    average_reward_2 = torch.mean(returns_2)

    return (*synthetic_means, average_reward_1, average_reward_2)
    






