import pickle
import numpy as np
from pathlib import Path
import torch
from iql import MLP, GaussianPolicy
import gym
from collections import defaultdict
import copy
import time
import matplotlib.pyplot as plt
import d4rl
import random

# Prefer the GPU when one is available, but fall back to the CPU so the
# script still runs on machines without CUDA instead of failing at the
# first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the environment and fetch its offline dataset
# (presumably the environment id is registered by the `d4rl` import).
env = gym.make("halfcheetah-medium-expert-v2")
dataset1 = env.get_dataset()
# Work on a deep copy so `dataset1` itself is never modified.
dataset = copy.deepcopy(dataset1)

# Sizes of the observation/action vectors and the first component of the
# action space's upper bound, all read from the environment's spaces.
state_dim_in = env.observation_space.shape[0]
action_dim_in = env.action_space.shape[0]
max_action_in = float(env.action_space.high[0])


# Bookkeeping values: a counter initialised to zero and the number of rows
# in the observations array.
total_modded = 0
num_obs = dataset["observations"].shape[0]

# Wrap the observations, actions and rewards arrays as torch tensors and
# move each one onto `device`.
state_tensor, action_tensor, reward_tensor = (
    torch.from_numpy(dataset[key]).to(device)
    for key in ("observations", "actions", "rewards")
)

# Fraction of reward entries whose sign is flipped, and the file the
# poisoned rewards are written to.
POISON_FRACTION = 0.30
POISONED_REWARDS_PATH = "rewards/rand30inverted.pt"

# Clone the reward tensor so `reward_tensor` keeps the clean values for the
# comparison printed at the end.
poisoned = reward_tensor.clone()

# Number of rewards to poison; int() truncates toward zero.
num_rewards_to_poison = int(reward_tensor.numel() * POISON_FRACTION)
print("num rewards poisoned: ", num_rewards_to_poison)

# Sample distinct indices (without replacement) so that no reward is flipped
# twice, which would silently restore its original value.
poisoned_indices = random.sample(range(reward_tensor.numel()), num_rewards_to_poison)

# Poison the selected rewards by multiplying by -1. The indices are turned
# into a tensor on the same device up front rather than indexing with a very
# large Python list.
index_tensor = torch.tensor(poisoned_indices, dtype=torch.long, device=poisoned.device)
poisoned[index_tensor] *= -1

# Make sure the output directory exists; torch.save does not create it and
# would otherwise fail on a fresh checkout.
Path(POISONED_REWARDS_PATH).parent.mkdir(parents=True, exist_ok=True)
torch.save(poisoned, POISONED_REWARDS_PATH)

# L1 norm of the difference between the clean and the poisoned rewards.
print("norm of difference is: ", torch.linalg.norm(reward_tensor - poisoned, ord=1))

