import glob
import os
# os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
import random

import numpy as np
import tqdm
from absl import app, flags
from ml_collections import config_flags
from tensorboardX import SummaryWriter
import wandb
from wandb_log import init_wandb_or_disable

import sys
sys.path.append("../")
from jax_rl.agents import AWACLearner, SACLearner
from jax_rl.datasets import ReplayBuffer
from jax_rl.evaluation import evaluate,rpp_evaluate
from jax_rl.utils import make_env, _should_record

# Command-line flags for the training script. Values are read through FLAGS
# inside main().
FLAGS = flags.FLAGS

# --- Environment, run bookkeeping and schedule ---
flags.DEFINE_string('env_name', 'Ant-v2', 'Environment name.')
flags.DEFINE_string('save_dir', './tmp/', 'Tensorboard logging dir.')
flags.DEFINE_integer('seed', 42, 'Random seed.')
flags.DEFINE_integer('eval_episodes', 10,
                     'Number of episodes used for evaluation.')
flags.DEFINE_integer('log_interval', 1000, 'Logging interval.')
flags.DEFINE_integer('eval_interval', 10000, 'Eval interval.')
flags.DEFINE_integer('batch_size', 256, 'Mini batch size.')
flags.DEFINE_integer('max_steps', int(1e6), 'Number of training steps.')
# Before this step index main() samples random actions and performs no
# agent updates.
flags.DEFINE_integer('start_training', int(1e4),
                     'Number of training steps to start training.')
flags.DEFINE_boolean('tqdm', True, 'Use tqdm progress bar.')
flags.DEFINE_boolean('save_video', True, 'Save videos during evaluation.')
flags.DEFINE_integer('video_interval', 10000, 'Video saving interval.')

# --- RPP / symmetry options ---
flags.DEFINE_boolean('rpp_value', False, 'Use RPP for value function')
flags.DEFINE_boolean('rpp_policy', True, 'Use RPP for policy function')
# Fixed help text: the original description was a copy of the rpp_value one.
# main() evaluates this string and uses the result as the symmetry group.
flags.DEFINE_string(
    'group', '',
    'Python expression for the symmetry group; overrides the configured '
    'group when non-empty')
flags.DEFINE_float('equiv_wd', 1e-6, 'Policy Equivariant weight decay')
flags.DEFINE_float('basic_wd', 1e-6, 'Policy Basic weight decay')
flags.DEFINE_float('cequiv_wd', 0, 'Critic Equivariant weight decay')
flags.DEFINE_float('cbasic_wd', 0, 'Critic Basic weight decay')
# Entries are converted to int in main() (values given on the command line
# arrive as strings).
flags.DEFINE_list('hidden_dims', [256, 256], 'Dimension of hidden layers')
flags.DEFINE_boolean('small_init', True, 'Use smaller init for last policy layer')
flags.DEFINE_boolean('old_rep', False, "Use original rep allocation heuristic")

# --- Optimisation options ---
flags.DEFINE_boolean("gan_betas", False, "use GAN betas or not")
flags.DEFINE_float("tau", 0.005, 'tau for SAC updates')
flags.DEFINE_boolean('standardize', False, "Use equivariant standardization of the state")
flags.DEFINE_float('clipping', 0.5, 'Gradient Norm magnitude at which to clip')
flags.DEFINE_integer('ncritic', 1, 'Number of critic updates per policy update')
config_flags.DEFINE_config_file(
    'config',
    'configs/sac_default.py',
    'File path to the training hyperparameter configuration.',
    lock_config=False)

# --- Weights & Biases options ---
flags.DEFINE_string('wandb_project', None, "W&B project name")
flags.DEFINE_string('wandb_entity', None, "W&B entity name")
flags.DEFINE_string('wandb_run_name', None, 'W&B run name')
flags.DEFINE_string('wandb_group', None, 'W&B group name')
flags.DEFINE_list('wandb_tags', [], 'W&B tags')
flags.DEFINE_string('wandb_mode', 'online', 'W&B mode: online, offline, disabled')
# When set, main() skips the TensorBoard writer, W&B init and all logging.
flags.DEFINE_boolean('debug', False, 'Debug mode, no logging')

from representations import environment_symmetries
from emlp.groups import *
from jax import jit,vmap

def main(_):
    """Train a SAC agent (optionally with RPP networks) and evaluate it periodically.

    The function builds the environment, replay buffer and `SACLearner` from
    the command-line flags and the config file, then runs `FLAGS.max_steps`
    environment steps. Random actions are used before `FLAGS.start_training`;
    from then on the agent is updated every step. Every
    `FLAGS.eval_interval` steps a fresh evaluation environment is created,
    evaluated and closed again.

    Unless `FLAGS.debug` is set, metrics go to TensorBoard and W&B and the
    evaluation returns are written to `<save_dir>/<env_name>/<run>/<seed>.txt`.

    Args:
        _: Unused positional argument (supplied by `app.run`).
    """
    method = 'rpp' if (FLAGS.rpp_value or FLAGS.rpp_policy) else 'sac'
    print("CWD = ", os.getcwd())
    fname = f'{method}_seed{FLAGS.seed}'
    # Single location for TensorBoard events, eval videos and the returns file.
    run_dir = os.path.join(FLAGS.save_dir, FLAGS.env_name, fname)

    summary_writer = None
    if not FLAGS.debug:
        summary_writer = SummaryWriter(run_dir)

    # No video folder is passed for the training environment; evaluation
    # videos are configured per evaluation round further down.
    video_train_folder = None
    env = make_env(FLAGS.env_name, FLAGS.seed, video_train_folder)

    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    # Agent keyword arguments: config file first, then the per-environment
    # symmetry description, then flag overrides.
    kwargs = dict(FLAGS.config)
    kwargs.update(environment_symmetries[FLAGS.env_name])
    kwargs['hidden_dims'] = tuple(int(hd) for hd in FLAGS.hidden_dims)
    kwargs['rpp_value'] = FLAGS.rpp_value
    kwargs['rpp_policy'] = FLAGS.rpp_policy
    if FLAGS.group:
        # SECURITY NOTE(review): eval() runs arbitrary Python taken from the
        # --group flag. Acceptable only because the flag comes from the
        # person launching the script; never feed it untrusted input.
        kwargs['symmetry_group'] = eval(FLAGS.group)
    # The representation entries are callables that take the group.
    kwargs['state_rep'] = kwargs['state_rep'](kwargs['symmetry_group'])
    kwargs['action_rep'] = kwargs['action_rep'](kwargs['symmetry_group'])
    if FLAGS.old_rep:
        kwargs.pop('middle_rep', None)

    model_name = "_".join([
        method.upper(),
        FLAGS.env_name,
        "VAL" if FLAGS.rpp_value else "NO-VAL",
        "POL" if FLAGS.rpp_policy else "NO-POL",
    ])

    if FLAGS.wandb_run_name is None:
        FLAGS.wandb_run_name = model_name + f"_seed{FLAGS.seed}"
    if not FLAGS.debug:
        init_wandb_or_disable(FLAGS, fname, model_name=model_name, extra_cfg=kwargs)

    replay_buffer_size = kwargs.pop('replay_buffer_size')
    if kwargs['action_space'] == 'continuous':
        action_dim = env.action_space.shape[0]
    else:
        action_dim = 1
    replay_buffer = ReplayBuffer(env.observation_space, action_dim,
                                 replay_buffer_size or FLAGS.max_steps,
                                 kwargs['state_rep'],
                                 kwargs['state_transform'],
                                 kwargs['inv_state_transform'],
                                 FLAGS.standardize)

    algo = kwargs.pop('algo')
    assert algo == 'sac', "other RL algos not yet supported"

    if algo == 'sac':
        standardizer = (replay_buffer.running_stats.standardize
                        if FLAGS.standardize else None)
        agent = SACLearner(FLAGS.seed,
                           env.observation_space.sample()[np.newaxis],
                           np.asarray(env.action_space.sample())[None],
                           actor_basic_wd=FLAGS.basic_wd,
                           actor_equiv_wd=FLAGS.equiv_wd,
                           critic_basic_wd=FLAGS.cbasic_wd,
                           critic_equiv_wd=FLAGS.cequiv_wd,
                           standardizer=standardizer,
                           clipping=FLAGS.clipping,
                           gan_betas=FLAGS.gan_betas,
                           tau=FLAGS.tau, **kwargs)
        if method == 'rpp':
            # Only needed by rpp_evaluate, which is only called for 'rpp'.
            policy_mean_fn = jit(lambda p, x: agent.sac.actor.apply_fn.apply({'params': p}, x)._distribution._loc)
    else:
        raise NotImplementedError()

    @jit
    def reprhos(x):
        """Sample one group element per row of `x` and return the dense state/action representation matrices."""
        gs = kwargs['symmetry_group'].samples(x.shape[0])
        ring = vmap(kwargs['state_rep'].rho_dense)(gs)
        routg = vmap(kwargs['action_rep'].rho_dense)(gs)
        return ring, routg

    eval_returns = []
    observation, done = env.reset(), False
    for i in tqdm.tqdm(range(1, FLAGS.max_steps + 1),
                       smoothing=0.1,
                       disable=not FLAGS.tqdm):
        # Random actions until start_training, then actions from the agent.
        if i < FLAGS.start_training:
            action = env.action_space.sample()
        else:
            action = agent.sample_actions(observation)
        next_observation, reward, done, info = env.step(action)

        # mask is 0.0 only when the episode ended and the info dict carries
        # no 'TimeLimit.truncated' key.
        if not done or 'TimeLimit.truncated' in info:
            mask = 1.0
        else:
            mask = 0.0

        replay_buffer.insert(observation, action, reward, mask,
                             next_observation)
        observation = next_observation

        if done:
            observation, done = env.reset(), False

            ep_log = {f"training/{k}": v for k, v in info['episode'].items()}
            # Fall back to the loop counter when the env does not report a
            # total step count; used for both W&B and TensorBoard so the two
            # logs share the same x-axis.
            step_to_use = info.get('total', {}).get('timesteps', i)

            if not FLAGS.debug:
                wandb.log(ep_log, step=step_to_use)
                for tag, v in ep_log.items():
                    summary_writer.add_scalar(tag, v, step_to_use)

        if i >= FLAGS.start_training:
            # ncritic - 1 critic-only updates, then one full update.
            for _ in range(FLAGS.ncritic - 1):
                batch = replay_buffer.sample(FLAGS.batch_size)
                update_info = agent.update(batch, update_policy=False)
            batch = replay_buffer.sample(FLAGS.batch_size)
            update_info = agent.update(batch)

            if i % FLAGS.log_interval == 0 and not FLAGS.debug:
                for k, v in update_info.items():
                    summary_writer.add_scalar(f'training/{k}', v, i)
                summary_writer.flush()
                wandb.log({f"training/{k}": v for k, v in update_info.items()}, step=i)

        if i % FLAGS.eval_interval == 0:
            record_now = _should_record(i, FLAGS.save_video, FLAGS.video_interval)
            if record_now:
                eval_vid_dir = os.path.join(run_dir, 'eval_videos', f"step_{i}")
            else:
                eval_vid_dir = None

            eval_env = make_env(FLAGS.env_name, FLAGS.seed + 42, save_folder=eval_vid_dir)

            # Only do the equivariance calculation every 8th evaluation.
            if (i // FLAGS.eval_interval) % 8 == 0 and method == 'rpp':
                eval_stats = rpp_evaluate(agent, policy_mean_fn, eval_env,
                                          FLAGS.eval_episodes, kwargs, reprhos)
            else:
                eval_stats = evaluate(agent, eval_env, FLAGS.eval_episodes)

            step_to_use = info.get('total', {}).get('timesteps', i)
            if not FLAGS.debug:
                wandb.log({f"evaluation/average_{k}s": v for k, v in eval_stats.items()}, step=step_to_use)

                for k, v in eval_stats.items():
                    summary_writer.add_scalar(f'evaluation/average_{k}s', v,
                                              step_to_use)
                summary_writer.flush()

                if eval_vid_dir is not None:
                    mp4s = sorted(glob.glob(os.path.join(eval_vid_dir, '*.mp4')))
                    for ep_idx, vf in enumerate(mp4s):
                        wandb.log(
                            {f"eval/video/ep{ep_idx}": wandb.Video(vf, fps=30, format="mp4",
                                                                   caption=f"{FLAGS.env_name} @ step {step_to_use} (ep {ep_idx})")},
                            step=step_to_use
                        )

            # Close the evaluation env in every mode (previously it was only
            # closed when logging was enabled, leaking one env per evaluation
            # in debug mode). Closing stays best-effort: a failure is
            # reported but does not abort training.
            try:
                eval_env.close()
            except Exception as exc:
                print(f"Warning: failed to close eval env: {exc}")

            if not FLAGS.debug:
                eval_returns.append((step_to_use, eval_stats['return']))
                np.savetxt(os.path.join(run_dir, f'{FLAGS.seed}.txt'),
                           eval_returns,
                           fmt=['%d', '%.1f'])

    if summary_writer is not None:
        # Flush any scalars written after the last explicit flush.
        summary_writer.close()
    wandb.finish()

# Script entry point: hand control to absl, which invokes main().
if __name__ == "__main__":
    app.run(main)
