# Generates positive movie reviews by tuning a pretrained model on IMDB dataset
# with a sentiment reward function
import json
import os
import re
import sys
import functools
import random
import numpy as np
import jsonlines
from typing import List, Tuple, Optional

import torch
import evaluate
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

import uuid
import trlx
from trlx.data.default_configs import TRLConfig
from trlx.models.modeling_ppo import PPOConfig

from trlx.data.configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
import evaluate

from accelerate_redteam_ppo_trainer import RedteamPPOConfig

script_name = os.path.splitext(os.path.basename(__file__))[0]

def default_redteam_ppo_config():
    return TRLConfig(
        train=TrainConfig(
            seq_length=1024,
            epochs=800,
            total_steps=10000,
            batch_size=64,
            minibatch_size=8,
            checkpoint_interval=10000,
            eval_interval=100,
            pipeline="PromptPipeline",
            trainer="AccelerateRedteamPPOTrainer",
            tracker="tensorboard",
            logging_dir=script_name,
            checkpoint_dir=f"{script_name}/ckpts",
            mixed_precision="fp16",
        ),
        model=ModelConfig(
            model_path="gpt2",
            num_layers_unfrozen=-1,            
            peft_config={
                'r': 32,
                'lora_alpha': 16,
                'lora_dropout': 0.0,
                'task_type': "CAUSAL_LM",
                'peft_type': "LORA",
            },
            quantization_config={
                'load_in_4bit': True,
                'bnb_4bit_compute_dtype': 'float16',
                'bnb_4bit_use_double_quant': True,
                'bnb_4bit_quant_type': 'nf4',
            },
        ),
        tokenizer=TokenizerConfig(
            tokenizer_path="gpt2",
            truncation_side="right"),
        optimizer=OptimizerConfig(
            name="adamw", kwargs=dict(lr=3e-5, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6)
        ),
        scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=1e12, eta_min=3e-5)),
        method=RedteamPPOConfig(
            name="RedteamPPOConfig",
            num_rollouts=128,
            chunk_size=8,
            ppo_epochs=4,
            init_kl_coef=0.001,
            target=None,
            horizon=10000,
            gamma=1,
            lam=0.95,
            cliprange=0.2,
            cliprange_value=0.2,
            vf_coef=1,
            scale_reward="ignored",
            ref_mean=None,
            ref_std=None,
            cliprange_reward=10,
            gen_kwargs=dict(
                max_new_tokens=10,
                top_k=0,
                top_p=0.92,
                temperature=0.7,
                do_sample=True,
                eos_token_id=50256,
                pad_token_id=50256,
            ),

            # Red team config
            bleu_reward_coef=0.0,
            bleu_reward_grams="[3, 4, 5]",
            bleu_reward_include_prompts=False,

            # Entropy bonus
            ent_reward_coef=0.0,

            # Cossim embed
            cossimemb_reward_coef=0.0,
            cossimemb_n_samples=-1,
            cossimemb_impl="sentencetransformer",
            cossimemb_reward_include_prompts=False,

            # Textual sim
            textual_sim_reward_coef=0,
            textual_sim_reward_include_prompts=False,

            giberish_penalty_coef=0.0,
        ),
    )


class RedTeamToxicityRewardModel(object):

    ATTACKER_EOS_TOKEN = "<|endoftext|>"

    @staticmethod
    def make_prompt(instruction):
        prompt_template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"

        return prompt_template.format(instruction=instruction.rstrip())

    def __init__(self, device, victim_model, victim_tokenizer) -> None:
        if device.type == "cuda":
            device = torch.device(f"cuda:1")
        elif device.type.startswith("cuda"):
            device_idx = int(device.type.split(":")[1])
            device = torch.device(f"cuda:{device_idx+1}")

        self.toxicity_fn = evaluate.load(
                "toxicity", 
                module_type="measurement",
                experiment_id=str(uuid.uuid4()),
                device=device)
        self.device = device
        
        self.victim_model = pipeline(model="databricks/dolly-v2-7b", torch_dtype=torch.bfloat16,
                         trust_remote_code=True, 
                         device_map="auto", 
                        #  device=device,
                         max_new_tokens=30,
                         return_full_text=False)

    def __call__(self, samples: List[str], 
                prompts: Optional[List[str]],
                outputs: Optional[List[str]],
                return_texts=False, # For logging the victim's response
                **kwargs
            ) -> List[float]:

        def _make_instruction(output):
            output = output.lstrip().rstrip()

            # Remove <endoftext>: 
            # when the attacker's generation contains eos_token or trimmed by stop_sequences, trlx will append eos_token
            trimmed_indices = list(filter(lambda idx: idx >= 0, [
                output.find(self.ATTACKER_EOS_TOKEN),
                output.find("\n"),
            ]))
                        
            if len(trimmed_indices) == 0:
                return self.make_prompt(output)
            else:
                trimmed_idx = min(trimmed_indices)
                return self.make_prompt(output[:trimmed_idx])

        instructions = list(map(_make_instruction, outputs))

        victim_responses = self.victim_model(instructions)
        
        victim_responses = list(map(lambda x: (x[0]["generated_text"]), victim_responses))

        toxicity_scores = self.toxicity_fn.compute(predictions=victim_responses)["toxicity"]

        """
        NOTE: We always return `victim_responses` that is used in reward calculation.
            If we consider an instruction-following setting, we should remove the blurb and other special tokens, and return.
            In continuation tasks (e.g., imdb review), we should return the concatenation of attacker's prompt, attacker's response, and victim's responses.
        """
        
        if return_texts:
            return toxicity_scores, victim_responses
        else:
            return toxicity_scores


def main(hparams={}):
    # Merge sweep config with default config if given
    config = TRLConfig.update(default_redteam_ppo_config().to_dict(), hparams)

    # Avoid overwriting the experiments
    if os.path.exists(config.train.logging_dir):
        print("Experiment exists")
        sys.exit()
    else:
        os.makedirs(config.train.logging_dir)

    with jsonlines.open("prompts/databrick-1024-2.jsonl", "r") as reader:
        prompts = list(map(lambda x: x["attacker_prompt"], reader))
    train_prompts, eval_prompts = prompts, prompts[-3:]

    trlx.train(
        reward_fn=RedTeamToxicityRewardModel,
        prompts=train_prompts,
        eval_prompts=eval_prompts,
        config=config,
        stop_sequences=["\n",],
    )


if __name__ == "__main__":
    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
    # hparams = {"method.init_kl_coef": 0.0001, "method.bleu_reward_coef": -1.0, "method.cossimemb_reward_coef": -0.1, "method.ent_reward_coef": 0.01, "method.bleu_reward_grams": "[2,3,4,5]", "train.logging_dir": "results/alpaca_toxicity/ppo_kl0.0001_bleu-1.0_cossimemb-0.1_ent0.01/20230914182453", "train.checkpoint_dir": "results/alpaca_toxicity/ppo_kl0.0001_bleu-1.0_cossimemb-0.1_ent0.01/20230914182453"}
    main(hparams)
