# from data_utils import load_dataset
from utils import construct_prompt, random_sampling, construct_prompt_without_test, construct_prompt_instance_prompt_text, construct_prompt_without_test_emptyanswer, construct_prompt_with_random_prefix_without_test, construct_prompt_with_random_prefix
import numpy as np
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, LlamaForCompressionCausalLM, AutoConfig, AutoTokenizer, AutoModelForCausalLM
import argparse
from typing import Dict, Optional, Sequence
import itertools
import copy
import json
import random
import evaluate
from openpyxl import Workbook
from datasets import load_dataset
import nltk
import requests
# stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
# stopwords = set(stopwords_list.decode().splitlines()) 
# Hard-coded stopword list: lowercase English function words (first line)
# followed by single punctuation characters and the newline character
# (second line).
# NOTE(review): presumably an offline replacement for the remote list fetched
# in the commented-out `requests.get(...)` lines above -- confirm.
# NOTE(review): the entry "\'" appears twice in the punctuation row; harmless
# for membership tests, but it is a duplicate.
# NOTE(review): in the visible part of this file the name is only referenced
# from commented-out code -- verify whether it is still needed.
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", 
             '!', "@",  '#', "$", '%', "^", '&', "*", '(', ")", '-', "_", '+', "=", '[', "]", '{', "}",  "|", "\'", ';', "\"", "\'", "<", ">", ",", "." , "?", "/", "\n"
            ]
# global construct_prompt_without_test 
# global construct_prompt_instance_prompt_text
# import deepcopy
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
):
    """Add the given special tokens to ``tokenizer`` and report how many were added.

    Despite its name, this function takes no model argument and therefore
    does not resize any embedding matrix; it only calls
    ``tokenizer.add_special_tokens``.

    Args:
        special_tokens_dict: Mapping forwarded unchanged to
            ``tokenizer.add_special_tokens``.
        tokenizer: Object whose ``add_special_tokens`` method is invoked.

    Returns:
        Whatever ``tokenizer.add_special_tokens`` returned; the same value is
        also printed to stdout.
    """
    added_count = tokenizer.add_special_tokens(special_tokens_dict)
    print("num_new_tokens = ", added_count)
    return added_count

def load_model_lora(base_model, device, lora_weights, compression, compression_length, use_partial_mask):
    """Load a LLaMA base model and wrap it with LoRA adapter weights.

    Args:
        base_model: Name or path passed to every ``from_pretrained`` call for
            the config and the base model.
        device: ``"cuda"`` and ``"mps"`` select half-precision loading with
            device-specific placement; any other value is used directly as
            the single target of ``device_map`` with ``low_cpu_mem_usage``.
        lora_weights: Adapter location passed to ``PeftModel.from_pretrained``.
        compression: When truthy, ``LlamaForCompressionCausalLM`` is loaded
            with the customised config; otherwise plain ``LlamaForCausalLM``
            is loaded and the config object is not passed to it.
        compression_length: Stored on the config as ``compression_size``
            (only when ``compression`` is truthy).
        use_partial_mask: Stored on the config as ``use_partial_mask``
            (only when ``compression`` is truthy).

    Returns:
        The object returned by ``PeftModel.from_pretrained``.
    """
    # The hard-coded cache directory applies to this config lookup only; the
    # model loads below do not pass a cache_dir.
    config = AutoConfig.from_pretrained(
        base_model,
        cache_dir='/network/scratch/y/yu.bai/.cache',
    )
    if compression:
        config.compression_size = compression_length
        config.use_partial_mask = use_partial_mask
        print('use_partial_mask = ', use_partial_mask)

    # Per-device keyword arguments for the base model and for the adapter
    # wrapper; the load/wrap calls themselves are the same for every device.
    if device == "cuda":
        base_kwargs = {
            "load_in_8bit": False,
            "torch_dtype": torch.float16,
            "device_map": "auto",
        }
        adapter_kwargs = {"torch_dtype": torch.float16}
    elif device == "mps":
        base_kwargs = {"device_map": {"": device}, "torch_dtype": torch.float16}
        adapter_kwargs = {"device_map": {"": device}, "torch_dtype": torch.float16}
    else:
        base_kwargs = {"device_map": {"": device}, "low_cpu_mem_usage": True}
        adapter_kwargs = {"device_map": {"": device}}

    if compression:
        # Only the compression variant receives the customised config.
        backbone = LlamaForCompressionCausalLM.from_pretrained(
            base_model, **base_kwargs, config=config
        )
    else:
        backbone = LlamaForCausalLM.from_pretrained(base_model, **base_kwargs)

    return PeftModel.from_pretrained(backbone, lora_weights, **adapter_kwargs)

def main(model, lora_weight, dataset, lang1, lang2, instruction_text, num_seeds, all_shots, subsample_test_set, compression, compression_length, use_partial_mask, compression_without_input, with_prompt_text, compression_without_prompt_text, with_sequence_order, compression_token_initialization, add_final_article, include_answer, load_in_8bit, load_in_4bit, only_answer, include_punctuation, include_content, include_template, include_colon, fix_cls, only_article_template, only_answer_template, not_include_colon, exclude_nxt, random_template, without_punctuation,  fixed_random, nonfixed_random, same_random, revert_random, only_one_colon):
    test_inference = True

    print("lora_weight = ", lora_weight)
    print("model = ", model)
    print("dataset = ", dataset)
    print("with_prompt_text = ", with_prompt_text)
    print("compression_without_prompt_text = ", compression_without_prompt_text)




    if test_inference:
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        base_model = 'decapoda-research/llama-7b-hf' if lora_weight is not None else model
        # base_model = model

        

        # model = LlamaForCausalLM.from_pretrained(
        #     base_model, device_map={"": device}, low_cpu_mem_usage=True, cache_dir='/network/scratch/y/yu.bai/.cache',
        # )
        # model = PeftModel.from_pretrained(
        #     model,
        #     lora_weights,
        #     device_map={"": device},
        # )
        if model == 'decapoda-research/llama-7b-hf':
            tokenizer = LlamaTokenizer.from_pretrained('baffo32/decapoda-research-llama-7B-hf', cache_dir='/network/scratch/y/yu.bai/.cache')
        elif model == 'openlm-research/open_llama_3b':
            tokenizer = LlamaTokenizer.from_pretrained('openlm-research/open_llama_3b',local_files_only=True,  cache_dir='/network/scratch/y/yu.bai/.cache')
        elif model == 'decapoda-research/llama-13b-hf':
            tokenizer = LlamaTokenizer.from_pretrained('dfurman/LLaMA-13B', cache_dir='/network/scratch/y/yu.bai/.cache')
        elif model == 'decapoda-research/llama-30b-hf':
            tokenizer = LlamaTokenizer.from_pretrained('TheBloke/llama-30b-supercot-SuperHOT-8K-fp16', cache_dir='/network/scratch/y/yu.bai/.cache')
        else:
            tokenizer = AutoTokenizer.from_pretrained(base_model,local_files_only=False,  cache_dir='/network/scratch/y/yu.bai/.cache')
        # exit()
        # if lora_weight is None:

        IGNORE_INDEX = -100
        DEFAULT_PAD_TOKEN = "[PAD]"
        DEFAULT_EOS_TOKEN = "</s>"
        DEFAULT_BOS_TOKEN = "<s>"
        DEFAULT_UNK_TOKEN = "<unk>"
        special_tokens_dict = dict()
        if tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
        if tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
        if tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
        if tokenizer.unk_token is None:
            special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

        
        num_new_tokens = smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            tokenizer=tokenizer,
            # model=model,
        )

        if lora_weight is not None:
            model = load_model_lora(base_model, device, lora_weight, compression, compression_length, use_partial_mask)
                        # unwind broken decapoda-research config
            # model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
            # model.config.bos_token_id = 1
            # model.config.eos_token_id = 2

            print("model = ", model)
        else:
            # if "llama" in model: 
            config = transformers.AutoConfig.from_pretrained(

                base_model,
                cache_dir='/network/scratch/y/yu.bai/.cache',
                local_files_only=True if 'llama' in model else False,
            )
            # else:
            # if model == 'decapoda-research/llama-7b-hf':
            #     config = transformers.AutoConfig.from_pretrained(
            #         'decapoda-research/llama-7b-hf',
            #         cache_dir='/network/scratch/y/yu.bai/.cache',
            #     )
            # else:
            #     config = transformers.AutoConfig.from_pretrained(
            #         'openlm-research/open_llama_3b',
            #         cache_dir='/network/scratch/y/yu.bai/.cache',
            #     )
            config.compression_token_initialization = compression_token_initialization
            if compression_token_initialization:
                config.initialize_ids = tokenizer("Article: N/A \n\n Answer: N/A \n\n", return_tensors="pt")['input_ids'].tolist()
            if compression: 
                
                config.vocab_size += num_new_tokens
                config.compression_size = compression_length
                config.use_partial_mask = use_partial_mask
                print("use_partial_mask = ", use_partial_mask)

                # if '13b' in base_model:
                model = LlamaForCompressionCausalLM.from_pretrained(
                    base_model,
                    config=config,
                    load_in_8bit=load_in_8bit,
                    trust_remote_code=True,
                    # tie_weights=True,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    cache_dir='/network/scratch/y/yu.bai/.cache',
                )   
                # else:
                #     model = LlamaForCompressionCausalLM.from_pretrained(
                #         base_model,
                #         config=config,
                #         # load_in_8bit=load_8bit,
                #         trust_remote_code=True,
                #         # tie_weights=True,
                #         torch_dtype=torch.float16,
                #         device_map="auto",
                #         cache_dir='/network/scratch/y/yu.bai/.cache',
                #     )                
            else:
                if 'llama' in base_model:
                    model = LlamaForCausalLM.from_pretrained(
                        base_model,
                        config=config,
                        load_in_8bit=load_in_8bit,
                        # tie_weights=True,
                        torch_dtype=torch.float16,
                        device_map="auto",
                        cache_dir='/network/scratch/y/yu.bai/.cache',
                        local_files_only=True,
                    )
                else:
                    model = AutoModelForCausalLM.from_pretrained(
                        base_model,
                        config=config,
                        load_in_8bit=load_in_8bit,
                        # tie_weights=True,
                        torch_dtype=torch.float16,
                        device_map="auto",
                        cache_dir='/network/scratch/y/yu.bai/.cache',
                        local_files_only=False,
                    )

            
        # num_new_tokens = smart_tokenizer_and_embedding_resize(
        #     special_tokens_dict=special_tokens_dict,
        #     tokenizer=tokenizer,
        #     # model=model,
        # )
        temperature=0.8
        top_p=0.75
        top_k=40
        num_beams=4
        max_new_tokens=90
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            # use_cache=True,
        )

    # dataset_name = ['agnews', 'atis', 'cb', 'dbpedia', 'lama', 'rte', 'slot-movies', 'sst2', 'trec']

    # data_path = '/home/mila/y/yu.bai/projects/sensitivity_data/few-shot-learning/data'


    # if dataset == 'atis':
    model.eval()


    
    # Different ids got different training set, so we can not just mix them up.
    if dataset == "lama":
        all_lamas = [1001,101,103,106,108,127,1303,131,136,1376,138,140,1412,159,17,176,178,19,190,20,264,27,276,279,30,31,36,361,364,37,39,407,413,449,463,47,495,527,530,740,937]
        all_params = []
        for which_lama in all_lamas:
            # p = deepcopy(default_params)
            p = {}
            p['dataset'] = f"lama_{which_lama}"
            all_params.append(p)
        
        correct = 0
        test_samples_num = 0
        samples_num = 0
        for param_index, params in enumerate(all_params):
            orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels = load_dataset(params)
            
            np.random.seed(num_seeds)
            # AgNews 7600
            # samples_num = subsample_test_set
            few_shot = all_shots
            if test_samples_num == 0:
                test_sentences, test_labels = orig_test_sentences, orig_test_labels
                samples_num += len(orig_test_labels)

            else:
                test_sentences, test_labels = random_sampling(orig_test_sentences, orig_test_labels, samples_num)

            train_sentences, train_labels = random_sampling(orig_train_sentences, orig_train_labels, few_shot)
            # print("----------Train-----------")
            # print(orig_train_sentences[:5], orig_train_labels[:5])
            # print("----------Test-----------")
            # print(test_sentences[0], test_labels[0])



            # print('----------Prompt--------------')

            for i, each_test in enumerate(test_sentences):
                # print("i = ", i, end='\r')
                prompt = construct_prompt(params, train_sentences, train_labels, each_test)
                # print("constructed prompt = ", prompt)


                if test_inference:
                    # prompt = 'The highest mountain in the world is '
                    inputs = tokenizer(prompt, return_tensors="pt")
                    input_ids = inputs["input_ids"].to(device)
                    # generate_params = {
                    #     "input_ids": input_ids,
                    #     "generation_config": generation_config,
                    #     "return_dict_in_generate": True,
                    #     "output_scores": True,
                    #     "max_new_tokens": max_new_tokens,
                    # }
                    # print("yesyesyes???")
                    with torch.no_grad():
                        if not compression:
                            generation_output = model.generate(
                                input_ids=input_ids,
                                generation_config=generation_config,
                                return_dict_in_generate=True,
                                output_scores=True,
                                max_new_tokens=max_new_tokens,
                            )
                        else:
                            generation_output = model.compression_generate(
                                input_ids=input_ids,
                                generation_config=generation_config,
                                return_dict_in_generate=True,
                                output_scores=True,
                                max_new_tokens=max_new_tokens,
                            )

                    s = generation_output.sequences[0]
                    output = tokenizer.decode(s)

                    # answer = output.split()[-1]
                    answer = output.split(':')[-1].strip()
                    # print("------------Model Output------------")
                    
                    # print("label = ", params['label_dict'][test_labels[i]])
                    # print("label = ", test_labels[i])
                    # print("generation_output = ", generation_output)
                    # print("output = ", output)
                    # if i > 20: 
                    #     exit()
                    if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec']:
                        if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                            correct += 1
                    else:
                        if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                            correct += 1
        print("model = ", model)
        print("dataset = ", dataset)
        print("num_seeds = ", num_seeds)
        print("all_shots = ", all_shots)
        print("ACC = ", correct / samples_num)

    # if dataset == "slot-movies":
    else:
        params = {
            'dataset': dataset,
            'q_prefix': "Source: ",
            'a_prefix': "Target: ",
            'prompt_prefix': "Translate the following source sentences into target sentences. \n\n",
            'task_format': "qa",
        }
        # orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels = load_dataset(params)
        raw_datasets = load_dataset(dataset,
            lang1 + '-' + lang2,
            # lang=lang,
            # lang2=lang2,
            trust_remote_code=True,
            cache_dir='/network/scratch/y/yu.bai/.cache',
            
        )
        print("raw_datasets = ", raw_datasets.keys())
        column_names = raw_datasets["devtest"].column_names
        print("column_names = ", column_names)
        source_column = 'sentence_' + lang1

        target_column = "sentence_" + lang2
        predict_dataset = raw_datasets["devtest"]
        train_dataset = raw_datasets["dev"]
        # exit()
        orig_train_sentences = []
        orig_train_labels = []
        orig_test_sentences = []
        orig_test_labels = []
        for each in train_dataset:
            orig_train_sentences.append(each[source_column])
            orig_train_labels.append(each[target_column])
        for each in predict_dataset:
            orig_test_sentences.append(each[source_column])
            orig_test_labels.append(each[target_column])
        # random.seed(6)
        # random_sample = random.sample(list(zip(sources, targets)), 5)
        # training_sources, training_targets = zip(*random_sample)
        if instruction_text is not None:
            params['prompt_prefix'] = instruction_text  

    # train_sentences = orig_train_sentences[:3]
    # train_labels = orig_train_labels[:3]
    # test_sentences = orig_test_sentences[0]
    # test_labels = orig_test_labels[0]

        # AgNews 7600
        np.random.seed(num_seeds)
        few_shot = all_shots
        if subsample_test_set == 0:
            test_sentences, test_labels = orig_test_sentences, orig_test_labels
            samples_num = len(orig_test_labels)
        else:
            samples_num = min(subsample_test_set, len(orig_test_labels))
            test_sentences, test_labels = random_sampling(orig_test_sentences, orig_test_labels, samples_num)

        final_ans = []
        for ii in range(num_seeds):
            np.random.seed(ii)
            train_sentences, train_labels = random_sampling(orig_train_sentences, orig_train_labels, few_shot)
            print("----------Train-----------")
            print(train_sentences[:8],train_labels[:8])

    
            # easier_train_sentences, easier_train_labels, easier_idx = construct_easier_order(train_sentences, train_labels)
            # harder_train_sentences, harder_train_labels, harder_idx = construct_harder_order(train_sentences, train_labels)
            # adjusted_train_sentences = [easier_train_sentences, train_sentences, harder_train_sentences]
            # adjusted_train_labels = [easier_train_labels, train_labels, harder_train_labels]

            def generate_permutations(n):
                """Return every ordering of the integers 1..n as a list of tuples.

                The orderings come from ``itertools.permutations`` and are fully
                materialised, so the list holds n! tuples.
                """
                return list(itertools.permutations(range(1, n + 1)))

            all_rank = generate_permutations(few_shot)
            adjusted_train_sentences = [[train_sentences[i - 1] for i in each_rank] for each_rank in all_rank[:1]]
            adjusted_train_labels = [[train_labels[i - 1] for i in each_rank] for each_rank in all_rank[:1]]



            # answer_tokens = params['inv_label_dict'] 
            # if random_template:
            #     if dataset == 'cb' or dataset == 'rte':
            #         params["q_prefix"]  = "fdafdasjklfdadf: {hypothesis}\nzcxvnmxcjkfdas: {premise}" 
            #         # params["q_prefix"]  = ": {hypothesis}\n: {premise}" 
            #     else:
            #     # params["q_prefix"] = "Hypothesis:{hypothesis}\nPremise: {premise}"
            #         params["q_prefix"]  = 'dsafjkldafdsajk: '
            #         # params["q_prefix"]  = ': '
            #     # params["a_prefix"]  = ': '
            #     params["a_prefix"]  = 'reqwiorewsdafjl: '
                        # answer_tokens = params['inv_label_dict'] 
            # construct_prompt_without_test = construct_prompt_without_test
            if random_template:
                # flag_random = fixed_random | nonfixed_random | same_random | revert_random
                if fixed_random:
                    construct_prompt_without_test = construct_prompt_with_random_prefix_without_test
                    construct_prompt = construct_prompt_with_random_prefix
                    if dataset == 'cb' or dataset == 'rte':
                        params["q_prefix"]  = ["fdafdasjklfdadf: {hypothesis}\nzcxvnmxcjkfdas: {premise}" for i in range(5)]
                    else:
                    # params["q_prefix"] = "Hypothesis:{hypothesis}\nPremise: {premise}"
                        params["q_prefix"]  = ['dsafjkldafdsajk: ' for i in range(5)]
                    params["a_prefix"]  = ['reqwiorewsdafjl: ' for i in range(5)]
                elif nonfixed_random:
                    construct_prompt_without_test = construct_prompt_with_random_prefix_without_test
                    # construct_prompt_without_test = construct_prompt_with_random_prefix
                    construct_prompt = construct_prompt_with_random_prefix
                    if dataset == 'cb' or dataset == 'rte': 
                        params["q_prefix"] = [
                            "fdafdasjklfdadf: {hypothesis}\nzcxvnmxcjkfdas: {premise}",
                            "gfhdajkgfhdasfj: {hypothesis}\ncvxhlkdadsajfk: {premise}",
                            "rrqetrizxcsdafq: {hypothesis}\nvncmxasdgfadsl: {premise}",
                            "mvfvxadfawewqro: {hypothesis}\nlkajsdfopsadfp: {premise}",
                            "sdsajfjdsaczvvv: {hypothesis}\nhkljfdiabasdfj: {premise}",
                        ]
                    else:
                        params["q_prefix"] = ['dsafjkldaasdfjkl: ', 'ewqroudajfsdafq: ', 'eqdashcxzlreqguio: ', 'cxzvadeqrczxdsa: ', 'vcxnkfgahvczxkl: ']
                    params["a_prefix"] = ['xiadfjdsalgfweqrjl: ', 'yufoufgaddavfdnsl: ', 'afdgvcxjlzxnvxzla: ', 'fgsgfskjvcdafds: ', 'dafhglajfdvcaol: ']
                elif same_random:
                    # construct_prompt_without_test = construct_prompt_with_random_prefix
                    construct_prompt_without_test = construct_prompt_with_random_prefix_without_test
                    construct_prompt = construct_prompt_with_random_prefix
                    if dataset == 'cb' or dataset == 'rte':
                        params["q_prefix"]  = ["fdafdasjklfdadf: {hypothesis}\nfdafdasjklfdadf: {premise}" for i in range(5)]
                    else:
                        params["q_prefix"] = ['fdafdasjklfdadf: ' for i in range(5)]
                    params["a_prefix"] = ['fdafdasjklfdadf: ' for i in range(5)]
                
                elif revert_random:
                    # construct_prompt_without_test = construct_prompt_with_random_prefix
                    construct_prompt_without_test = construct_prompt_with_random_prefix_without_test
                    construct_prompt = construct_prompt_with_random_prefix
                    if dataset == 'cb' or dataset == 'rte':
                        params["q_prefix"]  = ["Answer: {hypothesis}\n Hypothesis: {premise}" for i in range(5)]
                        params['a_prefix'] = ['Premise: ' for i in range(5)]
                    else:
                        tmp_string = params["q_prefix"]
                        params["q_prefix"] = [params["a_prefix"] for i in range(5)]
                        params["a_prefix"] = [tmp_string for i in range(5)]
                else:
                    if dataset == 'cb' or dataset == 'rte':
                        params["q_prefix"]  = "fdafdasjklfdadf: {hypothesis}\nzcxvnmxcjkfdas: {premise}" 
                        # params["q_prefix"]  = ": {hypothesis}\n: {premise}" 
                    else:
                    # params["q_prefix"] = "Hypothesis:{hypothesis}\nPremise: {premise}"
                        params["q_prefix"]  = 'dsafjkldafdsajk: '
                        # params["q_prefix"]  = ': '
                    # params["a_prefix"]  = ': '
                    params["a_prefix"]  = 'reqwiorewsdafjl: '
                
            else:
                from utils import construct_prompt, construct_prompt_without_test # , construct_prompt_instance_prompt_text 
                # params["q_prefix"] = "dasjfklfdjgdslc: "
                # params["a_prefix"] = "dfagfdsretrqwer: "
            # count = 3
            # while count > 0:
            count = 0
            # best_res = 0
            # best_rank = None
            # res_list = []
            final_res = {}
            position_res = {}
            final_attention_mask = {}
            final_anti_attention_mask = {}



            print("inlcude_punctuation = ", include_punctuation)
            print("inlcude_template = ", include_template)
            print("inlcude_answer = ", include_answer)
            print("only_answer = ", only_answer)
            print("include_content = ", include_content)
            print("add_final_article = ", add_final_article)
            if include_punctuation or without_punctuation:
                tmp_params = {}
                for each in params:
                    tmp_params[each] = copy.deepcopy(params[each])
                if include_colon:
                    tmp_params["prompt_prefix"] += '\n\n : '
                # print("punctuation prompt = ", tmp_params["prompt_prefix"] + '\n\n') 
                stopwords_str = ''
                # for each in stopwords:
                #     stopwords_str = stopwords_str + " " + each
                for each in adjusted_train_labels[0]:
                    stopwords_str = stopwords_str + " " + each


                punctuation_prompt = tokenizer(tmp_params["prompt_prefix"]  + "\n" + stopwords_str + '\n\n', return_tensors="pt")['input_ids'].to(device)        
            if include_template:
                tmp_params = {}
                for each in params:
                    tmp_params[each] = copy.deepcopy(params[each])
                if only_answer_template:
                    if not_include_colon:
                        if dataset == 'cb' or dataset == 'rte':
                            tmp_params["q_prefix"] = ' {hypothesis}\n {premise} ' 
                        else:
                            tmp_params["q_prefix"] = ' '  
                        tmp_params["a_prefix"] = tmp_params["a_prefix"].split(":")[0] + ' '
                    else:
                        if dataset == 'cb' or dataset == 'rte':
                            tmp_params["q_prefix"] = ': {hypothesis}\n: {premise} ' 
                        else:
                            tmp_params["q_prefix"] = ': '  
                if only_article_template:
                    if not_include_colon:
                        tmp_params["a_prefix"] = ' '  
                        if dataset == 'cb' or dataset == 'rte':
                            # tmp_params["q_prefix"] = ': {hypothesis}\n: {premise} ' 
                            tmp_params["q_prefix"] = "Hypothesis {hypothesis}\nPremise {premise}"
                        else:
                            tmp_params["q_prefix"] = tmp_params["q_prefix"].split(":")[0] + ' '
                    else:
                        tmp_params["a_prefix"] = ': '  
                

                tmp_params["prompt_prefix"] = ''
                if include_answer:
                    if isinstance(adjusted_train_sentences[0][0], dict):
                        empty_sentences = [{'hypothesis':'', 'premise': ''} for each in adjusted_train_labels[0]]
                        if only_answer:
                            if not_include_colon:
                                tmp_params["q_prefix"] = ' {hypothesis}\n {premise} ' 
                                tmp_params["a_prefix"] = ' '  
                            else:
                                tmp_params["q_prefix"] = ': {hypothesis}\n: {premise} ' 
                                tmp_params["a_prefix"] = ': ' 
                    else:
                        empty_sentences = ['' for each in adjusted_train_labels[0]]
                        if only_answer:
                            if not_include_colon:
                                tmp_params["q_prefix"] = ' ' 
                                tmp_params["a_prefix"] = ' '  
                            else:
                                tmp_params["q_prefix"] = ': ' 
                                tmp_params["a_prefix"] = ': ' 
                    

                    if not only_answer and not only_answer_template and not only_article_template and not_include_colon:
                        if isinstance(adjusted_train_sentences[0][0], dict):
                            tmp_params["q_prefix"] = "Hypothesis {hypothesis}\nPremise {premise}"
                        else:
                            tmp_params["q_prefix"] = tmp_params["q_prefix"].split(":")[0] + ' '
                        tmp_params["a_prefix"] = tmp_params["a_prefix"].split(":")[0] + ' '


                    template_prompt, _ = construct_prompt_without_test(tmp_params, empty_sentences, adjusted_train_labels[0], '')
                    print("template_prompt = ", template_prompt)

                    # exit()
                    if exclude_nxt:
                        template_prompt = template_prompt.replace('\n', '    ')
                    

                    template_prompt = tokenizer(template_prompt, return_tensors="pt")['input_ids'].to(device)

                else:
                    if isinstance(adjusted_train_sentences[0][0], dict):
                        empty_sentences = [{'hypothesis':'', 'premise': ''} for each in adjusted_train_labels[0]]
                        if only_answer:
                            if not_include_colon:
                                tmp_params["q_prefix"] = ' {hypothesis}\n {premise} ' 
                                tmp_params["a_prefix"] = ' '  
                            else:
                                tmp_params["q_prefix"] = ': {hypothesis}\n: {premise} ' 
                                tmp_params["a_prefix"] = ': ' 
                    else:
                        empty_sentences = ['' for each in adjusted_train_labels[0]]
                        if only_answer:
                            if not_include_colon:
                                tmp_params["q_prefix"] = ' ' 
                                tmp_params["a_prefix"] = ' '  
                            else:
                                tmp_params["q_prefix"] = ': ' 
                                tmp_params["a_prefix"] = ': ' 
                    if not only_answer and not only_answer_template and not only_article_template and not_include_colon:
                        if isinstance(adjusted_train_sentences[0][0], dict):
                            tmp_params["q_prefix"] = "Hypothesis {hypothesis}\nPremise {premise}"
                        else:
                            tmp_params["q_prefix"] = tmp_params["q_prefix"].split(":")[0] + ' '
                        tmp_params["a_prefix"] = tmp_params["a_prefix"].split(":")[0] + ' '
                    # empty_sentences = ['' for each in adjusted_train_labels[0]]
                    template_prompt, _ = construct_prompt_without_test_emptyanswer(tmp_params, empty_sentences, empty_sentences, '')
                    # print("template_prompt = ", template_prompt)
                    if exclude_nxt:
                        template_prompt = template_prompt.replace('\n', '    ')
                    template_prompt = tokenizer(template_prompt, return_tensors="pt")['input_ids'].to(device)           
            if include_content:
                tmp_params = {}
                for each in params:
                    tmp_params[each] = copy.deepcopy(params[each])
                # content_prompt = tokenizer(' '.join(train) + '\n\n', return_tensors="pt")['input_ids'].to(device)   
                # use this to avoid always deleting answers
                tmp_params["prompt_prefix"] = ''
                if isinstance(adjusted_train_sentences[0][0], dict):
                    empty_sentences = [{'hypothesis':'', 'premise': ''} for each in adjusted_train_labels[0]]
                    #  should not have this
                    # tmp_params["q_prefix"] = ''
                else:
                    empty_sentences = ['' for each in adjusted_train_labels[0]]
                # here use the original params to exclude the instruction tokens.
                # no it should not be excluded since the following code do that.
                # also this setting should include the answer
                # if include_answer, then the anti mask should not include the answer

                # use this to include : into content words
                if include_colon:
                    tmp_params["q_prefix"] = tmp_params["q_prefix"].split(':')[0] + ' '
                    tmp_params["a_prefix"] = tmp_params["a_prefix"].split(':')[0] + ' '
                if include_answer:
                    new_template_prompt, _ = construct_prompt_without_test_emptyanswer(tmp_params, empty_sentences, ['' for each in adjusted_train_labels[0]], '')
                else:
                    new_template_prompt, _ = construct_prompt_without_test(tmp_params, empty_sentences, adjusted_train_labels[0], '')
                # print("anti_content_prompt = ", new_template_prompt)
                new_template_prompt = tokenizer(new_template_prompt, return_tensors="pt")['input_ids'].to(device)
                anti_content_prompt = new_template_prompt




            for tmp_train_sentences, tmp_train_labels in zip(adjusted_train_sentences, adjusted_train_labels):
                correct = 0
                prompt, _ = construct_prompt_without_test(params, tmp_train_sentences, tmp_train_labels, '')
                if add_final_article:
                    prompt += params["q_prefix"]
                # print("prompt = ", prompt)



                if test_inference:
                    inputs = tokenizer(prompt, return_tensors="pt")
                    input_ids = inputs["input_ids"].to(device)

                    # template tokens mask
                    tmp_mask = torch.zeros_like(input_ids).to(device).bool()
                    # if include_answer:
                        # print("template_prompt = ", template_prompt)
                        # exit()

                    if include_content:
                        new_tmp_mask = torch.zeros_like(input_ids).to(device).bool()
                        for each in anti_content_prompt[0]:
                            new_tmp_mask = new_tmp_mask | input_ids.eq(each).bool()
                        tmp_mask = tmp_mask | (~new_tmp_mask).bool()

                    if include_punctuation:
                        for each in punctuation_prompt[0]:
                            tmp_mask = tmp_mask | input_ids.eq(each)
                    if without_punctuation:
                        new_tmp_mask = torch.zeros_like(input_ids).to(device).bool()
                        for each in punctuation_prompt[0]:
                            new_tmp_mask = new_tmp_mask | input_ids.eq(each).bool()
                        tmp_mask = tmp_mask & (~new_tmp_mask).bool()

                    if include_template:
                        for each in template_prompt[0]:
                            tmp_mask = tmp_mask | input_ids.eq(each)
                        if only_one_colon:
                            assert (only_article_template or only_answer_template) and not not_include_colon
                            colon_token = ":"
                            colon_token_id = tokenizer.convert_tokens_to_ids(colon_token)
                            # colon_mask = input_ids.eq(colon_token_id)
                            rows, cols = input_ids.shape
                            # for col, row in input_ids.size():
                            for row in range(rows):
                                for col in range(cols):
                                    if input_ids[row][col] == colon_token_id:
                                        assert col > 1
                                        assert tmp_mask[row][col] == True
                                        if tmp_mask[row][col - 1] == False:
                                            tmp_mask[row][col] = False


                            # for each in input_ids


                        
                    

                    # print("tmp_mask = ", tmp_mask)
                    # print("input_ids = ", input_ids)
                    # exit()
                    # else:



                    # tmp_anti_mask = ~tmp_mask


                    position_ids = torch.ones(input_ids.size()).to(input_ids.device)
                    position_ids = position_ids.long().cumsum(-1) - 1

                    with torch.no_grad():
                        # generation_output = model.generate(
                        #     input_ids=input_ids,
                        #     generation_config=generation_config,
                        #     return_dict_in_generate=True,
                        #     output_scores=True,
                        #     max_new_tokens=max_new_tokens,
                        # )
                        output = model(input_ids, output_hidden_states=True)
                        hidden_states = output.hidden_states

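                    # Take out, one by one, the representation of every layer for each sample,
                    # then store them in a file: convert to a list and dump it as JSON.
                    # For now all representations are inspected; when doing the augmentation later, the instruction's ones must be added too.
                    # When writing the code the q/k/v results are still needed, hmm.
                    # Better to write a representation -> kv conversion; that should be easier, and at least avoids many changes inside generation_utils?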
                    # 13 = convert_tokens_to_ids('<0x0A>')
                    if 'llama' in base_model:
                        nxtline_id = tokenizer.convert_tokens_to_ids('<0x0A>')
                    else:
                        nxtline_id = tokenizer("\n")['input_ids'][0]
                        


                    # print("nxtline_id = ", nxtline_id)
                    all_mask = input_ids.eq(nxtline_id).long()
                    all_mask = torch.cumsum(all_mask, dim=-1)
                    # # print("all_mask = ", all_mask)
                    # # correct the positions (the line below shifts the cumulative counts right by one)
                    all_mask =torch.cat((all_mask[:, :1], all_mask[:, :-1]), dim=-1)
                    # # print("all_mask2 = ", all_mask)
                    # # print("all_mask.gt = ", all_mask.ge(2))
                    # # print("all_mask.le = ", all_mask.lt(5))



                    # mask_list = []
                    # for i in range(few_shot):
                    #     tmp_mask = all_mask.ge(2 + (i * 3)) & all_mask.lt(2 + (i + 1) * 3)
                    #     mask_list.append(tmp_mask)

                    if count == 0:
                        instruction_mask = all_mask.lt(2)
                    #     instruction_rep = [torch.masked_select(hidden_states[i], instruction_mask.unsqueeze(-1).repeat(1, 1, hidden_states[i].size(-1))).view(1, -1, hidden_states[i].size(-1)) for i in range(len(hidden_states))]
                    #     instruction_pos = torch.masked_select(position_ids, instruction_mask).view(1, -1)
                    #     # instruction_rep needs to be handled here
                    #     # print("instruction_rep = ", instruction_rep.size())
                    #     # print(instruction_rep)


                    # result_list = [[] for i in range(len(mask_list))]
                    # position_list = [None for i in range(len(mask_list))]
                    
                    # for demo, demo_mask in enumerate(mask_list):
                    #     # real_demo = all_rank[count][demo]
                    #     # print("real_demo = ", real_demo)
                    #     for layer, hidden_state in enumerate(hidden_states):
                    #         sentence_rep =  torch.masked_select(hidden_state, demo_mask.unsqueeze(-1).repeat(1, 1, hidden_state.size(-1))).view(1, -1,hidden_state.size(-1))
                    #         # what is recorded here is per sentence.
                    #         # result_list[demo].append(sentence_rep.mean(dim=-2).tolist())
                    #         result_list[demo].append(sentence_rep)
                    #         # print("sentence_rep = ", sentence_rep.size())
                    #         # print(sentence_rep)
                    #         # exit()
                    #     position_list[demo] = torch.masked_select(position_ids, demo_mask).view(1, -1)
                    # print("prompt = ", prompt)
                    # print("input_ids = ", input_ids)
                    # print("tmp_mask = ", tmp_mask.masked_fill(instruction_mask, True))
                    # print("anti_tmp_mask = ",  (~tmp_mask).masked_fill(instruction_mask, True))
                    # exit()

                    position_res[str(all_rank[count])] = position_ids
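                    # Save to a file, then write a Python script that processes the file and computes a similarity matrix.
                    # When reading it back, convert each permutation to a string; that should be enough.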
                    final_res[str(all_rank[count])] = list(hidden_states)
                    final_attention_mask[str(all_rank[count])] = tmp_mask.bool().masked_fill(instruction_mask, True)
                    final_anti_attention_mask[str(all_rank[count])] = (~(tmp_mask.bool())).masked_fill(instruction_mask, True)
                    if count == 0:
                        print("final_unmasked_tokens = ", tokenizer.decode((input_ids * final_attention_mask[str(all_rank[count])].long())[0]))
                        # exit()
                    count += 1





            # print('final_res = ', final_res)

            origin_prefix = copy.deepcopy(params['prompt_prefix'])
            
            count = 0
            for tmp_train_sentences, tmp_train_labels in zip(adjusted_train_sentences, adjusted_train_labels):
                # only_single_sentence = tmp_train_sentences[-1:]
                # only_single_label = tmp_train_labels[-1:]
                current_rank = all_rank[count]
                # # print("---------------------------------------")
                # # print("current_rank = ", current_rank)


                # correct_single = 0
                # # print("prompt = ", prompt)
                # # for single sentence
                # for i, each_test in enumerate(test_sentences):
                #     params['prompt_prefix'] = origin_prefix
                #     # print("origin_prefix = ", or)
                #     prompt = construct_prompt(params, only_single_sentence, only_single_label, each_test)
                #     if test_inference:
                #         inputs = tokenizer(prompt, return_tensors="pt")
                #         input_ids = inputs["input_ids"].to(device)

                #         with torch.no_grad():
                #             generation_output = model.generate(
                #                 input_ids=input_ids,
                #                 generation_config=generation_config,
                #                 return_dict_in_generate=True,
                #                 output_scores=True,
                #                 max_new_tokens=max_new_tokens,
                #             )

                #         s = generation_output.sequences[0]
                #         s = s.masked_fill(s.eq(-99), 0)
                #         output = tokenizer.decode(s)
                #         answer = output.split(':')[-1].replace("</s>", '').strip()
                        
                #         if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                #             if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                #                 correct_single += 1
                #         else:
                #             if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                #                 correct_single += 1
                #     # pass


                # # print("model = ", model)
                # # print("dataset = ", dataset)
                # # print("num_seeds = ", num_seeds)
                # # print("all_shots = ", all_shots)
                # # print("single_sentence = ", only_single_sentence)
                # print("tmp_rank = ", all_rank[count])
                # print("ACC_single = ", correct_single / samples_num)
                # # count += 1
            

                correct_rep = 0
                correct_anti_rep = 0
                # print("prompt = ", prompt)
                # for single sentence
                # gold_target = []
                system_output = []
                for i, each_test in enumerate(test_sentences):
                    # prev_prefix = params['prompt_prefix']
                    params['prompt_prefix'] = ""
                    if add_final_article:
                        params["q_prefix"] = ""

                    prompt = construct_prompt(params, [] , [], each_test)
                    params['prompt_prefix'] = origin_prefix
                    # prompt = each_test

                    input_reps1 = final_res[str(current_rank)][:]
                    # input_reps2 = final_res[str(current_rank)][:]
                    cur_attention_mask = final_attention_mask[str(current_rank)]
                    anti_attention_mask = final_anti_attention_mask[str(current_rank)]
                    # layer_num * seqlen * dim
                    # print("instruction_rep = ", instruction_rep.size())
                    # print(instruction_rep)
                    # print("input_rep = ")
                    # print(input_rep)
                    # layers
                    # print("input_rep = ", len(input_reps))
                    # print(input_rep.size())
                    # 1 * seqlen * dim
                    # print(input_reps[0].size())
                    # input_reps = [torch.concat((instruction_rep[i], each_rep), dim=-2) for i, each_rep in enumerate(input_reps)]
                    # print("new input_rep = ", input_reps[0])
                    # exit()
                    if test_inference:
                        inputs = tokenizer(prompt, return_tensors="pt")
                        if fix_cls:
                            input_ids = inputs["input_ids"][:,1:].to(device)
                        else:
                            input_ids = inputs["input_ids"].to(device)

                        # attention
                        prompt_pos = torch.ones(input_ids.size()).to(input_ids.device)
                        prompt_pos = prompt_pos.long().cumsum(-1) + position_res[str(current_rank)][0][-1]



                        attention_mask = torch.ones(prompt_pos.size()).to(input_ids.device)

                        anti_attention_mask = torch.cat((anti_attention_mask, attention_mask), dim=-1)
                        attention_mask = torch.cat((cur_attention_mask, attention_mask), dim=-1)



                        position_ids = torch.cat((position_res[str(current_rank)], prompt_pos), dim=-1)
                        # print("position_ids = ", position_ids.size())
                        # print(position_ids)


                        with torch.no_grad():
                            # print("??")
                            generation_output = model.generate(
                                input_ids=input_ids,
                                input_reps=input_reps1,
                                position_ids=position_ids,
                                attention_mask=attention_mask,
                                generation_config=generation_config,
                                return_dict_in_generate=True,
                                output_scores=True,
                                early_stopping=True, 
                                eos_token_id=nxtline_id,
                                max_new_tokens=max_new_tokens,
                            )
                            # print("generation_output = ", generation_output)
                        # exit()

                        s = generation_output.sequences[0]
                        s = s.masked_fill(s.eq(-99), 0)
                        output = tokenizer.decode(s)
                        # prin
                        answer = output.split(':')[-1].replace("</s>", '').strip()
                        
                        # print('output = ', output)
                        # print('answer = ', answer)
                        # print("label[i] = ", params['label_dict'][test_labels[i]])
                        # if i >= 20:
                        #     exit()

                        # exit()
                        # 
                        # if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                        #     if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                        #         correct_rep += 1
                        # else:
                        #     if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                        #         correct_rep += 1
                        # gold_target.append(test_labels[i])
                        system_output.append(answer)
                            



                        # with torch.no_grad():
                        #     # print("??")
                        #     generation_output = model.generate(
                        #         input_ids=input_ids,
                        #         input_reps=input_reps2,
                        #         position_ids=position_ids,
                        #         attention_mask=anti_attention_mask,
                        #         generation_config=generation_config,
                        #         return_dict_in_generate=True,
                        #         output_scores=True,
                        #         max_new_tokens=max_new_tokens,
                        #     )
                        #     # print("generation_output = ", generation_output)
                        # # exit()

                        # s = generation_output.sequences[0]
                        # s = s.masked_fill(s.eq(-99), 0)
                        # output = tokenizer.decode(s)
                        # answer = output.split(':')[-1].replace("</s>", '').strip()
                        
                        # # print('output = ', output)
                        # # print('answer = ', answer)
                        # # print("label[i] = ", params['label_dict'][test_labels[i]])
                        # # if i >= 20:
                        # #     exit()

                        # # exit()
                        # # 
                        # if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                        #     if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                        #         correct_anti_rep += 1
                        # else:
                        #     if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                        #         correct_anti_rep += 1
                    # pass



                # print("dataset = ", dataset)
                # print("num_seeds = ", num_seeds)
                # print("all_shots = ", all_shots)
                # print("single_sentence = ", only_single_sentence)
                # print("tmp_rank = ", all_rank[count])
                metric = evaluate.load("bleu")
                def postprocess_text(preds, labels):
                    """Strip each text and lay it out with one sentence per line.

                    Both arguments are iterables of strings. New lists are built, so
                    the inputs are left untouched.

                    Returns:
                        A ``(preds, labels)`` tuple of the processed lists.
                    """
                    # Strip everything first, predictions before labels.
                    cleaned_preds = [text.strip() for text in preds]
                    cleaned_labels = [text.strip() for text in labels]

                    def one_sentence_per_line(texts):
                        # rougeLSum expects a newline after each sentence, so split
                        # with nltk and re-join the sentences with "\n".
                        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

                    formatted_preds = one_sentence_per_line(cleaned_preds)
                    formatted_labels = one_sentence_per_line(cleaned_labels)
                    return formatted_preds, formatted_labels
                decoded_preds, decoded_labels = postprocess_text(system_output, test_labels)
                result = metric.compute(predictions=decoded_preds, references=decoded_labels)
                # result = {k: round(v * 100, 4) for k, v in result.items()}
                print(result)

                # print(correct_rep / samples_num)
                final_ans.append(result['bleu'])
                # print("Anti ACC_rep = ", correct_anti_rep / samples_num)
            


            

                count += 1
            # file_prefix = 'rep_save/'
            # file_name = file_prefix + base_model.split('/')[-1] + "_"+ str(num_seeds) + '.json'
            # res_name = file_prefix + base_model.split('/')[-1] + "_"+ str(num_seeds) + '.res'

            # json_data = json.dumps(final_res, indent=4)
            # with open(file_name, 'w') as file:
            #     file.write(json_data)
            # with open(res_name, 'w') as file:
            #     file.write(' ')
                
            # exit()


        import statistics
        print(final_ans)
        print(statistics.mean(final_ans))
        print(statistics.stdev(final_ans))
                    

                    # s = generation_output.sequences[0]
                    # s = s.masked_fill(s.eq(-99), 0)
                    # exit()
                    # output = tokenizer.decode(s)

                    # answer = output.split()[-1]
                    # answer = output.split(':')[-1].replace("</s>", '').strip()
                    # if compression:
                    #     print("------------Model Output------------")
                        
                    #     print("s = ", s)
                    #     print("output = ", output)
                    #     print("answer = ", answer)
                    #     print("label = ", params['label_dict'][test_labels[i]])
                    #     print("label = ", test_labels[i])
                    #     if i > 20: 
                    #         exit()
                        # print("output = ", output)
                        # print("generation_output = ", generation_output)
                        # exit()
                    # exit()

                        
                #         if dataset in ['cb', 'rte', 'dbpedia', 'sst2', 'trec', 'agnews']:
                #             if answer in params['inv_label_dict'].keys() and test_labels[i] == params['inv_label_dict'][answer]:
                #                 correct += 1
                #         else:
                #             if answer.startswith(test_labels[i]) or test_labels[i].startswith(answer):
                #                 correct += 1

            # print final results
            # harder_rank, easier_rank, full_rank

            # print("rank_number = ", len(all_rank))
        # print("all_permutation = ", all_rank)


        # sorted_zipped = sorted(zip(res_list, all_rank), key=lambda x: x[0], reverse=True)

        # new_res_list, new_rank = zip(*sorted_zipped)
        # harder_idx = tuple([each + 1 for each in harder_idx])
        # easier_idx = tuple([each + 1 for each in easier_idx])
        # print("harder_idx = ", harder_idx)
        # print("easier_idx = ", easier_idx)
        # print("harder_rank = ", new_rank.index(harder_idx))
        # print("harder_res = ", new_res_list[new_rank.index(harder_idx)])
        # print("easier_rank = ", new_rank.index(easier_idx))
        # print("easier_res = ", new_res_list[new_rank.index(easier_idx)])

        # print("best_res = ", best_res)
        # print("best_permutation = ", best_rank)
        # print("worst_res = ", new_res_list[-1])
        # print("worst_permutation = ", new_rank[-1])

        # print("average_res = ", sum(new_res_list)/len(new_res_list))
        # print("mid_res = ", new_res_list[len(new_res_list) // 2])



        
            # print("")
            # count -= 1
            # change the order of the training sentences
            # combined = list(zip(train_sentences, train_labels))
            # random.shuffle(combined)
            # train_sentences, train_labels = zip(*combined)










if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # required arguments
    parser.add_argument('--model', dest='model', action='store', required=True, help='name of model(s), e.g., GPT2-XL')
    parser.add_argument('--lora_weight', dest='lora_weight', action='store', required=False, default=None, help='name of model(s), e.g., GPT2-XL')
    parser.add_argument('--dataset', dest='dataset', action='store', required=True, help='name of dataset(s), e.g., agnews')
    parser.add_argument('--lang1', dest='lang1', action='store', required=True, help='name of dataset(s), e.g., agnews')
    parser.add_argument('--lang2', dest='lang2', action='store', required=True, help='name of dataset(s), e.g., agnews')
    parser.add_argument('--num_seeds', dest='num_seeds', action='store', required=True, help='num seeds for the training set', type=int)
    parser.add_argument('--add_final_article', dest='add_final_article', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--include_answer', dest='include_answer', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--include_punctuation', dest='include_punctuation', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--without_punctuation', dest='without_punctuation', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--include_template', dest='include_template', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--include_content', dest='include_content', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')                        
    parser.add_argument('--include_colon', dest='include_colon', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')    
    parser.add_argument('--random_template', dest='random_template', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--exclude_nxt', dest='exclude_nxt', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')    
    parser.add_argument('--not_include_colon', dest='not_include_colon', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')    
    parser.add_argument('--only_answer', dest='only_answer', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--only_article_template', dest='only_article_template', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--only_answer_template', dest='only_answer_template', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    parser.add_argument('--fix_cls', dest='fix_cls', action='store_const', const=True, default=False,
                        help='whether to test the sequence order of the model')
    # other arguments
    parser.add_argument('--subsample_test_set', dest='subsample_test_set', action='store', required=False, type=int,
                        default=None, help='size of test set to use to speed up eval. None means using all test set')
    parser.add_argument('--fixed_random', dest='fixed_random', action='store_const', const=True, default=False,
                        help='whether to use the compression generation mode')
    parser.add_argument('--nonfixed_random', dest='nonfixed_random', action='store_const', const=True, default=False,
                        help='whether to use the compression generation mode')                   
    parser.add_argument('--same_random', dest='same_random', action='store_const', const=True, default=False,
                        help='whether to use the compression generation mode')    
    parser.add_argument('--revert_random', dest='revert_random', action='store_const', const=True, default=False,
                        help='whether to use the compression generation mode')    
    # parser.add_argument('--api_num_log_prob', dest='api_num_log_prob', action='store', required=False, type=int,
    #                     default=100, help='number of top tokens to ask for when querying the model. Capped at 100 for OpenAI GPT-3 API')
    # parser.add_argument('--bs', dest='bs', action='store', required=False, type=int, default=None,
    #                     help='batch size for model queries. For OpenAI API, capped at 20. For local running, set this to max out your GPU memory.')
    # flags
    # parser.add_argument('--use_saved_results', dest='use_saved_results', action='store_const', const=True, default=False,
    #                     help='whether to load the results from pickle files and not run the model')
    # Boolean feature switches. Every flag defaults to False, becomes True
    # when present on the command line, and is stored under the flag's own
    # name (argparse derives ``dest`` from the long option string).
    # NOTE(review): several help strings used to be copy-pasted from a
    # neighbouring flag (--load_in_8bit, --load_in_4bit,
    # --compression_without_input, --compression_without_prompt_text,
    # --compression_token_initialization). They now restate what the flag name
    # says; confirm the exact behaviour against main() and refine if needed.
    parser.add_argument('--compression', action='store_true',
                        help='whether to use the compression generation mode')
    parser.add_argument('--only_one_colon', action='store_true',
                        help='in template mask, we only retain one colon when we ablating the input and output template')
    # presumably forwarded to the model loader -- TODO confirm in main()
    parser.add_argument('--load_in_8bit', action='store_true',
                        help='whether to load the model in 8-bit mode')
    parser.add_argument('--load_in_4bit', action='store_true',
                        help='whether to load the model in 4-bit mode')
    parser.add_argument('--compression_without_input', action='store_true',
                        help='whether to use the compression generation mode without the input')
    parser.add_argument('--use_partial_mask', action='store_true',
                        help='whether to use the partial_mask')
    parser.add_argument('--with_prompt_text', action='store_true',
                        help='whether to use the prompt text')
    parser.add_argument('--compression_without_prompt_text', action='store_true',
                        help='whether to use the compression generation mode without the prompt text')
    parser.add_argument('--with_sequence_order', action='store_true',
                        help='whether to test the sequence order of the model')
    parser.add_argument('--compression_token_initialization', action='store_true',
                        help='whether to enable the compression token initialization')
    # parser.add_argument('--with_prompt_text', dest='with_prompt_text', action='store_const', const=True, default=False,
    #                     help='whether to use the prompt text')
    # Options that carry a value. ``--all_shots`` is mandatory; the other two
    # are optional and stay None when omitted.
    parser.add_argument('--compression_length', type=int, default=None,
                        help='num of compression_tokens')
    parser.add_argument('--all_shots', type=int, required=True,
                        help='num training examples to use')
    # The help text used to be a copy of the --all_shots description, which
    # is wrong for a free-form string option.
    parser.add_argument('--instruction_text', type=str, default=None,
                        help='instruction text to use (optional)')
    # compression_token_initialization

    # Parse the command line and flatten the resulting Namespace into a plain
    # dict, so that every option can be handed to main() as a keyword argument.
    args = vars(parser.parse_args())

    # simple processing
    # (disabled helper for comma-separated list options, kept for reference)
    # def convert_to_list(items, is_int=False):
    #     if is_int:
    #         return [int(s.strip()) for s in items.split(",")]
    #     else:
    #         return [s.strip() for s in items.split(",")]

    # args['models'] = convert_to_list(args['models'])
    # args['datasets'] = convert_to_list(args['datasets'])
    # args['all_shots'] = convert_to_list(args['all_shots'], is_int=True)

    # Each parsed option is forwarded under its ``dest`` name.
    main(**args)