import os

# GPT 复合函数
import argparse

import subprocess
import sys
import time
from typing import Dict, List, Optional, Sequence

# ---------------------------------------------------------------------------
# Launcher: parses a few hyper-parameters from the command line and starts
# `python3 -m main` once per entry of EMBEDDING_MEANS, with the selected GPU
# exported through CUDA_VISIBLE_DEVICES.
# ---------------------------------------------------------------------------

# Fixed experiment settings (not exposed on the command line).
OPTIM = 'AdamW'
TARGET = 'similar'
LR = 1e-5
BATCH_SIZE = 1000
SCHEDULER = 'GradualWarmupScheduler_CosineAnnealingLR'  # alternative: 'None'
MODEL = 'GPT_specific_test_for_resnet'  # alternative: 'GPT2_prenorm_RoPE_onehot'
DATA_SIZE = 900000
SEED = 1984
EMBEDDING_MEANS = (0.,)

# Dataset splits. 'xm0' denotes samples with x mod seq-1 = 0 (the test set);
# 'xel' ("x else") denotes the remaining samples (the training set).
# NOTE: the dataset has been changed!
DNAME = ('xm0', 'xel')
DMODE = DNAME
DTRAIN = (0, 1)
DSHOW = (1, 0)
DPERCENT = (0.05, 0.95)

# Per-group initialisation std options that fall back to --all_std.
STD_OPTIONS = ('embedding_std', 'qk_std', 'vo_std', 'mlp_std')

# Debugging hint: set os.environ['CUDA_LAUNCH_BLOCKING'] = '1' before
# launching to make CUDA kernel launches synchronous.


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the launcher's command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace. The per-group std options are ``None`` when
        they were not given explicitly (see :func:`resolve_stds`).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--optim_multiplier', type=float, default=15)
    parser.add_argument('--gpu_id', type=int, default=5,
                        help='value exported as CUDA_VISIBLE_DEVICES')
    parser.add_argument('--nh', type=int, default=1)
    parser.add_argument('--nl', type=int, default=1)
    # The per-group stds default to None so that an explicitly supplied
    # value can be told apart from "not given".
    for option in STD_OPTIONS:
        parser.add_argument(f'--{option}', type=float, default=None,
                            help='defaults to --all_std when omitted')
    parser.add_argument('--all_std', type=float, default=0.85,
                        help='std used for every group not set explicitly')
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--freeze_embedding', type=int, default=0)
    parser.add_argument('--activation', type=str, default='tanh')
    return parser.parse_args(argv)


def resolve_stds(args: argparse.Namespace) -> argparse.Namespace:
    """Fill every unset per-group std with ``args.all_std``.

    Previously ``--all_std`` (which always has a value) unconditionally
    overwrote the per-group options, so ``--qk_std`` etc. were ignored.
    Now an explicitly passed per-group value wins; the default invocation
    yields the same values as before.

    Args:
        args: Namespace returned by :func:`parse_args`; modified in place.

    Returns:
        The same namespace, for convenient chaining.
    """
    for option in STD_OPTIONS:
        if getattr(args, option) is None:
            setattr(args, option, args.all_std)
    return args


def build_proj_name(args: argparse.Namespace) -> str:
    """Return the project name passed to the training module as ``-pname``."""
    return (f'seed_experiment_LTP_prenorm_{args.activation}_nh_{args.nh}'
            f'_nl_{args.nl}_std_{args.all_std}_lr_{LR}')


def build_command(args: argparse.Namespace, embedding_mean: float,
                  dir_suffix: str, proj_name: str) -> List[str]:
    """Build the argv list that launches ``python3 -m main``.

    The command is returned as a list (not a shell string) so that values
    containing spaces or shell metacharacters reach the training module
    verbatim instead of being re-interpreted by a shell.

    Args:
        args: Parsed and std-resolved launcher options.
        embedding_mean: Value forwarded as ``-embedding_mean``.
        dir_suffix: Value forwarded as ``-dir_suffix``.
        proj_name: Value forwarded as ``-pname``.

    Returns:
        The full argument vector; every element is a ``str``.
    """
    # An earlier variant launched `main_dynamic` instead, with -ne 150,
    # -dk 64, --optim_eta_min 1e-7 and extra -suffix / --dtype options.
    options = [
        ('-data_size', DATA_SIZE),
        ('-seed', SEED),
        ('-func', TARGET),
        ('-lr', LR),
        ('-m', MODEL),
        ('-scheduler', SCHEDULER),
        ('-ne', 20),
        ('-nl', args.nl),
        ('-nh', args.nh),
        ('-bs', BATCH_SIZE),
        ('-dir_suffix', dir_suffix),
        ('-pname', proj_name),
        ('-dk', 640),
        ('-dv', 640),
        ('-d_ff', 1280),
        ('-dm', 640),
        ('-freeze_embedding', args.freeze_embedding),
        ('-embedding_mean', embedding_mean),
        ('-embedding_std', args.embedding_std),
        # Multi-valued options: one token per dataset split.
        ('-dmode', *DMODE),
        ('-dp', *DPERCENT),
        ('-dn', *DNAME),
        ('-dtrain', *DTRAIN),
        ('-dshow', *DSHOW),
        ('-ple', 1),
        ('-pae', 1),
        ('-plae', 1),
        ('-sme', 1),
        ('-wd', args.weight_decay),
        ('-qk_std', args.qk_std),
        ('-vo_std', args.vo_std),
        ('-mlp_std', args.mlp_std),
        ('--optim_T_max', 200),
        # Kept as text so the exact spelling '1e-5' is forwarded.
        ('--optim_eta_min', '1e-5'),
        ('--optim_multiplier', args.optim_multiplier),
        ('--optim_total_epoch', 10),
        ('--anchor_num', 2),
        ('--optim', OPTIM),
        ('--activation', args.activation),
    ]
    command = ['python3', '-m', 'main']
    for name, *values in options:
        command.append(name)
        command.extend(str(value) for value in values)
    return command


def child_env(gpu_id: int) -> Dict[str, str]:
    """Return a copy of the current environment with the GPU selected."""
    env = dict(os.environ)
    env['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    return env


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Launch one training run per embedding mean.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` if every run exited successfully, ``1`` otherwise. All runs
        are attempted even if an earlier one fails.
    """
    args = resolve_stds(parse_args(argv))
    proj_name = build_proj_name(args)
    any_failed = False
    for embedding_mean in EMBEDDING_MEANS:
        stamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        dir_suffix = f'seed{SEED}_embedding_mean{embedding_mean}_{stamp}'
        command = build_command(args, embedding_mean, dir_suffix, proj_name)
        returncode = subprocess.run(command, env=child_env(args.gpu_id)).returncode
        if returncode != 0:
            # Report the failure instead of silently ignoring it.
            print(f'training run {dir_suffix} exited with status {returncode}',
                  file=sys.stderr)
            any_failed = True
    return 1 if any_failed else 0


if __name__ == "__main__":
    sys.exit(main())
