import datetime
import os
import subprocess
from itertools import product

# ---------------------------------------------------------------------------
# Experiment configuration.
# Most values below are forwarded as command-line flags to `main_wikitext`
# by the launch loop further down in this script.
# ---------------------------------------------------------------------------

# Optimisation settings.
lr = 1e-5
batch_size = 1000
# Other scheduler values that have been used here: 'None', 'StepLR'.
scheduler = 'GradualWarmupScheduler_CosineAnnealingLR'

# Device selection; exported to the child process as CUDA_VISIBLE_DEVICES.
# Kept as a string because it is substituted into the environment verbatim.
# (GPU '7' has also been used as an alternative.)
gpu_id = '6'
print(f'Using GPU ID: {gpu_id}')

# Model / data selection.
model = 'GPT_PreLN'
data_size = int(6e5)
target = 'wikitext'  # passed both as -func and as -mode
dir_suffix = '_'
L = 2  # forwarded as -nl (presumably the number of layers -- confirm in main_wikitext)
H = 1  # forwarded as -nh (presumably the number of heads -- confirm in main_wikitext)
seed = 2048

# NOTE(review): `training_step_method` and `user` are not referenced by the
# launch command visible in this file; kept in case other tooling reads them.
training_step_method = 'last_token_NTP'
user = 'cza'

# Name of this launcher script, forwarded as -script_file_name.
# splitext() strips only the final extension, so a dotted file name such as
# 'run.v2.py' is preserved instead of being truncated to 'run.py'.
script_name = os.path.splitext(os.path.basename(__file__))[0] + '.py'

# Per-run dimension settings. The launch loop walks these lists in lockstep,
# so entry i of every list belongs to run i; keep them the same length.
dk_list = [256]
dv_list = [256]
d_ff_list = [800]
dm_list = [256]

# Initialisation / architecture options.
embedding_mean = 0.0
activation = 'gelu'
freeze_embedding = 1  # whether to freeze the embedding layer

# A single std value shared by every parameter group; change `all_std` to
# scale them together, or override an individual group below.
all_std = 1.0
embedding_std = all_std
qk_std = all_std
vo_std = all_std
mlp_std = all_std

# Launch one training run per (dk, dv, d_ff, dm) configuration.
# The four lists are consumed in lockstep; zip() stops at the shortest list.
for dk, dv, d_ff, dm in zip(dk_list, dv_list, d_ff_list, dm_list):
    # The timestamp makes the suffix unique for every launch of a configuration.
    datetime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    suffix = (
        f'{L}L{H}H_dm_{dm}_dff_{d_ff}_lr_{lr}_std_{all_std}/'
        f'{datetime_str}_seed_{seed}_embedding_mean_{embedding_mean}'
    )

    # Build the command as an argument list (no shell involved), so values are
    # passed through exactly as written regardless of quoting or whitespace.
    command = [
        'python', '-m', 'main_wikitext',
        '-data_size', str(data_size),
        '-seed', str(seed),
        '-func', str(target),
        '-lr', str(lr),
        '-m', str(model),
        '-scheduler', str(scheduler),
        '-ne', '5',
        '-nl', str(L),
        '-nh', str(H),
        '-bs', str(batch_size),
        '-dir_suffix', str(dir_suffix),
        '-lds', '1000',
        '-ldr', '1',
        '-suffix', suffix,
        '--activation', str(activation),
        '-mode', str(target),
        '-embedding_std', str(embedding_std),
        '-qk_std', str(qk_std),
        '-vo_std', str(vo_std),
        '-mlp_std', str(mlp_std),
        '--all_std', str(all_std),
        '--freeze_embedding', str(freeze_embedding),
        '-ple', '1',
        '-pae', '1',
        '-plae', '1',
        '-sme', '1',
        '-dk', str(dk),
        '-dv', str(dv),
        '-d_ff', str(d_ff),
        '-dm', str(dm),
        '--optim_T_max', '200',
        '--optim_eta_min', '1e-5',
        '--optim_multiplier', '15',
        '--optim_total_epoch', '10',
        '-script_file_name', str(script_name),
        '-dataset_on_cuda', '1',
        '--weight_decay', '0.0',
        '--embedding_mean', str(embedding_mean),
    ]

    # Restrict the child process to the selected GPU without touching the
    # environment of this launcher process.
    run_env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu_id))

    completed = subprocess.run(command, env=run_env)
    if completed.returncode != 0:
        # Report the failure but keep going, so one bad configuration does not
        # abort the remaining runs of the sweep.
        print(f'Training run {suffix} exited with status {completed.returncode}')

