# ### Experimental optimizer settings for Prodigy
# if args.prodigy_setting == 0:
#     # default setting
#     print('Default setting!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 1:
#     print('No weight decay!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=1e-12)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 2:
#     print('No scheduler!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01)
#     scheduler = None
# elif args.prodigy_setting == 3:
#     print('No safeguard_warmup!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=False, use_bias_correction=True, weight_decay=0.01)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 4:
#     print('No use_bias_correction!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=False, weight_decay=0.01)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# ### END experiment 1
# elif args.prodigy_setting == 12:
#     print('no weight decay and no scheduler!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=1e-12)
#     scheduler = None
# elif args.prodigy_setting == 14:
#     print('no weight decay and no use_bias_correction!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=False, weight_decay=1e-12)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 24:
#     print('no scheduler and no use_bias_correction!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=False, weight_decay=0.01)
#     scheduler = None
# elif args.prodigy_setting == 124:
#     print('no weight decay and no scheduler and no use_bias_correction!')
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=False, weight_decay=1e-12)
#     scheduler = None
# elif args.prodigy_setting == 5:
#     # default setting
#     print('Actual default setting!')
#     optim = Prodigy(model.parameters())
#     scheduler = None
# elif args.prodigy_setting == 6:
#     # default setting
#     print('Actual default setting with decouple=False!')
#     optim = Prodigy(model.parameters(), decouple=False)
#     scheduler = None
# elif args.prodigy_setting == 7:
#     # default setting
#     print('Actual default setting with scheduler!')
#     optim = Prodigy(model.parameters())
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 8:
#     # default setting
#     print('Actual default setting with scheduler with T_max=n_epochs//10!')
#     optim = Prodigy(model.parameters())
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs//10)
# ### END EXPERIMENT 2
# elif args.prodigy_setting == 9:
#     # AdamW baseline (no Prodigy), lr=1e-3
#     print('AdamW with lr=1e-3!')
#     optim = torch.optim.AdamW(
#         model.parameters(),
#         lr=1e-3, amsgrad=True,
#         weight_decay=1e-12)
#     scheduler = None
# elif args.prodigy_setting == 10:
#     # AdamW baseline (no Prodigy), lr=5e-3
#     print('AdamW with lr=5e-3!')
#     optim = torch.optim.AdamW(
#         model.parameters(),
#         lr=5e-3, amsgrad=True,
#         weight_decay=1e-12)
#     scheduler = None
# ### END EXPERIMENT 3
# elif args.prodigy_setting == 11:
#     print('Default setting with epochs=100!')
#     args.n_epochs = 100
#     args.test_epochs = 1
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 15:
#     print('Actual Default setting with epochs=100!')
#     args.n_epochs = 100
#     args.test_epochs = 1
#     optim = Prodigy(model.parameters())
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 13:
#     print('AdamW with scheduler!')
#     optim = torch.optim.AdamW(
#         model.parameters(),
#         lr=1e-4, amsgrad=True,
#         weight_decay=1e-12)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# ### End Experimental
# elif args.prodigy_setting == 16:
#     print('Actual Default setting with epochs=200!')
#     args.n_epochs = 200
#     args.test_epochs = 3
#     optim = Prodigy(model.parameters())
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 17:
#     print('Actual Default setting with epochs=150!')
#     args.n_epochs = 150
#     args.test_epochs = 2
#     optim = Prodigy(model.parameters())
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)


# elif args.prodigy_setting == 18:
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.5)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 19:
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.2)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 20:
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.9)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)


# elif args.prodigy_setting == 21:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.2)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 22:
#     args.n_epochs = 300
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.2)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 23:
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.1)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)

# # The goal of the next experiments is to find a good d_coef that lasts longest without a scheduler
# elif args.prodigy_setting == 25:
#     # We have a WINNER!!!! -> FCD = 0.8222
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.1)
#     scheduler = None
# elif args.prodigy_setting == 26:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.05)
#     scheduler = None

# # Now that we have found a good d_coef (0.1), let's find a good scheduler
# elif args.prodigy_setting == 27:
#     # -> FCD = 0.8121
#     # run was finished -> new default setting!
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.1)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)

# elif args.prodigy_setting == 28:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.15)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)

# # Some more final experiments
# elif args.prodigy_setting == 29:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.125)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
# elif args.prodigy_setting == 30:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.1)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs, eta_min=0.01)
# elif args.prodigy_setting == 31:
#     args.n_epochs = 500
#     args.test_epochs = 5
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=1e-12, d_coef=0.1)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)

# # For robust training
# elif args.prodigy_setting == 32:
#     optim = Prodigy(model.parameters(), lr=1., safeguard_warmup=True, use_bias_correction=True, weight_decay=0.01, d_coef=0.05)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=args.n_epochs)
