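# Example commands for running our experiments (listops task). Each command
# below invokes run_tasks.py by relative path and can be run on its own.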
#### SOFTMAX ATTENTION
# Launches run_tasks.py on the listops task with --model softmax.
# No --beta flag is passed in this configuration.
# One flag per line; the trailing backslashes join everything into a single
# command, so nothing may follow a backslash on its line.
python run_tasks.py \
  --model softmax \
  --task listops \
  --dropout_prob 0.1 \
  --attention_dropout 0.1 \
  --learning_rate 0.0001 \
  --weight_decay 0.0 \
  --num_layers 2 \
  --max_seq_len 2048 \
  --num_train_steps 5000 \
  --num_eval_steps 62 \
  --eval_frequency 50 \
  --batch_size 32 \
  --warmup 1000 \
  --n_train_samples 96000 \
  --n_dev_samples 2000 \
  --n_test_samples 2000 \
  --num_classes 10

#### ATTENTION-BN
# Launches run_tasks.py on the listops task with --model softmax and an
# additional --beta 1. flag.
# NOTE(review): presumably --beta is what selects the BN variant on top of the
# softmax model -- confirm against run_tasks.py.
# One flag per line; the trailing backslashes join everything into a single
# command, so nothing may follow a backslash on its line.
python run_tasks.py \
  --model softmax \
  --task listops \
  --dropout_prob 0.1 \
  --attention_dropout 0.1 \
  --learning_rate 0.0001 \
  --weight_decay 0.0 \
  --num_layers 2 \
  --max_seq_len 2048 \
  --num_train_steps 5000 \
  --num_eval_steps 62 \
  --eval_frequency 50 \
  --batch_size 32 \
  --warmup 1000 \
  --n_train_samples 96000 \
  --n_dev_samples 2000 \
  --n_test_samples 2000 \
  --num_classes 10 \
  --beta 1.

#### ATTENTION-SH
# Launches run_tasks.py on the listops task with --model sh.
# No --beta flag is passed in this configuration.
# One flag per line; the trailing backslashes join everything into a single
# command, so nothing may follow a backslash on its line.
python run_tasks.py \
  --model sh \
  --task listops \
  --dropout_prob 0.1 \
  --attention_dropout 0.1 \
  --learning_rate 0.0001 \
  --weight_decay 0.0 \
  --num_layers 2 \
  --max_seq_len 2048 \
  --num_train_steps 5000 \
  --num_eval_steps 62 \
  --eval_frequency 50 \
  --batch_size 32 \
  --warmup 1000 \
  --n_train_samples 96000 \
  --n_dev_samples 2000 \
  --n_test_samples 2000 \
  --num_classes 10

#### ATTENTION-BN+SH
# Launches run_tasks.py on the listops task with --model sh and an additional
# --beta 1. flag.
# NOTE(review): presumably --beta is what adds the BN variant on top of the
# sh model -- confirm against run_tasks.py.
# One flag per line; the trailing backslashes join everything into a single
# command, so nothing may follow a backslash on its line.
python run_tasks.py \
  --model sh \
  --task listops \
  --dropout_prob 0.1 \
  --attention_dropout 0.1 \
  --learning_rate 0.0001 \
  --weight_decay 0.0 \
  --num_layers 2 \
  --max_seq_len 2048 \
  --num_train_steps 5000 \
  --num_eval_steps 62 \
  --eval_frequency 50 \
  --batch_size 32 \
  --warmup 1000 \
  --n_train_samples 96000 \
  --n_dev_samples 2000 \
  --n_test_samples 2000 \
  --num_classes 10 \
  --beta 1.


