# Stop the script at the first command that exits non-zero, so a failed
# training run aborts the remaining runs instead of being silently skipped.
set -e

# Sweep 1: one torchrun launch per (seed, task) pair with the model_14M.yaml
# model config. run.py receives three YAML files (shared data config, model
# config, per-task config) followed by command-line flags.
# Every launch starts 2 processes (--nproc_per_node=2) with
# CUDA_VISIBLE_DEVICES=0,1, WANDB_PROJECT=inheritance_new and WANDB_MODE=online
# set for that command only.
# NOTE(review): --max_steps=1 looks like a smoke-test setting; confirm before
# launching a real sweep.
for seed in 42 43 44 45 46; do
    for task in copy_first_op no_carry only_carry reverse_add_trans reverse_sub; do
        CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py \
            experiments/inheritance/common/data.yaml \
            experiments/inheritance/common/model_14M.yaml \
            "experiments/inheritance/${task}.yaml" \
            --seed="$seed" \
            --max_steps=1
    done
done

# Sweep 2: one torchrun launch per (seed, task) pair with the SmolLM.yaml
# model config. run.py receives three YAML files (shared data config, model
# config, per-task config) followed by command-line flags.
# Every launch starts 2 processes (--nproc_per_node=2) with
# CUDA_VISIBLE_DEVICES=0,1, WANDB_PROJECT=inheritance_new and WANDB_MODE=online
# set for that command only.
# NOTE(review): --max_steps=1 looks like a smoke-test setting; confirm before
# launching a real sweep.
for seed in 42 43 44 45 46; do
    for task in copy_first_op no_carry only_carry reverse_add_trans reverse_sub; do
        CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py \
            experiments/inheritance/common/data.yaml \
            experiments/inheritance/common/SmolLM.yaml \
            "experiments/inheritance/${task}.yaml" \
            --seed="$seed" \
            --max_steps=1
    done
done

# Part-to-whole generalization
# for seed in 42 43 44 45 46; do
#     for task in maze reverse_mult reverse_add; do
#         # CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py experiments/inheritance/$task.yaml
#         CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py experiments/inheritance/$task'_with_padding.yaml' \
#             --seed=$seed
#     done
# done

# for seed in 42 43 44 45 46; do
#     for task in maze reverse_mult reverse_add; do
#         # CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py experiments/inheritance/$task.yaml
#         CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py experiments/inheritance/$task'_with_padding.yaml' \
#             --seed=$seed \
#             --model_id='HuggingFaceTB/SmolLM-135M' \
#             --from_pretrained=True \
#             --use_character_tokenizer=True \
#             --learning_rate=5e-4 \
#             --max_steps=1 \
#             --lr_scheduler_kwargs='{"num_stable_steps": 8000, "num_decay_steps": 10000}' \
#             --per_device_train_batch_size=16 \
#             --gradient_accumulation_steps=8
#     done
# done

# CUDA_VISIBLE_DEVICES=0,1 WANDB_PROJECT='inheritance_new' WANDB_MODE=online torchrun --nproc_per_node=2 run.py experiments/inheritance/maze_with_padding.yaml
# One-off maze run with seed 42: run.py is given the maze.yaml experiment
# config plus explicit model, tokenizer, optimizer and batch-size flags.
maze_config=experiments/inheritance/maze.yaml

# JSON handed to --lr_scheduler_kwargs as a single argument; the single quotes
# keep the embedded double quotes and spaces intact.
maze_lr_schedule='{"num_stable_steps": 2000, "num_decay_steps": 2500}'

# The three assignments below apply to this torchrun invocation only.
CUDA_VISIBLE_DEVICES=0,1 \
WANDB_PROJECT=inheritance_new \
WANDB_MODE=online \
    torchrun --nproc_per_node=2 run.py "$maze_config" \
        --seed=42 \
        --model_id=HuggingFaceTB/SmolLM-135M \
        --from_pretrained=True \
        --use_character_tokenizer=True \
        --learning_rate=1e-4 \
        --max_steps=5000 \
        --lr_scheduler_kwargs="$maze_lr_schedule" \
        --per_device_train_batch_size=16 \
        --gradient_accumulation_steps=8