# Launch the benchmark.py sweep on four GPUs (CUDA_VISIBLE_DEVICES 0-3).
#
# Two batches of four background jobs are started:
#   * batch 1 passes no -m option and writes total<N>.csv
#   * batch 2 passes "-m forward" and writes fwd<N>.csv
# Job N of each batch is pinned to GPU index N-1 and receives the same -d
# dimension list in both batches.
#
# Only POSIX sh features are used (no arrays, no `local`): the file has no
# shebang, so it is not guaranteed to be interpreted by bash.

bench_pids=      # space-separated PIDs of jobs that have not been waited for
bench_failed=0   # becomes 1 as soon as any job exits with a non-zero status

# launch_benchmark GPU OUT_CSV DIMS [EXTRA_ARG...]
# Start one benchmark.py process in the background under nohup.
#   GPU        value exported to the job as CUDA_VISIBLE_DEVICES
#   OUT_CSV    file name passed to -o
#   DIMS       comma-separated dimension list passed to -d
#   EXTRA_ARG  optional arguments appended verbatim (e.g. -m forward)
# The job's PID is appended to bench_pids so await_benchmarks can collect it.
launch_benchmark() {
  bench_gpu=$1
  bench_out=$2
  bench_dims=$3
  shift 3
  CUDA_VISIBLE_DEVICES="$bench_gpu" nohup python3 benchmark.py --tf32 -op all \
    -o "$bench_out" -d "$bench_dims" -b 512 -i 100 -c cuda -a "$@" &
  bench_pids="$bench_pids $!"
}

# await_benchmarks
# Block until every job recorded in bench_pids has finished. A bare `wait`
# always returns 0, so each PID is waited on individually; any non-zero exit
# is reported on stderr and remembered in bench_failed. The PID list is
# cleared afterwards so the function can also be used as a mid-run barrier.
await_benchmarks() {
  # Unquoted on purpose: bench_pids is a whitespace-separated list of PIDs.
  for bench_pid in $bench_pids; do
    bench_status=0
    wait "$bench_pid" || bench_status=$?
    if [ "$bench_status" -ne 0 ]; then
      printf 'benchmark job (pid %s) exited with status %s\n' \
        "$bench_pid" "$bench_status" >&2
      bench_failed=1
    fi
  done
  bench_pids=
}

# Batch 1: no -m option, results in total<N>.csv.
launch_benchmark 0 total1.csv 512x512,20480x20480
launch_benchmark 1 total2.csv 1024x1024,10240x10240
launch_benchmark 2 total3.csv 2048x2048,8096x8096
launch_benchmark 3 total4.csv 4096x4096

# Optional barrier, left disabled: enable it to let batch 1 finish before
# batch 2 starts. While it is disabled both batches run concurrently, so each
# GPU index is handed two benchmark processes at the same time.
#await_benchmarks

# Batch 2: same GPU/dimension assignment with "-m forward", results in fwd<N>.csv.
launch_benchmark 0 fwd1.csv 512x512,20480x20480 -m forward
launch_benchmark 1 fwd2.csv 1024x1024,10240x10240 -m forward
launch_benchmark 2 fwd3.csv 2048x2048,8096x8096 -m forward
launch_benchmark 3 fwd4.csv 4096x4096 -m forward

await_benchmarks

# Final command: leaves a non-zero status behind when any job failed, without
# calling `exit` (which would also cut off anything that follows this section).
[ "$bench_failed" -eq 0 ]
