
# Launch the benchmark visualization script for three Qwen3 model sizes
# across three evaluation datasets.
#
# Every path below is relative, so what it resolves to depends on the
# directory this script is launched from.
# NOTE(review): the meaning of each flag is inferred from its name only;
# confirm against visualization.py's argument parser.

# Comma-separated lists, each handed to the script as a single argument.
models="Qwen3-1.7B,Qwen3-4B,Qwen3-8B"
datasets="reward_bench,reward_bench_v2,judgebench"

# Directories passed as --outdir and --resdir respectively.
out_dir=../empirical_analysis_qwen/
res_dir=../results/

python ../eval_benchmark/visualization.py \
  --model "$models" \
  --dataset "$datasets" \
  --outdir "$out_dir" \
  --resdir "$res_dir"

