# Invoke plot_baseline_gspo_length_metrics.py (resolved relative to the
# current working directory) with seven --input result files, four --run-key
# values, four --run-label values, --max-step 300, and a PDF --output path.
#
# NOTE(review): the four --run-label values are presumably paired with the
# four --run-key values by position -- confirm against the plotting script.
# NOTE(review): seven inputs are passed but only four run keys are named;
# presumably the remaining inputs are unused for this figure -- confirm.
# NOTE(review): the output name says "two_runs" although four run keys are
# given -- confirm the file name is intentional.

# Directory that holds every *-analysis_results.json input as well as the
# plots/ folder the output PDF is written into.
results_dir=/mnt/shared-storage-user/p1-shared/wangfuting/codes/project_tts_extrapolation/eval_scripts/analysis_mar/results

python plot_baseline_gspo_length_metrics.py \
  --input "${results_dir}/baseline-length-analysis_results.json" \
  --input "${results_dir}/LIE-analysis_results.json" \
  --input "${results_dir}/llama-LIE-analysis_results.json" \
  --input "${results_dir}/qwen3-4b-baseline-LIE-analysis_results.json" \
  --input "${results_dir}/distinct_bonus-analysis_results.json" \
  --input "${results_dir}/sft-gspo-ours-dapo-math-max12k-analysis_results.json" \
  --input "${results_dir}/sft4k-gspo-dapo-math-minibsz32-max12k-analysis_results.json" \
  --run-key "baseline" \
  --run-key "LIE" \
  --run-key "sft4k-gspo-dapo-math-minibsz32-max12k" \
  --run-key "sft-gspo-ours-dapo-math-max12k" \
  --run-label "GSPO" \
  --run-label "GSPO + LINE" \
  --run-label "SFT + GSPO" \
  --run-label "SFT + GSPO + LINE" \
  --max-step 300 \
  --output "${results_dir}/plots/two_runs_comparison_SFT_10gram_1x4.pdf"