#!/usr/bin/env bash
#
# Runs measure_total_stats.py and then measure_gold_tests.py against the same
# evaluation output directory.
#
# Usage: bash <this-script> [EVAL_OUTPUT_DIR]
#   EVAL_OUTPUT_DIR  value passed as --eval_output_dir to both scripts
#                    (default: evaluation_output/swt_golden_test/mode_vanillafuzzy/)
#
# Both scripts are looked up relative to the current working directory.
#
# Exit status: 0 when both commands succeed; otherwise the exit status of the
# last command that failed.
set -u

readonly DATASET='princeton-nlp/SWE-bench'
readonly SPLIT='test'

# Single source of truth for the directory so the two invocations cannot drift.
readonly eval_output_dir=${1:-evaluation_output/swt_golden_test/mode_vanillafuzzy/}

# Deliberately no `set -e`: the second command is still run when the first one
# fails; the failure is recorded here and reported through the exit status.
status=0

python3 measure_total_stats.py \
  --dataset "$DATASET" \
  --eval_output_dir "$eval_output_dir" \
  --split "$SPLIT" || status=$?

python3 measure_gold_tests.py \
  --eval_output_dir "$eval_output_dir" || status=$?

exit "$status"