#!/usr/bin/env bash
# Run harness/run_evaluation.py on one predictions file.
#
# Usage: bash <this-script> [PREDICTIONS_PATH]
#
# PREDICTIONS_PATH is optional and defaults to the swe-agent-demo4 predictions.
# Commented-out variants of this command previously pointed at these files;
# pass one of them as the argument to evaluate it with the same flags:
#   inference_output/swe-agent-demo3__swt_bench_lite__test.jsonl
#   inference_output/swe-agent-demo3-haiku__swt_bench_lite__test.jsonl
#   inference_output/swe-agent-demo3-mixtral__swt_bench_lite__test.jsonl
#
# Every path below is relative to the working directory, so start this from
# the directory that contains secret.sh, harness/ and inference_output/.

# Stop early when secret.sh is missing instead of starting the 20-process
# evaluation without it. `return` handles the case where this file is itself
# sourced; it fails when the file is executed, and `exit` takes over.
if [[ ! -r ./secret.sh ]]; then
  printf 'error: ./secret.sh not found or not readable in %s\n' "$PWD" >&2
  return 1 2>/dev/null || exit 1
fi

# The explicit "./" keeps bash from searching $PATH for another secret.sh
# before it looks in the current directory.
# shellcheck source=/dev/null
source ./secret.sh

predictions_path=${1:-inference_output/swe-agent-demo4__swt_bench_lite__test.jsonl}

python3 harness/run_evaluation.py \
  --vanilla-patch \
  --verbose \
  --predictions_path "$predictions_path" \
  --log_dir evaluation_output \
  --num_processes 20 \
  --swe_bench_tasks test
