#!/usr/bin/env bash
#
# Runs inference/run_api.py on the dataset at ./datasets/swt_bench (dev split)
# with the model named below, then feeds the resulting predictions file to
# harness/run_evaluation.py.
#
# All paths are relative, so run this from the directory that contains
# secret.sh, inference/, harness/ and datasets/.
#
# Requires: secret.sh in the current directory (sourced before anything runs).

# Abort on the first failing step so evaluation never runs against a stale or
# missing predictions file left over from an earlier invocation.
set -eo pipefail

# shellcheck source=/dev/null
source secret.sh

# Enabled only after sourcing: secret.sh is outside this script's control and
# may legitimately reference variables that are unset.
set -u

readonly model="azure_phi-3-mini-128k-instruct-4"
readonly dataset="./datasets/swt_bench"
readonly split="dev"
readonly model_args="max_tokens=1500"
readonly output_dir="inference_output"
readonly log_dir="evaluation_output"
readonly num_processes=10

# Built from the same values passed to run_api.py so the two steps cannot drift
# apart. The layout (<model>__<dataset basename>__<model args>__<split>.jsonl)
# reproduces the file name this script previously hard-coded.
readonly predictions_path="${output_dir}/${model}__${dataset##*/}__${model_args}__${split}.jsonl"

python3 inference/run_api.py \
  --dataset_name_or_path "${dataset}" \
  --split "${split}" \
  --model_name_or_path "${model}" \
  --output_dir "${output_dir}" \
  --model_args "${model_args}"

# Fail with a clear message if inference exited 0 but wrote nothing where the
# evaluation step expects it.
if [[ ! -f "${predictions_path}" ]]; then
  printf 'error: predictions file not found: %s\n' "${predictions_path}" >&2
  exit 1
fi

python3 harness/run_evaluation.py \
  --custom-patch \
  --verbose \
  --predictions_path "${predictions_path}" \
  --log_dir "${log_dir}" \
  --num_processes "${num_processes}"
