dataset,split,model,dataset_size,mean_accuracy,std_accuracy,max_new_tokens,num_generations
MMLU-Pro,test,qwen2.5-7b-it,20,65.00,5.00,4096,2
DigitalLearningGmbH/MATH-lighteval,test,qwen2.5-7b-it,20,85.00,0.00,4096,2
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-samples4000,500,20.20,0.00,16384,1
HuggingFaceH4/MATH-500,test,llama3.1-8b-full-checkpoint-1000,500,24.53,1.24,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-samples1000,500,40.87,0.52,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-samples2000,500,33.53,0.57,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-full,500,30.47,1.00,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-samples4000,500,21.87,0.47,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-checkpoint-3000,500,24.53,0.25,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-checkpoint-2000,500,22.27,0.90,16384,3
HuggingFaceH4/MATH-500,test,llama3.1-8b-lora256-checkpoint-1000,500,22.53,0.75,16384,3
DigitalLearningGmbH/MATH-lighteval,test,llama3.1-8b-lora256-samples1000,5000,39.38,0.00,16384,1
DigitalLearningGmbH/MATH-lighteval,test,llama3.1-8b-lora256-samples2000,5000,32.36,0.00,16384,1
DigitalLearningGmbH/MATH-lighteval,test,llama3.1-8b-full,5000,28.90,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-lora256-samples1000,12032,40.67,0.00,16384,1
DigitalLearningGmbH/MATH-lighteval,test,llama3.1-8b-full-checkpoint-1000,5000,24.02,0.00,16384,1
DigitalLearningGmbH/MATH-lighteval,test,llama3.1-8b-lora256-samples4000,5000,21.14,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-full-checkpoint-1359,12032,36.59,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-full,12032,36.59,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-lora256-samples2000,12032,41.45,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-lora256-samples4000,12032,37.54,0.00,16384,1
MMLU-Pro,test,llama3.1-8b-full-checkpoint-1000,12032,32.51,0.00,16384,1
