Model,Goal_F1_V,Goal_F1_B,Action_Task_SR_V,Action_Task_SR_B,Action_Exec_SR_V,Action_Exec_SR_B,Subgoal_Task_SR_V,Subgoal_Task_SR_B,Subgoal_Exec_SR_V,Subgoal_Exec_SR_B,Transition_F1_V,Transition_F1_B,Transition_Planner_SR_V,Transition_Planner_SR_B,Average_Module_SR_V,Average_Module_SR_B
Claude-3 Haiku,28.0,52.5,54.8,26.0,60.7,32.0,78.4,30.0,82.8,35.0,42.3,51.6,30.4,64.0,49.4,41.6
Claude-3 Sonnet,29.4,69.4,58.0,44.0,63.3,60.0,86.1,39.0,86.4,43.0,41.2,56.2,13.2,80.0,49.4,55.1
Claude-3 Opus,31.4,77.0,64.6,51.0,69.5,59.0,86.7,41.0,89.9,47.0,48.8,63.4,61.8,82.0,59.5,60.4
Claude-3.5 Sonnet,33.0,82.7,76.1,60.0,81.3,69.0,89.1,39.0,92.0,44.0,48.9,67.9,80.5,82.0,65.7,64.2
Cohere Command R,36.7,36.0,44.9,16.0,44.3,19.0,71.3,15.0,78.1,25.0,11.7,24.1,51.1,41.0,46.1,24.9
Cohere Command R+,22.4,51.2,54.1,27.0,65.2,32.5,77.8,25.0,83.7,37.0,30.8,49.7,37.2,59.0,47.1,39.1
Gemini 1.0 Pro,23.8,60.0,45.6,27.0,56.7,32.0,70.4,24.0,84.6,33.0,41.8,45.8,11.8,16.0,41.7,35.5
Gemini 1.5 Flash,26.8,74.8,69.5,40.0,75.4,52.0,89.1,34.0,94.1,42.0,45.7,53.4,46.6,66.0,57.9,52.1
Gemini 1.5 Pro,36.2,79.6,76.7,42.0,83.6,54.0,87.0,31.0,91.1,37.0,34.1,45.8,91.9,39.0,65.7,48.8
GPT-3.5-turbo,22.7,50.4,24.9,16.0,40.7,20.0,69.2,24.0,81.4,36.0,30.0,42.1,0.7,41.0,33.0,33.0
GPT-4-turbo,33.2,77.2,60.0,38.0,65.2,45.0,85.5,38.0,94.1,47.0,42.9,44.2,36.1,46.0,57.1,49.6
GPT-4o,36.5,79.2,71.5,47.0,81.3,53.0,87.6,49.0,91.1,55.0,46.7,60.9,68.2,67.0,63.3,59.8
Llama 3 8B Instruct,22.6,28.3,21.3,10.0,23.6,16.0,48.8,22.0,58.0,29.0,12.9,35.0,28.7,29.0,28.4,23.1
Llama 3 70B Instruct,26.9,70.9,59.0,34.0,66.6,42.0,78.4,21.0,87.3,30.0,37.4,55.1,12.2,78.0,47.3,48.1
Mistral Large,26.8,74.3,78.4,33.0,84.6,50.0,84.3,31.0,92.0,38.0,36.1,49.5,31.1,77.0,55.8,50.4
Mixtral 8x22B MoE,26.6,54.7,63.3,30.0,67.9,40.0,80.5,28.0,90.2,33.0,42.0,52.4,37.5,55.0,52.5,41.6
o1-mini,31.2,76.4,71.5,56.0,76.4,65.0,79.3,31.0,84.6,39.0,41.5,56.4,69.0,77.0,59.3,57.5
o1-preview,42.7,81.6,65.2,81.0,72.5,91.0,89.4,60.0,93.2,62.0,48.0,69.5,72.4,89.0,64.4,74.9
