problem_id,correctness,coarse_grained_novelty
2008_IMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2006_AMC_12B_Problems_22,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1996_AJHSME_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1994_AHSME_Problems_24,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2001_AMC_12_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1964_AHSME_Problems_40,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2018_USAJMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1994_AJHSME_Problems_7,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2019_AMC_8_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2015_USAJMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1986_AJHSME_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1984_AHSME_Problems_22,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2010_USAJMO_Problems_2,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_AIME_II_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1984_AIME_Problems_9,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_AMC_10A_Problems_13,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2010_AIME_II_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2014_AMC_10A_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2020_AMC_10A_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2002_AMC_10A_Problems_9,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
