problem_id,correctness,coarse_grained_novelty
2016_AMC_8_Problems_15,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1991_AJHSME_Problems_25,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'YES'}"
1996_AJHSME_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2001_AMC_8_Problems_25,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1996_AJHSME_Problems_12,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
1996_AJHSME_Problems_12,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1996_AJHSME_Problems_25,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2012_AMC_10A_Problems_21,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_AMC_10A_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2005_AMC_10B_Problems_19,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2009_AMC_10B_Problems_14,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2006_AMC_12A_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2012_AMC_12B_Problems_24,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2007_AMC_12B_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2021_AMC_12A_Problems_2,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2003_AMC_12B_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2020_AMC_12B_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2011_AMC_12B_Problems_25,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1986_IMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1990_IMO_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1968_IMO_Problems_2,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2019_USAMO_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2019_USAMO_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1996_USAMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2021_USAMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2020_USAMO_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1996_USAMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1977_USAMO_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2016_USAMO_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1975_USAMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2013_AIME_II_Problems_11,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2015_AIME_II_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2017_AIME_I_Problems_14,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2022_AIME_I_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2000_AIME_II_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2023_AIME_II_Problems_10,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1990_AIME_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1990_AIME_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1994_AIME_Problems_10,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2003_AIME_II_Problems_7,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1987_AIME_Problems_8,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1958_AHSME_Problems_31,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1952_AHSME_Problems_16,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1957_AHSME_Problems_38,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1973_AHSME_Problems_30,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1988_AHSME_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1966_AHSME_Problems_30,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2012_USAJMO_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2022_USAJMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'UNCLEAR', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2022_USAJMO_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
