problem_id,correctness,coarse_grained_novelty
2013_USAJMO_Problems_2,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1954_AHSME_Problems_38,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2012_AIME_II_Problems_11,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1979_AHSME_Problems_13,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1993_AHSME_Problems_15,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2002_AMC_12P_Problems_10,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2006_AMC_12B_Problems_10,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2024_USAJMO_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2022_AMC_12A_Problems_9,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2016_AMC_8_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2010_USAMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2023_USAJMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2003_AMC_8_Problems_18,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2011_AIME_I_Problems_12,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2004_USAMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2021_USAJMO_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1962_AHSME_Problems_11,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1994_AJHSME_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'NO'}"
2023_AMC_12B_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'YES'}"
1989_AIME_Problems_12,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2002_AMC_12P_Problems_8,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1998_AIME_Problems_13,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2002_AMC_12A_Problems_21,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1999_AMC_8_Problems_8,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2000_IMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2016_AMC_10A_Problems_16,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2001_AMC_12_Problems_25,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2023_AMC_12A_Problems_23,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2024_USAJMO_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1987_AIME_Problems_11,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_IMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2006_AMC_12B_Problems_22,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1996_AJHSME_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1994_AHSME_Problems_24,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2001_AMC_12_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1964_AHSME_Problems_40,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2018_USAJMO_Problems_5,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1994_AJHSME_Problems_7,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2019_AMC_8_Problems_20,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2015_USAJMO_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1986_AJHSME_Problems_6,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
1984_AHSME_Problems_22,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2010_USAJMO_Problems_2,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_AIME_II_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
1984_AIME_Problems_9,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2008_AMC_10A_Problems_13,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'NO', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2010_AIME_II_Problems_4,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'NO', 'final_decision': 'NO'}"
2014_AMC_10A_Problems_3,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2020_AMC_10A_Problems_1,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'YES', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
2002_AMC_10A_Problems_9,"{'claude-3-7-sonnet-20250219': 'YES', 'final_decision': 'YES'}","{'gpt-4.1': 'YES', 'claude-3-7-sonnet-20250219': 'NO', 'gemini-2.0-flash': 'YES', 'final_decision': 'YES'}"
