dataset,metric_name,category,count
CoGymTravelOutcome_outcomeRating,Cultural_and_Local_Integration_Rubric,rubric,5
CoGymTravelOutcome_outcomeRating,Feasibility_and_Realism_Qwen3-32B,llm_judge,5
CoGymTravelOutcome_outcomeRating,Accommodation_Options_Qwen3-32B,llm_judge,4
CoGymTravelOutcome_outcomeRating,Cultural_and_Local_Experiences_Qwen3-32B,llm_judge,4
CoGymTravelOutcome_outcomeRating,Safety_and_Health_Considerations_Qwen3-32B,llm_judge,2
CoGymTravelOutcome_outcomeRating,outcomeRating_Qwen3-32B_examples,examples_judge,2
CoGymTravelOutcome_outcomeRating,Customization_to_User_Preferences_Qwen3-32B,llm_judge,1
CoGymTravelOutcome_outcomeRating,Relevance_to_User_Preferences_Rubric,rubric,1
CoGymTravelOutcome_outcomeRating,cogym_travel_outcome_outcomeRating_Qwen3-32B_optimized_seed43,optimized_judge,1
EvalGenProduct_grade,grade_Qwen3-32B_examples,examples_judge,5
EvalGenProduct_grade,Formatting_Compliance_Qwen3-32B,llm_judge,4
EvalGenProduct_grade,Avoidance_of_Weaknesses_Qwen3-32B,llm_judge,3
EvalGenProduct_grade,Call_to_Action__CTA__Strength_Qwen3-32B,llm_judge,2
EvalGenProduct_grade,Customer_Review_Integration_Rubric,rubric,2
EvalGenProduct_grade,Feature-Benefit_Distinction_Rubric,rubric,2
EvalGenProduct_grade,Conciseness_and_Word_Count_Qwen3-32B,llm_judge,1
EvalGenProduct_grade,Feature_vs__Benefit_Clarity_Qwen3-32B,llm_judge,1
EvalGenProduct_grade,Markdown_Formatting_Compliance_Qwen3-32B,llm_judge,1
EvalGenProduct_grade,SEO_Optimization_Rubric,rubric,1
EvalGenProduct_grade,evalgen_product_grade_Qwen3-32B_optimized_seed42,optimized_judge,1
EvalGenProduct_grade,evalgen_product_grade_Qwen3-32B_optimized_seed43,optimized_judge,1
EvalGenProduct_grade,evalgen_product_grade_Qwen3-32B_optimized_seed44,optimized_judge,1
HelpSteer2_helpfulness,GRMRewardModel,named_metric,5
HelpSteer2_helpfulness,INFORMRewardModel,named_metric,5
HelpSteer2_helpfulness,Relevance_to_the_Query_Qwen3-32B,llm_judge,4
HelpSteer2_helpfulness,Accuracy_and_Correctness_Qwen3-32B,llm_judge,3
HelpSteer2_helpfulness,helpfulness_Qwen3-32B_examples,examples_judge,3
HelpSteer2_helpfulness,HelpSteer2_helpfulness_Qwen3-32B_optimized_seed42,optimized_judge,1
HelpSteer2_helpfulness,HelpSteer2_helpfulness_Qwen3-32B_optimized_seed43,optimized_judge,1
HelpSteer2_helpfulness,HelpSteer2_helpfulness_Qwen3-32B_optimized_seed44,optimized_judge,1
HelpSteer2_helpfulness,HelpSteer2_helpfulness_Qwen3-32B_optimized_seed45,optimized_judge,1
HelpSteer2_helpfulness,HelpSteer2_helpfulness_Qwen3-32B_optimized_seed46,optimized_judge,1
RealHumanEval_accepted,Code_Readability_Qwen3-32B,llm_judge,5
RealHumanEval_accepted,GRMRewardModel,named_metric,5
RealHumanEval_accepted,INFORMRewardModel,named_metric,5
RealHumanEval_accepted,Correctness_of_Core_Logic_Qwen3-32B,llm_judge,4
RealHumanEval_accepted,Adherence_to_PEP8_Best_Practices_Qwen3-32B,llm_judge,2
RealHumanEval_accepted,Modularity_and_Reusability_Qwen3-32B,llm_judge,2
RealHumanEval_accepted,Python_Idioms_Rubric,rubric,1
RealHumanEval_accepted,RealHumanEval_accepted_Qwen3-32B_optimized_seed44,optimized_judge,1
SimpEval_score,Audience_Appropriateness_Qwen3-32B,llm_judge,5
SimpEval_score,Readability_Score_Qwen3-32B,llm_judge,4
SimpEval_score,Conciseness_Qwen3-32B,llm_judge,3
SimpEval_score,Simplification_of_Vocabulary_Qwen3-32B,llm_judge,3
SimpEval_score,Avoidance_of_Jargon_and_Specialized_Terms_Qwen3-32B,llm_judge,2
SimpEval_score,Clarity_and_Readability_Rubric,rubric,2
SimpEval_score,Clarity_and_Simplicity_of_Language_Rubric,rubric,2
SimpEval_score,ParaScoreFree,named_metric,2
SimpEval_score,Logical_Flow_Qwen3-32B,llm_judge,1
SimpEval_score,Sentence_Structure_Qwen3-32B,llm_judge,1
