{
    "name": "@hellaswag",
    "dataset_name": "hellaswag",
    "dataset_pretty_name": "HellaSwag",
    "dataset_description": "HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.",
    "model_name": "",
    "score": 0.265,
    "metrics": [
        {
            "name": "AverageAccuracy",
            "num": 200,
            "score": 0.265,
            "macro_score": 0.265,
            "categories": [
                {
                    "name": [
                        "default"
                    ],
                    "num": 200,
                    "score": 0.265,
                    "macro_score": 0.265,
                    "subsets": [
                        {
                            "name": "default",
                            "score": 0.265,
                            "num": 200
                        }
                    ]
                }
            ]
        }
    ],
    "analysis": "N/A"
}