"""
HellaSwag: Can a Machine Really Finish Your Sentence?

Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi
https://arxiv.org/abs/1905.07830
"""

import os
from typing import Any

from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice, system_message

# System prompt passed to `system_message()` in `hellaswag()`; it is omitted
# when the NO_SYSTEM_PROMPT environment variable is set to a non-empty value.
# NOTE: the literal starts and ends with a newline character.
SYSTEM_MESSAGE = """
Choose the most plausible continuation for the story.
"""


@task
def hellaswag() -> Task:
    """Build the Inspect ``Task`` for the HellaSwag benchmark.

    The task reads the ``validation`` split of ``Rowan/hellaswag``, converts
    each record with ``record_to_sample``, and scores answers with
    ``choice()``.

    The solver chain is ``multiple_choice()``, preceded by a system message
    carrying ``SYSTEM_MESSAGE`` unless the ``NO_SYSTEM_PROMPT`` environment
    variable is set to a non-empty value.

    Returns:
        The configured ``Task``.
    """
    samples = hf_dataset(
        path="Rowan/hellaswag",
        split="validation",
        sample_fields=record_to_sample,
        trust=True,
        auto_id=True,
        shuffle=True,
    )

    # Any non-empty value of NO_SYSTEM_PROMPT disables the system prompt.
    solver_chain = []
    if not os.environ.get("NO_SYSTEM_PROMPT"):
        solver_chain.append(system_message(SYSTEM_MESSAGE))
    solver_chain.append(multiple_choice())

    return Task(dataset=samples, solver=solver_chain, scorer=choice())


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one raw dataset record into an Inspect ``Sample``.

    Args:
        record: A mapping that provides the keys ``"ctx"``, ``"label"``,
            ``"endings"`` and ``"source_id"``.

    Returns:
        A ``Sample`` whose input is the ``"ctx"`` value, whose choices are
        the ``"endings"`` value, and whose target is the letter for the
        integer-converted ``"label"`` (0 -> "A", 1 -> "B", ...). The
        ``"source_id"`` value is kept in the sample metadata.
    """
    context = record["ctx"]
    # Offset from "A" by the numeric label to get the answer letter.
    answer_letter = chr(int(record["label"]) + ord("A"))
    endings = record["endings"]
    return Sample(
        input=context,
        target=answer_letter,
        choices=endings,
        metadata={"source_id": record["source_id"]},
    )
