from pathlib import Path
import re
import random
import json

# Seed the RNG so the shuffle below (and any later random draws) are
# reproducible from run to run.
random.seed(42)

txt = Path("../raw_dates_data.txt").read_text()

# Accumulates (day, month, year, event) tuples. `day` and `month` are kept as
# the strings captured from the text; `year` is the int taken from the most
# recent year line.
dates_and_events = []
current_year = 1900  # in effect until the first year line is encountered
for line in txt.split("\n"):
    # A line starting with "19" or "20" sets the year for the lines after it;
    # only its first four characters are read.
    if line.startswith(("19", "20")):
        current_year = int(line[:4])
        continue

    # Event lines are expected to look like "<Month> <day>: <description>".
    # Lines that do not match this shape are skipped.
    match = re.match(r"(\w+) (\d+): (.+)", line)
    if match is None:
        continue
    month, day, event = match.groups()
    dates_and_events.append((day, month, current_year, event))

random.shuffle(dates_and_events)

def transform_date_and_event(day, month, year, event):
    """Build a four-option "which year?" question for a single event.

    The options are four consecutive years (as strings) arranged so that
    ``year`` lands at a randomly drawn position between 0 and 3.

    Args:
        day: Day of the event; accepted but not used.
        month: Month of the event; accepted but not used.
        year: Year of the event; must support integer arithmetic.
        event: Description text embedded in the question.

    Returns:
        A dict with the keys ``question``, ``choices`` (list of four year
        strings), ``answer`` (index of the correct year within ``choices``)
        and ``correct_year`` (``year`` unchanged).
    """
    # Position of the correct year among the four options.
    answer_index = random.randint(0, 3)
    year_options = [str(year + offset - answer_index) for offset in range(4)]
    return {
        "question": f"When did the following event happen?\n{event}",
        "choices": year_options,
        "answer": answer_index,
        "correct_year": year,
    }


# Convert every parsed event into a multiple-choice record.
data = [transform_date_and_event(*e) for e in dates_and_events]

splits = 5
dev_set_size = 5

# The first `dev_set_size` records become the dev set; everything after them
# is distributed across the split files.
dev_data = data[:dev_set_size]
data = data[dev_set_size:]

# Split i receives records i, i + splits, i + 2 * splits, ... and is written
# as JSON Lines (one JSON object per line).
for i in range(splits):
    with open(f"split_{i}.jsonl", "w") as f:
        for item in data[i::splits]:
            f.write(json.dumps(item) + "\n")

# The dev set is written in the same one-object-per-line format.
with open("dev.jsonl", "w") as f:
    for item in dev_data:
        f.write(json.dumps(item) + "\n")