import pandas as pd
import json
import requests
import io
# Location of the BaseFakepedia data; the whole response body is parsed as one
# JSON document below.
url = 'https://raw.githubusercontent.com/kdu4108/context-vs-prior-finetuning/refs/heads/main/data/BaseFakepedia/base_fakepedia.json'
# Bound the request so a stalled connection cannot hang the script forever.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error (404, 5xx, ...) instead of feeding an error page
# to the JSON parser and getting a misleading decode failure.
response.raise_for_status()

dataset = json.loads(response.content.decode('utf-8'))

import random
# Fixed seed so the in-place shuffle yields the same order on every run.
random.seed(325)
random.shuffle(dataset)


from datasets import Dataset, load_dataset

# Build the Hugging Face dataset from the records that were already downloaded
# and shuffled above. Re-reading the same URL with ``load_dataset`` at this
# point would rebind ``dataset`` to a fresh copy in file order, silently
# discarding the seeded shuffle (and fetching the file a second time), so the
# train/val/test slices taken later would not be shuffled at all.
dataset = Dataset.from_list(dataset)

def construct_content(x):
    """Attach a ``prompt`` field to one example and hand the example back.

    The prompt puts the example's ``fact_paragraph`` on a ``Context:`` line
    and its ``query`` on a ``Query:`` line, followed by the instruction to
    answer in one word. ``x`` is updated in place and the same object is
    returned.
    """
    context = x["fact_paragraph"]
    question = x["query"]
    x["prompt"] = f"Context: {context}\nQuery: {question}. Answer in one word."
    return x
# Add the "prompt" field to every example.
dataset = dataset.map(construct_content)

NUM_TRAIN = 10

# Each output directory paired with the contiguous row range written to it:
# the first NUM_TRAIN rows, then the next 100, then the 100 after that.
_split_ranges = [
    ("context/train", range(NUM_TRAIN)),
    ("context/val", range(NUM_TRAIN, NUM_TRAIN + 100)),
    ("context/test", range(NUM_TRAIN + 100, NUM_TRAIN + 200)),
]
for _split_dir, _rows in _split_ranges:
    dataset.select(_rows).save_to_disk(_split_dir)
