from datasets import load_dataset
import random

def is_english(s):
    """Return True when ``s`` can be encoded as pure ASCII, otherwise False.

    Despite the name, no language detection is performed: the only check is
    whether ``s.encode`` with the ASCII codec succeeds.
    """
    ascii_only = True
    try:
        s.encode("ascii")
    except UnicodeEncodeError:
        # The ASCII codec rejected at least one character of ``s``.
        ascii_only = False
    return ascii_only

# Load the 'train' split of the WildChat-1M dataset.
ds = load_dataset("allenai/WildChat-1M")['train']

# How many conversations to draw at random, and the (exclusive) upper bound on
# the whitespace-separated word count of a prompt that is kept.
SAMPLE_SIZE = 5000
MAX_PROMPT_WORDS = 800

length = len(ds)
# Sample indices directly from a range instead of materialising a list with
# one entry per dataset row. The sample size is clamped because random.sample
# raises ValueError when asked for more items than the population holds.
sampled_list = random.sample(range(length), min(SAMPLE_SIZE, length))
print(sampled_list)

print(length, type(ds))

# Collect {'prompt', 'answer'} records built from the first turn of each
# sampled conversation; 'answer' is a single-space placeholder.
new_data = []
for sampled_id in sampled_list:
    conversation = ds[sampled_id]['conversation']
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if conversation[0]['role'] != 'user':
        raise ValueError(
            f"conversation {sampled_id} does not start with a user turn"
        )
    prompt = conversation[0]['content']
    # Keep only prompts under the word limit that pass is_english
    # (an ASCII-only check).
    if len(prompt.split()) < MAX_PROMPT_WORDS and is_english(prompt):
        new_data.append({'prompt': prompt, 'answer': " "})

import json
# Keep at most 64 of the filtered prompts. The request is clamped to the number
# available because random.sample raises ValueError when asked for more items
# than the population holds.
final_count = min(64, len(new_data))
if final_count < 64:
    print("warning: only", final_count, "prompts passed the filters; expected 64")
new_data = random.sample(new_data, final_count)
print("final length", len(new_data))
# ensure_ascii=False writes any non-ASCII characters as-is, so the file is
# opened with an explicit UTF-8 encoding.
with open("wildchat64.json", "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)