import json
import os

from datasets import load_dataset

# Dataset identifier passed to `datasets.load_dataset`, and the path of the
# JSONL file this script writes.
DATASET_NAME = "manu/embedding_data_v2_100k"
OUTPUT_FILE = "data/embedding_data.jsonl"
ds = load_dataset(DATASET_NAME)

# Make sure the destination directory exists; otherwise `open(..., "w")` fails
# with FileNotFoundError on a fresh checkout. The emptiness guard keeps this
# working if OUTPUT_FILE is ever changed to a bare filename.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Export the "train" split as JSONL: one JSON object per line with the keys
# "query" (from text1), "pos" (single-element list holding text2) and "neg"
# (always an empty list).
# The encoding is pinned so the result does not depend on the platform default.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for example in ds["train"]:
        query = example["text1"]
        positive = example["text2"]
        # Skip rows where either side is empty. Truthiness also skips None
        # values, which `len()` would have raised TypeError on.
        if not (query and positive):
            continue
        record = {
            "query": query,
            "pos": [positive],
            "neg": [],
        }
        f.write(json.dumps(record) + "\n")
