
import os
import sys

import pandas as pd

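# Reference schema, pasted from an IPython session. `d` is presumably one
# record of the target MTD format (TODO confirm); note that main() only
# derives `content_text` and `reference_text` from the input's `syms` column.
# In [24]: d.keys()
# Out[24]: dict_keys(['author_id', 'content_text', 'content_action_type', 'reference_text', 'reference_action_type'])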

def main(
    amazon_fname: str = "/data1/yubnub/data/style_transfer/amazon_32-128tok_16post_filtered.jsonl",
    outdir: str = "/data1/yubnub/data/style_transfer/mtd",
    n_calibration: int = 10_000,
    n_mtd: int = 12_000,
    random_state: int = 43,
) -> int:
    """Shuffle the Amazon JSONL dump and write two disjoint splits.

    Each input row must carry a ``syms`` column holding a sequence: its first
    element becomes ``content_text`` and the remaining elements become
    ``reference_text``. The ``syms`` column itself is dropped; every other
    input column is passed through unchanged.

    Two JSON-lines files are written into ``outdir``:

    * ``MTD_amazon_preference_<n_calibration>.jsonl`` -- the first
      ``n_calibration`` rows of the shuffled data.
    * ``MTD_amazon_<n_mtd>.jsonl`` -- the next ``n_mtd`` rows.

    Args:
        amazon_fname: Path of the JSON-lines input file.
        outdir: Directory receiving the output files; created if missing.
        n_calibration: Number of rows in the calibration/preference split.
        n_mtd: Number of rows in the MTD split.
        random_state: Seed for the shuffle, so the splits are reproducible.

    Returns:
        ``0``, suitable for passing to ``sys.exit``.

    Raises:
        KeyError: If the input has no ``syms`` column.
    """
    df = pd.read_json(amazon_fname, lines=True)

    # Seeded full shuffle: the same rows land in the same split on every run.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    syms = df["syms"]
    df["content_text"] = syms.apply(lambda texts: texts[0])
    df["reference_text"] = syms.apply(lambda texts: texts[1:])
    df = df.drop(columns="syms")

    # Consecutive, non-overlapping windows of the shuffled frame.
    split_point = n_calibration
    calibration_df = df.iloc[:split_point]
    mtd_df = df.iloc[split_point:split_point + n_mtd]

    # The file names advertise the row counts, so flag a short dataset instead
    # of silently producing smaller splits. Still best-effort: files are
    # written either way.
    requested = n_calibration + n_mtd
    if len(df) < requested:
        print(
            f"warning: {amazon_fname} has {len(df)} rows but {requested} were "
            f"requested; wrote {len(calibration_df)} calibration and "
            f"{len(mtd_df)} MTD rows",
            file=sys.stderr,
        )

    # to_json does not create missing parent directories.
    os.makedirs(outdir, exist_ok=True)

    calibration_df.to_json(
        os.path.join(outdir, f"MTD_amazon_preference_{n_calibration}.jsonl"),
        lines=True,
        orient="records",
    )
    mtd_df.to_json(
        os.path.join(outdir, f"MTD_amazon_{n_mtd}.jsonl"),
        lines=True,
        orient="records",
    )

    return 0

# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())