import csv
import os
import pandas as pd

# Base datasets: the full training files that a subset gets mixed into.
FLICKR_BASE_PATH = "../flickr30k/flickr-train1+2-shuffled.csv"
QQP_BASE_PATH = "../qqp/qqp_train.csv"
MNLI_BASE_PATH = "../mnli/mnli_train-train.csv"

# Insert datasets: the subsets that get mixed into the chosen base dataset.
FLICKR_INSERT_PATH = "./subset-data/Subset-train_flickr.csv"
QQP_INSERT_PATH = "./subset-data/Subset-train_qqp.csv"


BASE = "mnli"  # one of: "flickr", "qqp", "mnli"
INSERT = "flickr"  # one of: "flickr", "qqp"

# Each base choice maps to (path, `header` argument for pd.read_csv).
# header=0 makes read_csv consume the first line as a header row; None means
# the first line is already data. Only the MNLI base file is read with a
# header row.
_BASE_SOURCES = {
    "mnli": (MNLI_BASE_PATH, 0),
    "flickr": (FLICKR_BASE_PATH, None),
    "qqp": (QQP_BASE_PATH, None),
}

# Insert subsets are always read without a header row.
_INSERT_SOURCES = {
    "flickr": FLICKR_INSERT_PATH,
    "qqp": QQP_INSERT_PATH,
}

insert_header = None

# set the paths
# Fail immediately on an unknown choice instead of printing and carrying on,
# which would only surface later as a NameError on the undefined path.
if BASE not in _BASE_SOURCES:
    raise ValueError(
        f"unknown BASE {BASE!r}; expected one of {sorted(_BASE_SOURCES)}")
BASE_DATA_PATH, base_header = _BASE_SOURCES[BASE]

if INSERT not in _INSERT_SOURCES:
    raise ValueError(
        f"unknown INSERT {INSERT!r}; expected one of {sorted(_INSERT_SOURCES)}")
INSERT_DATA_PATH = _INSERT_SOURCES[INSERT]


# load the data
# Each file is parsed with its own `header` setting, so a header row (if the
# chosen base has one) is consumed here and never ends up among the data rows.
base_data = pd.read_csv(
    BASE_DATA_PATH,
    sep=',', quoting=csv.QUOTE_ALL, header=base_header)

insert_data = pd.read_csv(
    INSERT_DATA_PATH,
    sep=',', quoting=csv.QUOTE_ALL, header=insert_header)


# Give both frames the same column labels so that concat stacks them row-wise
# instead of aligning on differing names. Assigning two labels raises if
# either file does not have exactly two columns.
base_data.columns = ['A', 'B']
insert_data.columns = ['A', 'B']

# concatenate and shuffle for random training
mixed_data = pd.concat([base_data, insert_data])
# frac=1 returns every row in random order; no seed is set, so the order
# differs from run to run.
mixed_data = mixed_data.sample(frac=1)

output_path = f'./shuffled-data/{INSERT}_mixed_into_{BASE}.csv'
# Create the target directory up front so the write cannot fail on a missing
# folder after all the loading work has been done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Written without index or header row, every field quoted.
mixed_data.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL, header=False)

