# This script divides the train data into two partitions.
# Thereby, we want to make sure that in DI, we don't have to deal with distribution shifts.
import csv
import os
import pandas as pd
import numpy as np



# Fraction of the (de-duplicated) rows written to the first partition; the
# remainder goes to the second one.
TRAIN_SPLIT = 0.80
# Rounded so that floating point noise (e.g. 0.19999999999999996) does not
# end up in the output file name.
TEST_SPLIT = np.round(1. - TRAIN_SPLIT, 2)

# Validate the configuration before touching any file: a fraction outside
# (0, 1) would leave one partition empty or produce nonsensical slices.
if not 0. < TRAIN_SPLIT < 1.:
    raise ValueError(
        "TRAIN_SPLIT must be strictly between 0 and 1, got {}".format(TRAIN_SPLIT))

# The output names are derived only from the split fractions, so e.g.
# TRAIN_SPLIT = 0.5 would map both partitions to the same file and the second
# write would silently overwrite the first.
train_path = 'qqp_train-{}.csv'.format(TRAIN_SPLIT)
test_path = 'qqp_train-{}.csv'.format(TEST_SPLIT)
if train_path == test_path:
    raise ValueError(
        "Both partitions would be written to '{}'; choose a TRAIN_SPLIT "
        "that yields distinct file names".format(train_path))

# Not used in the visible part of the script; kept in case later code needs it.
cwd = os.getcwd()

# header=None: the file has no header row, so the first line is read as data.
full_train = pd.read_csv(
    'qqp_train.csv',
    sep=',', quoting=csv.QUOTE_ALL, header=None)

total_size = full_train.shape[0]
print("Initial size: ", total_size)

# Duplicates should already have been removed during export, but the train
# data has been seen to contain them - better to be safe and drop them again.
full_train = full_train.drop_duplicates()

total_size = full_train.shape[0]
print("Size after dropping duplicates: ", total_size)

train_split_size = round(TRAIN_SPLIT * total_size)
test_split_size = total_size - train_split_size

# NOTE(review): the split is purely positional (leading rows -> first
# partition, trailing rows -> second partition) and nothing is shuffled here.
# If qqp_train.csv is ordered in any way, the two partitions may differ in
# distribution - confirm that the export already shuffles the rows.
train_data = full_train.iloc[:train_split_size]
test_data = full_train.iloc[train_split_size:]

# Written in the same shape as the input: no index column, no header row,
# every field quoted.
train_data.to_csv(train_path, index=False, quoting=csv.QUOTE_ALL, header=False)
test_data.to_csv(test_path, index=False, quoting=csv.QUOTE_ALL, header=False)