import os
import pandas as pd
from ... import const
import numpy as np


# Directory expected to contain 'full_koa_dataset.csv' (read by split_koa).
KOA_RAW_DATA_DIR = const.KOA_RAW_DATA_DIR
# Multiplier applied to the number of unique IDs to get the training cutoff
# (presumably a fraction in [0, 1] -- confirm against const).
TRAINING_SIZE = const.TRAINING_SIZE



def split_koa(random_seed):
    """Split the full KOA dataset into training and testing dataframes.

    The split is done on unique values of the 'ID' column, so every row that
    shares an ID lands in the same set (a patient never appears in both).

    Args:
        random_seed: Seed used both for NumPy's global generator and for the
            shuffle of the unique IDs.

    Returns:
        A ``(train_df, test_df)`` tuple of dataframes, each with a fresh
        0-based index.

    Raises:
        FileNotFoundError: If ``KOA_RAW_DATA_DIR`` does not exist.
    """
    # Seed NumPy's global generator first (happens even if the check below fails)
    np.random.seed(random_seed)

    # Refuse to continue without the raw data directory
    if not os.path.exists(KOA_RAW_DATA_DIR):
        raise FileNotFoundError('Directory ' + KOA_RAW_DATA_DIR + ' does not exist.')

    # Load every row of the dataset
    full_df = pd.read_csv(os.path.join(KOA_RAW_DATA_DIR, 'full_koa_dataset.csv'))

    # Unique IDs in a reproducibly shuffled order
    shuffled_ids = (
        full_df['ID']
        .drop_duplicates(keep='first')
        .sample(frac=1, random_state=random_seed)
        .reset_index(drop=True)
    )

    # The first TRAINING_SIZE share of the IDs (truncated) goes to training,
    # everything after the cutoff goes to testing
    cutoff = int(len(shuffled_ids) * TRAINING_SIZE)
    train_ids = shuffled_ids[:cutoff]
    test_ids = shuffled_ids[cutoff:]

    # Pick the rows belonging to each group of IDs
    id_column = full_df['ID']
    train_df = full_df[id_column.isin(train_ids)].reset_index(drop=True)
    test_df = full_df[id_column.isin(test_ids)].reset_index(drop=True)

    return train_df, test_df