# Script that creates the pretrain_data directory from the PhysioNet EDF files.
# Each run is used in its entirety as input (recordings are not split into segments).

import os
import csv
import tqdm
import numpy as np
import mne

# Print a start-up message to stdout so the script's launch is visible in the output.
print('running create_pretrain_structure.py')

def load_data_edf(path):
    """Read the EDF file at *path* with MNE and return its signal data.

    The file is opened with ``preload=True`` and the result of
    ``Raw.get_data()`` is returned unchanged. The caller indexes the result
    along axis 0 and treats the values as volts.
    """
    raw = mne.io.read_raw_edf(path, preload=True)
    return raw.get_data()

# Input: root of the PhysioNet eegmmidb download. Every sub-directory of it is
# scanned for .edf recordings.
SOURCE_ROOT = "/scratch/sem23h11/physionet.org/files/eegmmidb/1.0.0"
# Output: one directory per recording holding one .npy file per row of the
# loaded data array, plus a manifest listing every file written.
OUTPUT_ROOT = "/scratch/sem23h11/BrainBERT/pretrain_data"
MANIFEST_PATH = OUTPUT_ROOT + "/manifests/manifest.tsv"
EDF_SUFFIX = ".edf"

# makedirs(exist_ok=True) rather than mkdir: the script can be re-run over an
# existing output tree, and missing parent directories are created as needed.
os.makedirs(OUTPUT_ROOT + "/manifests", exist_ok=True)

# Opening in write mode truncates a manifest left over from a previous run, so
# no explicit delete is needed. newline="" lets the csv writer control line
# endings itself, as the csv module documentation requires.
with open(MANIFEST_PATH, "wt", newline="") as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")

    # The loop below addresses the input files relative to the dataset root.
    os.chdir(SOURCE_ROOT)

    # os.listdir returns entries in arbitrary order; sorting keeps the manifest
    # row order reproducible between runs.
    for subdir in sorted(os.listdir(SOURCE_ROOT)):
        # Skip plain files (and anything else that cannot be listed).
        if not os.path.isdir(subdir):
            continue
        os.makedirs(OUTPUT_ROOT + "/" + subdir, exist_ok=True)

        for file_name in sorted(os.listdir(subdir)):
            if not file_name.endswith(EDF_SUFFIX):
                continue

            # Strip only the trailing extension so that a name containing
            # ".edf" elsewhere is not truncated.
            run_name = file_name[:-len(EDF_SUFFIX)]
            run_dir = OUTPUT_ROOT + "/" + subdir + "/" + run_name
            os.makedirs(run_dir, exist_ok=True)

            data = load_data_edf(subdir + "/" + file_name)
            data = data * 10**6  # Conversion from volts to micro-volts

            # Save every row of the array as its own .npy file and record the
            # file path together with the row length in the manifest.
            for i, row in enumerate(data):
                path = run_dir + "/" + str(i) + ".npy"

                np.save(path, row)
                tsv_writer.writerow([path, row.shape[0]])
                print(path)