### Original code from MoFlow (under MIT License) https://github.com/calvin-zcx/moflow
import os
import sys
# for linux env.
sys.path.insert(0, os.getcwd())
import pandas as pd
import argparse
import time
from utils.data_frame_parser import DataFrameParser
from utils.numpytupledataset import NumpyTupleDataset
from utils.smile_to_graph import GGNNPreprocessor


# Command-line interface. The parser gets its own name (`arg_parser`) so it is
# not confused with the DataFrameParser that is created further down.
arg_parser = argparse.ArgumentParser(
    description='Preprocess a SMILES dataset and save it as a .npz file.'
)
arg_parser.add_argument(
    '--data_name',
    type=str,
    default='zinc250k',
    help="name of the dataset to preprocess; currently only 'zinc250k' "
         "is accepted (default: %(default)s)",
)
args = arg_parser.parse_args()

# The clock starts after argument parsing; it is read again at the very end
# of the script to report the total run time.
start_time = time.time()
data_name = args.data_name
# Echo the effective arguments so they show up in the run log.
print(vars(args))

# Per-dataset value passed to the preprocessor as `out_size`
# (presumably the largest molecule size in atoms -- TODO confirm).
# A dataset name that is not listed here is rejected outright.
max_atoms_by_dataset = {'zinc250k': 38}
if data_name not in max_atoms_by_dataset:
    raise ValueError(f"[ERROR] Unexpected value data_name={data_name}")
max_atoms = max_atoms_by_dataset[data_name]

# Graph preprocessor, configured with the size chosen above and with
# kekulization switched on.
preprocessor = GGNNPreprocessor(out_size=max_atoms, kekulize=True)

# Directory that receives the preprocessed output; created if missing.
data_dir = "preprocess_data"
os.makedirs(data_dir, exist_ok=True)

print('Preprocessing zinc250k data')
zinc_csv_path = 'data/zinc250k.csv'
# The first CSV column is used as the DataFrame index.
zinc_df = pd.read_csv(zinc_csv_path, index_col=0)

# NOTE (carried over from the upstream script): the reference loader in
# chain_chemistry\datasets\zinc.py strips '\n' from the 'smiles' column.
# That stripping is deliberately NOT done here; the original author's stated
# reason is that it represents atom N with a single bond.
# NOTE(review): that rationale is unverified -- confirm against the CSV.

# Every column after the first one is treated as a label. Per the original
# note these are expected to be ['logP', 'qed', 'SAS'] -- TODO confirm
# against the CSV header.
labels = zinc_df.columns.tolist()[1:]

# Convert each row's 'smiles' entry with the preprocessor and collect the
# label columns alongside it.
df_parser = DataFrameParser(preprocessor, labels=labels, smiles_col='smiles')
parsed = df_parser.parse(zinc_df, return_smiles=True)
# `smiles` is extracted but not used again in the visible part of this script.
dataset, smiles = parsed['dataset'], parsed['smiles']

# Persist the dataset under a name derived from the selected dataset.
output_path = os.path.join(data_dir, f'{data_name}_relgcn_kekulized_ggnp.npz')
NumpyTupleDataset.save(output_path, dataset)

# Report the wall-clock time since `start_time` as HH:MM:SS.
elapsed_seconds = time.time() - start_time
print('Total time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds)))
