'''
Unifying script to process 50k dataset
- Remove duplicates
- Check overlap with other subsets (test with train and val, val with train)
- Add rxn insight information to the dataset: reaction type + dict
- Least/most similar reactants
'''
import os

import hydra
import pandas as pd

from setup_path import *
from multiguide.helpers import PROJECT_ROOT
from multiguide.dataset.helpers import remove_duplicated_reactions
from multiguide.dataset.helpers import remove_overlaps_with_other_subsets
from multiguide.dataset.helpers import add_rxn_insight_information, assign_similarity


@hydra.main(config_path='../configs', config_name='config.yaml', version_base=None)
def process_50k(cfg):
    '''
        Run the 50k preprocessing pipeline on one subset and save the result
        as a CSV file.

        Input : <PROJECT_ROOT>/data/<data_dir>/raw/<subset>.csv
        Output: <PROJECT_ROOT>/data/<data_dir>/processed/
                <subset>_with_<similarity_type>_weight<combination_weight>.csv

        Args:
            cfg: Hydra config. The fields data_dir, subset, similarity_type
                and combination_weight are read from cfg.reaction_dataset,
                and cfg itself is forwarded to the overlap-removal and
                similarity-assignment helpers.

        Returns:
            None. The processed dataframe is written to disk, not returned.
    '''
    # locate and read the raw subset
    raw_dir = os.path.join(
        PROJECT_ROOT, 'data', cfg.reaction_dataset.data_dir, 'raw'
    )
    in_path = os.path.join(raw_dir, f'{cfg.reaction_dataset.subset}.csv')
    print(f'Loading dataset from {in_path}')
    reactions = pd.read_csv(in_path)

    # NOTE: the steps below must run in exactly this order
    print('Processing dataset...')
    pipeline = (
        (remove_duplicated_reactions, {}),
        (remove_overlaps_with_other_subsets, {'cfg': cfg}),
        (add_rxn_insight_information, {}),
        (assign_similarity, {'cfg': cfg}),
    )
    for step, step_kwargs in pipeline:
        reactions = step(reactions, **step_kwargs)

    # make sure the output folder exists before writing
    processed_dir = os.path.join(
        PROJECT_ROOT, 'data', cfg.reaction_dataset.data_dir, 'processed'
    )
    os.makedirs(processed_dir, exist_ok=True)

    # the file name encodes the similarity settings used for this run
    weight = cfg.reaction_dataset.combination_weight
    similarity_type = cfg.reaction_dataset.similarity_type
    csv_name = (
        f'{cfg.reaction_dataset.subset}_with_{similarity_type}_weight{weight}.csv'
    )
    out_path = os.path.join(processed_dir, csv_name)
    reactions.to_csv(out_path, index=False)
    print(f'Processed dataset saved to {out_path}')

# Script entry point. `process_50k` is wrapped by `@hydra.main`, so it is
# called without arguments here; hence the pylint suppression below.
if __name__ == '__main__':
    process_50k() # pylint: disable=no-value-for-parameter
