from dotenv import load_dotenv
load_dotenv()
import argparse
from abstract_cf.text_generation.utils import load_dataset
import os

# ClearML project name used for both the task and the uploaded dataset below.
# Read eagerly at import time: a missing CLEARML_PROJECT_NAME environment
# variable raises KeyError here, even when --upload_to_clearml is not passed.
CLEARML_PROJECT_NAME = os.environ['CLEARML_PROJECT_NAME']


def get_parser():
    """Build the command-line parser for the dataset sampling script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--task``, ``--sample_size``,
        ``--sample_split``, ``--output_path`` and the boolean switch
        ``--upload_to_clearml``.
    """
    cli = argparse.ArgumentParser()

    # (flag, value type, default) for every option that takes a value
    valued_options = (
        ('--task', str, 'profession'),
        ('--sample_size', int, 250),
        # typically we sample from `dev` (because we use this sample for the abstract cf evaluation)
        ('--sample_split', str, 'dev'),
        ('--output_path', str, 'model_data/dataset_samples/sample.csv'),
    )
    for flag, value_type, fallback in valued_options:
        cli.add_argument(flag, type=value_type, default=fallback)

    # boolean switch: stays False unless given on the command line
    cli.add_argument('--upload_to_clearml', action='store_true')
    return cli


def ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it does not exist.

    Args:
        path: file path whose parent directory should be present.

    A bare filename (no directory component) is left alone: its dirname is
    the empty string, and ``os.makedirs('')`` raises ``FileNotFoundError``.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    # Pick the requested split and draw `sample_size` rows from it.
    # NOTE(review): the draw is not seeded here, so repeated runs may select
    # different rows — confirm whether a reproducible sample is needed.
    dataset = load_dataset(args.task)[args.sample_split]
    dataset = dataset.sample(args.sample_size)

    # create paths if they don't exist (also works for a bare filename)
    ensure_parent_dir(args.output_path)
    dataset.to_csv(args.output_path, index=False)

    if args.upload_to_clearml:
        # imported lazily so clearml is only required when uploading
        from clearml import Task, Dataset
        # create a task for the data processing
        task = Task.init(
            project_name=CLEARML_PROJECT_NAME,
            task_name=f'dataset.{args.task}_size.{args.sample_size}',
            task_type='data_processing'
        )

        # Create and upload the dataset to ClearML
        dataset_name = f'{args.task}_sample_{args.sample_size}'
        dataset_project = CLEARML_PROJECT_NAME

        # Create a new dataset
        clearml_dataset = Dataset.create(
            dataset_name=dataset_name,
            dataset_project=dataset_project
        )

        # Add the local file to the dataset
        clearml_dataset.add_files(args.output_path)

        # Upload and finalize the dataset
        clearml_dataset.upload()
        clearml_dataset.finalize()

        print(f"Dataset uploaded to ClearML with ID: {clearml_dataset.id}")

# TODO: using 'task' to refer to a dataset is confusing, because it clashes with
# ClearML's own `Task` concept; rename it across the whole codebase.
