import random

import joblib
import numpy as np
import pandas as pd

import cfg
from src.experiments.active_learning import conduct


def _build_tasks(experiment, embedding, y_true, fname_dict):
    """Return one task tuple per (random seed, sampling method) combination.

    Seeds run over ``range(cfg.random_seeds)`` and methods over
    ``cfg.sampling_methods``; seeds form the outer loop. Each tuple is laid
    out as ``(random_seed, sampling_method, experiment, embedding, y_true,
    fname_dict)``.
    """
    return [
        (random_seed, sampling_method, experiment, embedding, y_true, fname_dict)
        for random_seed in range(cfg.random_seeds)
        for sampling_method in cfg.sampling_methods
    ]


def run():
    """Build and run the 'single_label' active learning experiment.

    Loads the embedding matrix and the metadata table, down-samples the
    unlabelled rows, and creates one group of tasks per label column plus one
    group that uses all label columns together. All tasks are then passed to
    ``conduct`` along with a frame holding the initial subset assignment.
    """
    # load embeddings and metadata
    embedding = np.load(cfg.path_embedding)
    df = pd.read_csv(cfg.path_metadata)

    # lower the nr of unlabelled samples to iterate over all
    unlabelled_rows = df[df['subset'] == cfg.tag_unlabelled]
    # never request more rows than exist: DataFrame.sample raises a
    # ValueError when n exceeds the population (sampling without replacement)
    n_samples = min(cfg.single_label_samples, len(unlabelled_rows))
    selected_indices = unlabelled_rows.sample(n=n_samples, random_state=0).index
    other_indices = df[df['subset'] != cfg.tag_unlabelled].index
    df = df.loc[selected_indices.union(other_indices)]
    # NOTE(review): the index labels are used as positional row numbers here,
    # which assumes the CSV rows are aligned with the embedding rows -- confirm
    embedding = embedding[df.index, :]

    # create df with subset only
    df_subset = df[['subset']].copy()
    df_subset.rename(columns={'subset': cfg.get_iteration_col(0)}, inplace=True)

    # single label experiment
    # NOTE(review): the all-classes tasks below reuse this experiment name
    experiment = 'single_label'

    # label columns are identified once, by prefix, so the single-label tasks
    # and the all-classes task are guaranteed to cover the same columns
    label_columns = [col for col in df.columns if col.startswith(cfg.label_prefix)]

    # generate list of tasks: one group per label
    tasks = []
    for label in label_columns:
        # get y_true data as a column vector
        y_true = df[label].to_numpy().reshape(-1, 1)

        # create unique hash from label
        fname_dict = {'label_hash': str(joblib.hash(label)),
                      'nr_classes': 1,
                      'classes': [label]}

        tasks.extend(_build_tasks(experiment, embedding, y_true, fname_dict))

    # create a task with all classes
    # create df with labels only, cols sorted by occurrence (column sum, ascending)
    df_label = df[label_columns]
    sorted_columns = df_label.sum().sort_values(ascending=True).index
    df_label = df_label[sorted_columns]
    # get y_true data
    y_true = df_label.to_numpy()
    # create unique hash from the ordered list of labels
    class_names = df_label.columns.to_list()
    fname_dict = {'label_hash': str(joblib.hash(class_names)),
                  'nr_classes': len(class_names),
                  'classes': class_names}

    # use the configured number of seeds here too (was a hard-coded 30)
    tasks.extend(_build_tasks(experiment, embedding, y_true, fname_dict))

    # conduct active learning experiment
    conduct(tasks, df_subset, experiment=experiment)
