import numpy as np


def gather_confounders(documents, config):
    """Collect the distinct confounder values that occur in *documents*.

    The field to read is taken from ``config['dataset_reader']['confounder']``.
    When that setting is ``None`` the documents are not inspected at all.

    Args:
        documents: iterable of mappings; each one is indexed with the
            configured confounder field name.
        config: nested mapping providing ``['dataset_reader']['confounder']``.

    Returns:
        A sorted list of the unique confounder values, or ``None`` when no
        values were collected (no field configured, or no documents).

    Side effects:
        Prints the number of distinct values found.
    """
    field_name = config['dataset_reader']['confounder']

    if field_name is None:
        unique_values = set()
    else:
        unique_values = {doc[field_name] for doc in documents}

    print("Number of confounder values = {:d}".format(len(unique_values)))

    # An empty collection is reported as None rather than as an empty list.
    return sorted(unique_values) if unique_values else None


def learn_confound_matrix(documents, config, confounder_list, label_list):
    """Learn a mapping from confounder values to scaled log label probabilities.

    Each document contributes one count to the cell addressed by its
    confounder value (row) and its label (column). Every row is then
    smoothed by adding 1e-5 to each cell, normalised to sum to one,
    multiplied by 10 and passed through the natural logarithm.

    Args:
        documents: iterable of mappings holding the confounder and label
            fields named in ``config['dataset_reader']``.
        config: nested mapping providing ``['dataset_reader']['confounder']``
            and ``['dataset_reader']['label_field_name']``.
        confounder_list: sequence of confounder values; position gives the
            row index.
        label_list: sequence of label values; position gives the column index.

    Returns:
        ``numpy.ndarray`` of shape ``(len(confounder_list), len(label_list))``.

    Raises:
        KeyError: if a document carries a confounder or label value that is
            absent from the supplied lists.
    """
    reader_cfg = config['dataset_reader']
    confounder_field_name = reader_cfg['confounder']
    label_field_name = reader_cfg['label_field_name']

    row_of = {value: row for row, value in enumerate(confounder_list)}
    col_of = {value: col for col, value in enumerate(label_list)}

    counts = np.zeros((len(confounder_list), len(label_list)))
    for doc in documents:
        row = row_of[doc[confounder_field_name]]
        col = col_of[doc[label_field_name]]
        counts[row, col] += 1.0

    # Additive smoothing keeps unseen (confounder, label) pairs finite in log space.
    smoothed = counts + 1e-5
    row_totals = smoothed.sum(axis=1, keepdims=True)
    return np.log((smoothed / row_totals) * 10)


def encode_confounders_separately(documents, config, confounders=None, confound_matrix=None):
    """Encode each document's confounder value as a feature vector.

    With a ``confound_matrix`` the vector for a document is the matrix row
    belonging to its confounder value. Without one, the vector is a one-hot
    indicator over ``confounders``. (Previously this case crashed with a
    ``TypeError`` because the missing matrix was indexed.)

    Args:
        documents: sized sequence of mappings holding the field named by
            ``config['dataset_reader']['confounder']``.
        config: nested mapping providing ``['dataset_reader']['confounder']``.
        confounders: ordered sequence of known confounder values; position
            gives the row index (or the one-hot column). ``None`` disables
            encoding.
        confound_matrix: optional 2-D ``numpy.ndarray`` with one row per
            entry of ``confounders``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(documents), outdim)`` where
        ``outdim`` is the matrix's column count, or ``len(confounders)`` in
        the one-hot case. ``None`` when ``confounders`` is ``None``.

    Raises:
        KeyError: if a document's confounder value is not in ``confounders``.
    """
    dataset_reader = config["dataset_reader"]
    confound_field_name = dataset_reader['confounder']

    if confounders is None:
        return None

    n_confounders = len(confounders)
    confounder_index = {value: i for i, value in enumerate(confounders)}
    n_docs = len(documents)

    # Explicit integer dtype keeps the index array valid when there are no documents.
    row_ids = np.array(
        [confounder_index[doc[confound_field_name]] for doc in documents],
        dtype=int,
    )

    if confound_matrix is None:
        # No learned matrix: fall back to a one-hot indicator per document.
        confounder_vectors = np.zeros((n_docs, n_confounders))
        confounder_vectors[np.arange(n_docs), row_ids] = 1.0
        return confounder_vectors

    _, outdim = confound_matrix.shape
    confounder_vectors = np.zeros((n_docs, outdim))
    # Copy the matrix row of each document's confounder into its output row.
    confounder_vectors[:] = confound_matrix[row_ids, :]
    return confounder_vectors
