import numpy as np
from scipy.spatial import distance

from utils.aggregations import plurality
from utils.helpers import bound, compile_param_list
from configurations.constants import Constants


"""
Based on the pseudo-code in: Li et. al., "Resolving Conflicts in Heterogeneous Data by Truth
Discovery and Source Reliability Estimation"
Link: https://dl.acm.org/doi/pdf/10.1145/2588555.2610509
INPUT: Continuous data
PARAMS:
        - iterations: # of repetitions of estimating the answers and weighting the workers by the estimations.
"""


def CRH(data_params, params):
    """Aggregate continuous answers with iteratively re-estimated row weights (CRH).

    Starts from the unweighted column-wise mean of the answers, then repeats for
    ``params["iterations"]`` rounds: measure each row's distance from the
    current aggregate, turn the distances into weights ``-log(d_i / sum(d))``
    normalised to sum to one, and recompute the weighted column-wise mean.

    Args:
        data_params: Mapping with
            - ``'df'``: 2-D DataFrame of answers; one weight is applied per row
              (presumably rows are workers and columns are questions --
              confirm against the caller).
            - ``'pairwise_distance_function'``: callable invoked as
              ``f(values, aggregate.reshape(1, -1))``; expected to yield one
              distance per row of ``values`` (TODO confirm exact shape).
        params: Mapping with
            - ``'iterations'`` (required): number of re-weighting rounds.
            - ``'name'`` (optional): label reported as ``'af_name'``;
              defaults to ``'CRH'``.

    Returns:
        dict with keys ``'af_name'``, ``'af_params'``, ``'outcome'`` (1-D array
        of aggregated answers, one per column) and ``'weights'`` (weights from
        the final round, or uniform weights when no round was run).
    """
    name = params.get("name", 'CRH')
    df_values = data_params['df'].values
    pairwise_distance_function = data_params["pairwise_distance_function"]
    n_rows, _ = df_values.shape

    # Initial estimate: plain mean.  Kept 1-D so 'outcome' has the same shape
    # regardless of how many rounds run.
    weights = np.ones(n_rows)
    agg_answers = np.average(df_values, axis=0, weights=weights)

    # Reported as-is when iterations == 0, so 'weights' is always defined.
    w_i_t = weights / n_rows

    for _ in range(params["iterations"]):
        d_i_t = pairwise_distance_function(df_values, agg_answers.reshape(1, -1))
        d_i_t = bound(Constants.fault_min, Constants.fault_max, d_i_t)
        w_i_t = -np.log(d_i_t / d_i_t.sum())
        total = w_i_t.sum()
        if total == 0:
            # Every normalised distance equals 1 (e.g. a single row), so the
            # log-weights are all zero; dividing would give NaN.  Fall back to
            # uniform weights instead.
            w_i_t = np.full_like(w_i_t, 1.0 / w_i_t.size)
        else:
            w_i_t = w_i_t / total
        agg_answers = np.average(df_values, axis=0, weights=w_i_t.reshape(-1))

    return {'af_name': name, 'af_params': compile_param_list(params), 'outcome': agg_answers, 'weights': w_i_t}


"""
Based on Jing Gao et al [AAAi'14] paper.
This is the same algorithm as CRH for continuous data.

Categorical data only
"""


def PMTD(data_params, params):
    """Aggregate categorical answers by iteratively re-weighted plurality voting.

    Each round takes a weighted plurality vote, measures every row's Hamming
    distance from the voted answers, and converts the distances into weights
    ``log(sum(distances) / distance_i)`` normalised to sum to one.  Rounds stop
    early once the weight vector changes by less than
    ``Constants.convergence_limit`` (Euclidean norm).

    Args:
        data_params: Mapping with
            - ``'df'``: DataFrame of answers; each row is compared against the
              voted answers (presumably one row per worker -- confirm).
            - ``'possible_answers'``: forwarded unchanged to ``plurality``.
        params: Mapping with optional keys
            - ``'iterations'`` (default 1): maximum number of rounds.
            - ``'name'`` (default ``' (Gao)'``): label reported as ``'af_name'``.
            - ``'positive'`` (default True): clamp negative weights to zero
              before normalising.

    Returns:
        dict with keys ``'af_name'``, ``'af_params'``, ``'outcome'`` (answers
        from the last vote), ``'estimated_fault'`` (per-row distances from the
        last round; all ones if no round ran), ``'weights'``, ``'alpha'``
        (zero-based index of the last round executed) and ``'iterations'``
        (number of rounds executed).
    """
    max_rounds = params.get('iterations', 1)
    name = params.get('name', ' (Gao)')
    positive = params.get('positive', True)

    df = data_params['df']
    possible_answers = data_params['possible_answers']
    n = df.shape[0]

    # Weight given to a row whose distance is below the convergence limit,
    # where the log-ratio would otherwise blow up.
    zero_distance_weight = 1000000.0

    weights = np.ones(n) / n
    distances = np.ones(n)
    answers = None
    last_iter = 0
    rounds_run = 0
    for step in range(max_rounds):
        old_weights = weights
        last_iter = step
        rounds_run = step + 1
        answers = plurality(df, possible_answers, weights, signed=False)
        distances = np.array(
            [distance.hamming(df.iloc[i], answers) for i in range(n)],
            dtype=float,
        )
        sum_dist = np.sum(distances)

        near_zero = distances < Constants.convergence_limit
        weights = np.full(n, zero_distance_weight)
        weights[~near_zero] = np.log(sum_dist / distances[~near_zero])
        if positive:
            weights = np.maximum(weights, 0)

        sum_weights = np.sum(weights)
        if sum_weights < Constants.convergence_limit:
            # Nothing to normalise by: fall back to uniform weights.
            weights = np.ones(n) / n
        else:
            weights = weights / sum_weights

        if np.linalg.norm(weights - old_weights) < Constants.convergence_limit:
            break

    if rounds_run == 0:
        # No round was requested (iterations <= 0): report the vote under the
        # initial uniform weights instead of failing on an unbound name.
        answers = plurality(df, possible_answers, weights, signed=False)

    return {'af_name': name, 'af_params': compile_param_list(params), 'outcome': answers,
            'estimated_fault': distances, 'weights': weights, 'alpha': last_iter,
            'iterations': rounds_run}
