import numpy as np

from utils.helpers import bound, compile_param_list
from scipy.stats import chi2
from configurations.constants import Constants

"""
Based on the pseudo-code in: Qi Li et al., "A Confidence-Aware Approach for Truth Discovery on Long-Tail Data"
Link: https://cse.buffalo.edu/~jing/doc/vldb15_CATD.pdf

INPUT: Continuous data
PARAMS: 
        - iterations: # of repetitions of estimating the answers and weighting the workers by the estimations.
        - alpha: significance level of the confidence interval; defaults to 0.05, the most commonly used value (i.e. 95% coverage)
"""


def CATD(data_params, params):
    """Aggregate continuous answers with confidence-aware (CATD) worker weights.

    The aggregate starts as the unweighted column average of the data. Each
    iteration then:
      1. computes, per row, the sum of squared residuals (SSR) against the
         current aggregate,
      2. passes the SSR through ``bound(Constants.fault_min,
         Constants.fault_max, ssr)`` (presumably a clamp that keeps the
         divisor away from zero -- confirm against ``utils.helpers.bound``),
      3. sets each row's weight to ``chi2.ppf(alpha / 2, m) / ssr`` and
         normalises the weights to sum to 1,
      4. recomputes the aggregate as the weighted column average.

    Args:
        data_params: mapping whose ``'df'`` entry is a 2-D table exposing
            ``.shape`` and ``.values`` (presumably a pandas DataFrame). Rows
            are the weighted units (workers); columns are the items being
            estimated. NOTE(review): missing values are not handled here --
            confirm the caller passes a complete table.
        params: mapping with
            - ``"iterations"`` (required): number of re-weighting rounds.
            - ``"alpha"`` (optional, default 0.05): the lower ``alpha / 2``
              chi-squared quantile with ``m`` degrees of freedom is used as
              the weight numerator.
            - ``"name"`` (optional, default ``'CATD'``): label echoed back in
              the result.

    Returns:
        dict with keys
            - ``'af_name'``: the label described above.
            - ``'af_params'``: the result of ``compile_param_list(params)``.
            - ``'outcome'``: 1-D array of length ``m`` with the aggregated
              answers.
            - ``'weights'``: the normalised per-row weights from the last
              iteration, or uniform ``1 / n`` weights when ``iterations`` is 0.

    Raises:
        KeyError: if ``data_params`` lacks ``'df'`` or ``params`` lacks
            ``"iterations"``.
    """
    name = params.get("name", 'CATD')
    alpha = params.get("alpha", 0.05)
    iterations = params["iterations"]
    n, m = data_params['df'].shape
    df_values = data_params['df'].values

    # Start from the plain column average. It is kept 1-D so 'outcome' has the
    # same shape no matter how many iterations run; broadcasting against the
    # (n, m) data is unaffected.
    uniform = np.ones(n)
    agg_answers = np.average(df_values, axis=0, weights=uniform)

    # Normalised uniform weights so that 'weights' is defined (and sums to 1,
    # like the per-iteration weights) even when the loop body never executes.
    worker_weights = uniform / n

    # Loop-invariant: the quantile depends only on alpha and the column count.
    chi2_quantile = chi2.ppf(alpha / 2, m)

    for _ in range(iterations):
        # Per-row sum of squared residuals against the current aggregate.
        ssr = ((df_values - agg_answers) ** 2).sum(axis=1)
        ssr = bound(Constants.fault_min, Constants.fault_max, ssr)

        # A smaller residual yields a larger weight; normalise to sum to 1.
        worker_weights = chi2_quantile / ssr
        worker_weights = worker_weights / worker_weights.sum()

        agg_answers = np.average(df_values, axis=0, weights=worker_weights.reshape(-1))

    return {
        'af_name': name,
        'af_params': compile_param_list(params),
        'outcome': agg_answers,
        'weights': worker_weights,
    }
