/* 
   Copyright (c) 2008 - Chris Buckley. 

   Permission is granted for use and modification of this file for
   research, non-commercial purposes. 
*/
#include "common.h"
#include "sysfunc.h"
#include "trec_eval.h"
#include "trec_format.h"
#include "functions.h"
/* Takes the top docs and judged docs for a query, and returns a
   rel_rank object giving the ordered relevance values for retrieved
   docs, plus relevance occurrence statistics, for every judgment
   group in the query.
   Relevance value is
       value in text_qrels if docno is in text_qrels and was judged
           (assumed to be a small non-negative integer)
       RELVALUE_NONPOOL (-1) if docno is not in text_qrels
       RELVALUE_UNJUDGED (-2) if docno is in text_qrels and was not judged.

   This procedure may be called repeatedly for a given topic - returned
   values are cached until the query changes.

   results and rel_info formats must be "trec_results" and "qrels_jg"
   respectively.  

   UNDEF returned if error, 0 if used cache values, 1 if new values.
*/

static int comp_rank_judged (), comp_sim_docno (), comp_docno ();

/* Definitions used for temporary and cached values */
typedef struct {
    char *docno;
    float sim;
    long rank;
    long rel;
} DOCNO_INFO;

/* Current cached query */
static char *current_query = "no query";
static long max_current_query = 0;

/* Space reserved for cached returned values */
static long *rel_levels;
static long max_rel_levels = 0;
static RES_RELS *jgs;
static long num_jgs;
static long max_num_jgs = 0;
static long *ranked_rel_list;
static long max_ranked_rel_list = 0;
static DOCNO_INFO *docno_info;
static long max_docno_info = 0;

int
te_form_res_rels_jg (const EPI *epi, const REL_INFO *rel_info,
		     const RESULTS *results, RES_RELS_JG *res_rels)
{
    long i;
    long num_results;
    long jg;

    TEXT_RESULTS_INFO *text_results_info;
    TEXT_QRELS_JG_INFO *trec_qrels;

    TEXT_QRELS *qrels_ptr, *end_qrels;
    long max_rel;

    long *rel_level_ptr;

    if (0 == strcmp (current_query, results->qid)) {
	/* Have done this query already. Return cached values */
	res_rels->qid = results->qid;
	res_rels->num_jgs = num_jgs;
	res_rels->jgs = jgs;
	return (0);
    }

    /* Check that format type of result info and rel info are correct */
    if (strcmp ("qrels_jg", rel_info->rel_format) ||
	strcmp ("trec_results", results->ret_format)) {
	fprintf (stderr, "trec_eval: rel_info format not qrels_jg or results format not trec_results\n");
	return (UNDEF);
    }

    /* Make sure enough space for query and save copy */
    i = strlen(results->qid)+1;
    if (NULL == (current_query =
		 te_chk_and_malloc (current_query, &max_current_query,
				    i, sizeof (char))))
	return (UNDEF);
    (void) strncpy (current_query, results->qid, i);

    text_results_info = (TEXT_RESULTS_INFO *) results->q_results;
    trec_qrels = (TEXT_QRELS_JG_INFO *) rel_info->q_rel_info;

    num_results = text_results_info->num_text_results;
    num_jgs = trec_qrels->num_text_qrels_jg;

    /* Check and reserve space for output structure */
    /* Need an output array of size num_results for each of the jg in queries */
    /* Reserve space for temp structure copying results */
    if (NULL == (ranked_rel_list =
		 te_chk_and_malloc (ranked_rel_list, &max_ranked_rel_list,
				    num_results * num_jgs, sizeof (long))) ||
	NULL == (jgs = te_chk_and_malloc (jgs, &max_num_jgs,
					   num_jgs, sizeof (RES_RELS))) ||
        NULL == (docno_info =
		 te_chk_and_malloc (docno_info, &max_docno_info,
				    num_results, sizeof (DOCNO_INFO))))
	return (UNDEF);

    for (i = 0; i < num_results; i++) {
	docno_info[i].docno = text_results_info->text_results[i].docno;
	docno_info[i].sim = text_results_info->text_results[i].sim;
    }

    /* Sort results by sim, breaking ties lexicographically using docno */
    qsort ((char *) docno_info,
	   (int) num_results,
	   sizeof (DOCNO_INFO),
	   comp_sim_docno);

    /* Only look at epi->max_num_docs_per_topic (not normally an issue) */
    if (num_results > epi->max_num_docs_per_topic)
	num_results = epi->max_num_docs_per_topic;

    /* Add ranks to docno_info (starting at 1) */
    for (i = 0; i < num_results; i++) {
        docno_info[i].rank = i+1;
    }

    /* Sort trec_top lexicographically */
    qsort ((char *) docno_info,
           (int) num_results,
           sizeof (DOCNO_INFO),
           comp_docno);

    /* Error checking for duplicates */
    for (i = 1; i < num_results; i++) {
	if (0 == strcmp (docno_info[i].docno,
			 docno_info[i-1].docno)) {
	    fprintf (stderr, "trec_eval.form_res_qrels: duplicate docs %s",
		     docno_info[i].docno);
	    return (UNDEF);
	}
    }

    /* Find max_rel among all JGs, reserve and zero space for rel_levels */
    max_rel = 0;
    for (jg = 0; jg < trec_qrels->num_text_qrels_jg; jg++) {
	qrels_ptr = trec_qrels->text_qrels_jg[jg].text_qrels;
	end_qrels = &trec_qrels->text_qrels_jg[jg].text_qrels
	    [trec_qrels->text_qrels_jg[jg].num_text_qrels];
	while (qrels_ptr < end_qrels) {
	    if (max_rel < qrels_ptr->rel)
		max_rel = qrels_ptr->rel;
	    qrels_ptr++;
	}
    }
    if (NULL == (rel_levels =
		 te_chk_and_malloc (rel_levels, &max_rel_levels,
				    (max_rel+1) * num_jgs,
				    sizeof (long))))
	return (UNDEF);
    (void) memset (rel_levels, 0, (max_rel+1) * num_jgs * sizeof (long));

    /* Construct rank_rel array and rel_levels separately for each JG */
    for (jg = 0; jg < trec_qrels->num_text_qrels_jg; jg++) {
	/* Go through docno_info, trec_qrels in parallel to determine relevance
	   for each doc in docno_info.
	   Note that trec_qrels already sorted by docno with no duplicates */
	rel_level_ptr = &rel_levels[(max_rel+1) * jg];
	qrels_ptr = trec_qrels->text_qrels_jg[jg].text_qrels;
	end_qrels = &trec_qrels->text_qrels_jg[jg].text_qrels
	    [trec_qrels->text_qrels_jg[jg].num_text_qrels];
	for (i = 0; i < num_results; i++) {
	    while (qrels_ptr < end_qrels &&
		   strcmp (qrels_ptr->docno, docno_info[i].docno) < 0) {
		if (qrels_ptr->rel >= 0)
		    rel_level_ptr[qrels_ptr->rel]++;
		qrels_ptr++;
	    }
	    if (qrels_ptr >= end_qrels ||
		strcmp (qrels_ptr->docno, docno_info[i].docno) > 0) {
		/* Doc is non-judged */
		docno_info[i].rel = RELVALUE_NONPOOL;
	    }
	    else {
		/* Doc is in pool, assign relevance */
		if (qrels_ptr->rel < 0)
		    /* In pool, but unjudged (eg, infAP uses a sample of pool)*/
		    docno_info[i].rel = RELVALUE_UNJUDGED;
		else {
		    docno_info[i].rel = qrels_ptr->rel;
		}
		if (qrels_ptr->rel >= 0)
		    rel_level_ptr[qrels_ptr->rel]++;
		qrels_ptr++;
	    }
	}
	/* Finish counting rels */
	while (qrels_ptr < end_qrels) {
	    if (qrels_ptr->rel >= 0)
		rel_level_ptr[qrels_ptr->rel]++;
	    qrels_ptr++;
	}
	
	/* Construct ranked_rel_list and associated counts */
	jgs[jg].num_rel_ret = 0;
	jgs[jg].num_nonpool = 0;
	jgs[jg].num_unjudged_in_pool = 0;
	jgs[jg].results_rel_list = &ranked_rel_list[jg * num_results];
	jgs[jg].rel_levels = &rel_levels[jg * (max_rel + 1)];
	if (epi->judged_docs_only_flag) {
	    /* If judged_docs_only_flag, then must fix up ranks to
	       reflect unjudged docs being thrown out. Note: done this way
	       to preserve original tie-breaking based on text docno */
	    long rrl;
	    /* Sort tuples by increasing rank among judged docs*/
	    qsort ((char *) docno_info,
		   (int) num_results,
		   sizeof (DOCNO_INFO),
		   comp_rank_judged);
	    rrl = 0; i = 0;
	    while (i < num_results && docno_info[i].rel >= 0) {
		if (docno_info[i].rel >= epi->relevance_level)
		    jgs[jg].num_rel_ret++;
		jgs[jg].results_rel_list[rrl++] = docno_info[i++].rel;
	    }
	    jgs[jg].num_ret = rrl;
	    /* resort by docno for next jg */
	    if (jg != num_jgs-1) 
		qsort ((char *) docno_info,
		       (int) num_results,
		       sizeof (DOCNO_INFO),
		       comp_docno);
	}
	else {
	    /* Normal path.  Assign rel value to appropriate rank */
	    for (i = 0; i < num_results; i++) {
		jgs[jg].results_rel_list[docno_info[i].rank - 1] =
		    docno_info[i].rel;
		if (RELVALUE_NONPOOL == docno_info[i].rel)
		    jgs[jg].num_nonpool++;
		else if (RELVALUE_UNJUDGED == docno_info[i].rel)
		    jgs[jg].num_unjudged_in_pool++;
		else {
		    if (docno_info[i].rel >= epi->relevance_level)
			jgs[jg].num_rel_ret++;
		}
	    }
	    jgs[jg].num_ret = num_results;
	}
	jgs[jg].num_rel = 0;
	for (i = 0; i <= max_rel; i++) {
	    if (jgs[jg].rel_levels[i]) {
		jgs[jg].num_rel_levels = i + 1;
		if (i >= epi->relevance_level)
		    jgs[jg].num_rel += jgs[jg].rel_levels[i];
	    }
	}
    }

    res_rels->qid = results->qid;
    res_rels->num_jgs = num_jgs;
    res_rels->jgs = jgs;

    return (1);
}

static int 
comp_rank_judged (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
    if (ptr1->rel >= 0 && ptr2->rel >= 0) {
	if (ptr1->rank < ptr2->rank)
	    return (-1);
	if (ptr1->rank > ptr2->rank)
	    return (1);
	return (0);
    }
    if (ptr1->rel >= 0)
	return (-1);
    if (ptr2->rel >= 0)
	return (1);
    return(0);
}

static int 
comp_sim_docno (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
    if (ptr1->sim > ptr2->sim)
        return (-1);
    if (ptr1->sim < ptr2->sim)
        return (1);
    return (strcmp (ptr2->docno, ptr1->docno));
}

static int 
comp_docno (ptr1, ptr2)
DOCNO_INFO *ptr1;
DOCNO_INFO *ptr2;
{
    return (strcmp (ptr1->docno, ptr2->docno));
}

int 
te_form_res_rels_jg_cleanup ()
{
    if (max_current_query > 0) {
	Free (current_query);
	max_current_query = 0;
	current_query = "no_query";
    }
    if (max_rel_levels > 0) {
	Free (rel_levels);
	max_rel_levels = 0;
    }
    if (max_num_jgs > 0) {
	Free (jgs);
	max_num_jgs = 0;
	num_jgs = 0;
    }
    if (max_ranked_rel_list > 0) {
	Free (ranked_rel_list);
	max_ranked_rel_list = 0;
    }
    if (max_docno_info > 0) {
	Free (docno_info);
	max_docno_info = 0;
    }
    return (1);
}
