import os
import re
import json
from optparse import OptionParser
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm import tqdm


def main():
    """Convert the annotated-sentence TSV into JSON-lines classification data.

    Reads ``annotations/all_sentences.tsv`` (tab-separated, header row). The
    first column is treated as the quotation text and every other column as
    one value annotation, where a cell equal to ``e`` or ``i`` (after
    stripping whitespace) counts as a positive label and anything else as
    negative.

    Rows whose stripped quotation is empty, or whose stripped quotation occurs
    more than once in the file, are excluded entirely.

    Writes two files under ``data/classification``:

    * ``all.jsonlist`` -- one JSON object per kept row with ``id``, ``text``,
      a 0/1 label per value column, and a 0/1 label per meta value (1 when
      any of its member columns is 1).
    * ``value_counts.json`` -- number of kept rows labelled positive for each
      value and meta value.

    Raises:
        FileNotFoundError: if the input TSV does not exist.
        KeyError: if a column listed as a member of a meta value is missing
            from the TSV.
    """
    usage = "%prog"
    parser = OptionParser(usage=usage)
    # No options are defined; parsing still provides --help and rejects
    # unknown flags.
    parser.parse_args()

    print("Loading annotated setnences")
    # Read every cell as literal text. Without keep_default_na=False pandas
    # turns empty cells into float NaN even under dtype=str, and the .strip()
    # calls below would raise AttributeError. fillna('') covers any cell that
    # is still missing after parsing.
    df = pd.read_csv(os.path.join('annotations', 'all_sentences.tsv'), sep='\t', header=0, index_col=None,
                     dtype=str, keep_default_na=False).fillna('')
    columns = list(df.columns)
    columns[0] = 'Quotation'
    df.columns = columns

    quotes = df['Quotation'].values
    values = df.columns[1:]

    # count quotes to exclude a few duplicates: a row is kept only when its
    # stripped text is non-empty and appears exactly once in the file
    stripped_quotes = [quote.strip() for quote in quotes]
    quote_counter = Counter(stripped_quotes)
    indices = [i for i, q in enumerate(stripped_quotes) if q and quote_counter[q] == 1]

    # Convert explicit ('e') and implicit ('i') annotations to binary labels
    # once per column, and count the positives among the kept rows.
    value_counter = Counter()
    clean_names = {}
    labels_by_column = {}
    for value in values:
        labels = [1 if e.strip() in ('e', 'i') else 0 for e in df[value].values]
        labels_by_column[value] = labels
        clean_names[value] = simplify_name(value)
        value_counter[clean_names[value]] = sum(labels[i] for i in indices)

    meta_values = {simplify_name('Performance Values'): ['Performance', 'Accuracy', 'State-of-the-art'],
                   simplify_name('Building On Past Work'): ['Building on recent work', 'Building on classic work'],
                   simplify_name('Generalization Values'): ['Generalization', 'Flexibility/Extensibility', 'Avoiding train/test discrepancy'],
                   simplify_name('Efficiency Values'): ['Efficiency', 'Low cost', 'Data efficiency', 'Label efficiency (reduced need for labeled data)', 'Fast', 'Reduced training time', 'Memory efficiency']
                   }
    # Resolve member columns up front so a missing column fails before any
    # row is processed rather than part-way through the loop.
    meta_labels = {name: [labels_by_column[member] for member in members]
                   for name, members in meta_values.items()}

    # output the values for each line as a json object
    outlines = []
    for i in tqdm(indices):
        # the id keeps the row's position in the original file, so ids are
        # stable even though excluded rows leave gaps
        outline = {'id': 'line' + str(i).zfill(5), 'text': quotes[i]}
        for value in values:
            outline[clean_names[value]] = labels_by_column[value][i]
        # NOTE(review): assumes no raw column simplifies to the same name as a
        # meta value; if one did, its label and count would be merged -- confirm.
        for name, member_labels in meta_labels.items():
            # a meta value is positive when any member column is positive
            outline[name] = max(labels[i] for labels in member_labels)
            if outline[name] > 0:
                value_counter[name] += 1
        outlines.append(outline)

    outdir = os.path.join('data', 'classification')
    os.makedirs(outdir, exist_ok=True)
    with open(os.path.join(outdir, 'all.jsonlist'), 'w') as f:
        for line in outlines:
            f.write(json.dumps(line) + '\n')

    # Save the value counts
    with open(os.path.join(outdir, 'value_counts.json'), 'w') as f:
        json.dump(value_counter, f, indent=2)


def simplify_name(value):
    """Return *value* rewritten as a flat key.

    Every ``/`` becomes ``_slash_`` and every whitespace character becomes a
    single ``_`` (runs of whitespace are not collapsed).
    """
    # Slashes are replaced first; the replacement text contains no whitespace,
    # so the second substitution only touches whitespace from the input.
    return re.sub(r'\s', '_', re.sub(r'/', '_slash_', value))


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
