import random

import rdflib as rdf

import encoding_schemes


def parse_as_nt(file):
    """Load an N-Triples source and return its triples as plain strings.

    The source is parsed into an rdflib ``Graph`` with ``format='nt'``; every
    (subject, predicate, object) term of each triple is then converted with
    ``str()``.

    Args:
        file: Source handed unchanged to ``Graph.parse``.

    Returns:
        A list of ``(subject, predicate, object)`` tuples of ``str``, in the
        order the graph yields them when iterated.
    """
    nt_graph = rdf.Graph()
    nt_graph.parse(file, format='nt')
    return [
        (str(subj), str(pred), str(obj))
        for subj, pred, obj in nt_graph
    ]


def parse_as_tsv(file):
    """Read whitespace-separated triples from a text file.

    Each non-blank line is split on any run of whitespace (not only tabs), so
    every field must itself be free of whitespace. When the middle field is
    one of ``encoding_schemes.type_pred_labels`` it is replaced by
    ``encoding_schemes.default_type_pred``, so all type assertions share a
    single predicate label.

    Args:
        file: Path of the file to read.

    Returns:
        A list of ``(ent1, ent2, ent3)`` string tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    dataset = []
    # The context manager guarantees the handle is closed even if a malformed
    # line raises part-way through the file.
    with open(file, "r") as inputfile:
        for line in inputfile:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines (e.g. a trailing newline)
                # carry no triple; skip them rather than fail the unpack.
                continue
            ent1, ent2, ent3 = fields
            if ent2 in encoding_schemes.type_pred_labels:
                ent2 = encoding_schemes.default_type_pred
            dataset.append((ent1, ent2, ent3))
    return dataset


def parse_from_full_train_file(file, graph_target_split=0.1):
    """Read a file of triples and randomly split it into input and target facts.

    Every non-blank line must hold exactly three whitespace-separated fields.
    A line whose middle field is one of ``encoding_schemes.type_pred_labels``
    is a type assertion: its third field is recorded as a unary predicate and
    the fact is stored with ``encoding_schemes.default_type_pred`` as its
    predicate. Any other line is stored unchanged and its middle field is
    recorded as a binary predicate.

    The facts are shuffled with the module-level ``random`` generator, so the
    split differs between runs unless the caller seeds ``random`` beforehand.

    Args:
        file: Path of the file to read.
        graph_target_split: Fraction of the facts to return as targets; the
            rest form the input graph. Defaults to 0.1 (10% targets, 90%
            input). The target count is ``int(len(facts) * fraction)``, i.e.
            truncated.

    Returns:
        A tuple ``(input_graph, targets, binary_predicate_list,
        unary_predicate_list)``. The first two are lists of 3-tuples of
        strings; the last two are alphabetically sorted lists of the distinct
        predicates seen.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    dataset = []
    binary_predicates = set()
    unary_predicates = set()

    # The context manager guarantees the handle is closed even if a malformed
    # line raises part-way through the file.
    with open(file, "r") as inputfile:
        for line in inputfile:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines carry no fact; skip them
                # rather than fail the unpack below.
                continue
            ent1, ent2, ent3 = fields
            if ent2 in encoding_schemes.type_pred_labels:
                # Type assertions are unary facts: the class (third field) is
                # the predicate. Per the original author's note, storing the
                # default type predicate flags the encoder to make the fact
                # unary.
                unary_predicates.add(ent3)
                dataset.append((ent1, encoding_schemes.default_type_pred, ent3))
            else:
                binary_predicates.add(ent2)
                dataset.append((ent1, ent2, ent3))

    random.shuffle(dataset)
    split_index = int(len(dataset) * graph_target_split)
    targets = dataset[:split_index]
    input_graph = dataset[split_index:]

    # Sorting fixes the predicate order as alphabetical, independent of set
    # iteration order.
    binary_predicate_list = sorted(binary_predicates)
    unary_predicate_list = sorted(unary_predicates)

    return input_graph, targets, binary_predicate_list, unary_predicate_list


def parse(file):
    """Parse a dataset file, choosing the parser from the file extension.

    ``.nt`` files go to ``parse_as_nt``; ``.tsv`` and ``.txt`` files go to
    ``parse_as_tsv``.

    Args:
        file: Path of the dataset file (a string; only its suffix is
            inspected here).

    Returns:
        Whatever the selected parser returns.

    Raises:
        AssertionError: If the extension is not one of the supported ones.
    """
    if file.endswith('.nt'):
        return parse_as_nt(file)
    if file.endswith(('.tsv', '.txt')):
        return parse_as_tsv(file)
    # Raised explicitly rather than via ``assert False`` so the check is not
    # stripped under ``python -O`` (which would make this silently return
    # None). The exception type and message are kept for existing callers.
    raise AssertionError(
        "Error, data format not supported. Use .nt or .tsv (or .txt, interpreted as a .tsv file)"
    )
