# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from language_utils import tokenize, token_to_id

class InputExample(object):
    """Plain record that keeps the fields of one example as attributes."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the given values on the instance without modification.

        Args:
            guid: Identifier for the example.
            text_a: First text field.
            text_b: Optional second text field; defaults to None.
            label: Optional label; defaults to None.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

class InputFeatures(object):
    """Plain record that keeps the model-ready fields of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, tokens):
        """Store the given values on the instance without copying them.

        Args:
            input_ids: Id sequence for the example.
            input_mask: Mask sequence accompanying `input_ids`.
            segment_ids: Segment id sequence accompanying `input_ids`.
            label_id: Label associated with the example.
            tokens: Token sequence the ids were derived from.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.tokens = tokens

def convert_examples_to_features(examples, label_list, max_seq_length,
                                vocab, drop_unk=False):
    """Convert examples into a list of zero-padded `InputFeatures`.

    Each example is tokenized via `tokenize`, wrapped as
    "[CLS]" ... "[SEP]", mapped to ids via `token_to_id`, and right-padded
    with zeros. The padded length is the smaller of `max_seq_length` and the
    longest wrapped token sequence produced in this call, so every returned
    feature has the same length.

    Args:
        examples: Iterable of examples, passed as-is to `tokenize`. Each
            example must support `example["label"]`.
        label_list: Not used by this function; kept for interface
            compatibility with existing callers.
        max_seq_length: Upper bound on the padded sequence length.
            `tokenize` receives `max_seq_length - 2`, presumably so the two
            special tokens still fit -- confirm against `tokenize`.
        vocab: Vocabulary object forwarded to `tokenize` and `token_to_id`.
        drop_unk: Forwarded to `tokenize` as the `drop_unk` keyword.

    Returns:
        A list with one `InputFeatures` per example, in input order. The
        list is empty when there are no examples.
    """
    # Surround every token sequence with the start/end marker tokens.
    all_tokens = [
        ["[CLS]"] + tokens + ["[SEP]"]
        for tokens in tokenize(examples, vocab, max_seq_length - 2,
                               drop_unk=drop_unk)
    ]
    all_ids = token_to_id(all_tokens, vocab)

    # Pad only up to the longest sequence actually present. max() raises
    # ValueError on an empty sequence, so fall back to 0 when nothing was
    # tokenized; the loop below then simply yields no features.
    longest = max(len(tokens) for tokens in all_tokens) if all_tokens else 0
    max_seq_length = min(max_seq_length, longest)

    features = []
    for ex_index, example in enumerate(examples):
        tokens = all_tokens[ex_index]
        ids = all_ids[ex_index]
        padding = [0] * (max_seq_length - len(ids))

        input_ids = ids + padding
        # 1 marks a real token position, 0 marks padding.
        input_mask = [1] * len(ids) + padding
        # Every position gets segment id 0.
        segment_ids = [0] * len(tokens) + padding

        # Internal invariants: these fail if a sequence is longer than
        # max_seq_length or if tokens and ids disagree in length.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        features.append(InputFeatures(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            label_id=example["label"],
            tokens=tokens))

    return features
