# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import os
from collections import Counter
from collections import OrderedDict

import torch

import src.utils as utils


class Vocab(object):
    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
                 delimiter=None, vocab_file=None):
        print("! create vocab")
        print(special, min_freq, max_size, lower_case, delimiter, vocab_file)
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        line = line.strip()
        # convert to lower case
        if self.lower_case:
            line = line.lower()

        # empty delimiter '' will evaluate False
        if self.delimiter == '':
            symbols = line
        else:
            symbols = line.split(self.delimiter)

        if add_double_eos:  # lm1b
            return ['<S>'] + symbols + ['<S>']
        elif add_eos:
            return symbols + ['<eos>']
        else:
            return symbols

    def count_file(self, path, verbose=False, add_eos=False):
        if verbose:
            print('counting file {} ...'.format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose:
            print('counting {} sents ...'.format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, 'r', encoding='utf-8') as f:
            for line in f:
                symb = line.strip().split()[0]
                self.add_symbol(symb)
        self.unk_idx = self.sym2idx['<UNK>']

    def build_vocab(self):
        if self.vocab_file:
            print('building vocab from {}'.format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            print('final vocab size {}'.format(len(self)))
        else:
            print('building vocab with min_freq={}, max_size={}'.format(
                self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            for sym in self.special:
                self.add_special(sym)

            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq:
                    break
                self.add_symbol(sym)

            print('final vocab size {} from {} unique tokens'.format(
                len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
                    add_double_eos=False):
        if verbose:
            print('encoding file {} ...'.format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    print('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos,
                                        add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        if verbose:
            print('encoding {} sents ...'.format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                print('    line {}'.format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])

    def add_symbol(self, sym):
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def get_sym(self, idx):
        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
        return self.idx2sym[idx]

    def get_idx(self, sym):
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        else:
            # print('encounter unk {}'.format(sym))
            assert '<eos>' not in sym
            assert hasattr(self, 'unk_idx')
            return self.sym2idx.get(sym, self.unk_idx)

    def get_symbols(self, indices):
        return [self.get_sym(idx) for idx in indices]

    def get_indices(self, symbols):
        return [self.get_idx(sym) for sym in symbols]

    def convert_to_tensor(self, symbols):
        return torch.LongTensor(self.get_indices(symbols))

    def convert_to_sent(self, indices, exclude=None):
        if exclude is None:
            return ' '.join([self.get_sym(idx) for idx in indices])
        else:
            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])

    def __len__(self):
        return len(self.idx2sym)


# Class OpenAIVocab has been adapted from
# https://github.com/cybertronai/transformer-xl/blob/master/utils/vocabulary.py
class OpenAIVocab(Vocab):
    def __init__(self, max_size=None, vocab_file=None):
        from transformers import GPT2Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.EOT = self.tokenizer.encoder['<|endoftext|>']
        self.max_size = max_size
        self.vocab_file = vocab_file

        pad = 8
        vocab_size = len(self.tokenizer)
        padded_vocab_size = (vocab_size + pad - 1) // pad * pad
        for i in range(0, padded_vocab_size - vocab_size):
            token = f'madeupword{i:09d}'
            self.tokenizer.add_tokens([token])

    def __len__(self):
        return len(self.tokenizer)

    def count_file(self, path, verbose=False, add_eos=False):
        # TODO: train from scratch, respect self.max_size
        pass

    def build_vocab(self):
        pass

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False) -> torch.LongTensor:
        cached = path + '.bpe'
        if os.path.exists(cached):
            return torch.load(cached)
        print(f'encoding file {path} ...')
        assert os.path.exists(path), f"{path} doesn't exist"

        with open(path, encoding='utf-8') as f:
            # Suppress warnings about length.
            with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
                out = torch.LongTensor(self.tokenizer.encode(f.read()) + [self.EOT])
                with utils.distributed.sync_workers() as rank:
                    if rank == 0:
                        torch.save(out, cached)
                return out

    def tokenize(self, line, add_eos=False, add_double_eos=False):
        return self.tokenizer.encode(line)

    def convert_to_tensor(self, symbols):
        return torch.LongTensor(symbols)

from transformers import BertTokenizer
import torch


class BERT_tokenizer(object):
    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True, 
                 delimiter=None, vocab_file=None):
        print("Loading pre-trained BERT tokenizer")
        print("! create BERT_tokenizer")
        print(special, min_freq, max_size, lower_case, delimiter, vocab_file)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=lower_case, cache_dir='./cache', force_download=True)
        # These attributes are included for compatibility but might not be used in the context of BERT tokenizer
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        print("Loading pre-trained BERT tokenizer done, size=", len(self.tokenizer))
    def tokenize(self, line, add_eos=False, add_double_eos=False):
        tokens = self.tokenizer.tokenize(line)
        if add_double_eos:
            return ['<S>'] + tokens + ['<S>']
        elif add_eos:
            return tokens + ['<eos>']
        else:
            return tokens

    def count_file(self, path, verbose=False, add_eos=False):
        raise NotImplementedError("BERT tokenizer doesn't require counting from files. Please use another method.")

    def count_sents(self, sents, verbose=False):
        raise NotImplementedError("BERT tokenizer doesn't require counting from sentences. Please use another method.")

    def _build_from_file(self, vocab_file):
        raise NotImplementedError("BERT tokenizer comes with a pre-built vocabulary.")

    def build_vocab(self):
        raise NotImplementedError("BERT tokenizer comes with a pre-built vocabulary.")

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        encoded = [self.convert_to_tensor(self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)) for line in lines]
        if ordered:
            return torch.cat(encoded)
        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        encoded = [self.convert_to_tensor(self.tokenize(sent)) for sent in sents]
        if ordered:
            return torch.cat(encoded)
        return encoded

    def add_special(self, sym):
        raise NotImplementedError("BERT tokenizer has its own set of special tokens. Adding new is not recommended.")

    def add_symbol(self, sym):
        raise NotImplementedError("BERT tokenizer has a pre-built vocabulary. Adding new symbols is not recommended.")

    def get_sym(self, idx):
        return self.tokenizer.convert_ids_to_tokens(idx)

    def get_idx(self, sym):
        return self.tokenizer.convert_tokens_to_ids(sym)

    def get_symbols(self, indices):
        return [self.get_sym(idx) for idx in indices]

    def get_indices(self, symbols):
        return [self.get_idx(sym) for sym in symbols]

    def convert_to_tensor(self, symbols):
        return torch.tensor(self.tokenizer.convert_tokens_to_ids(symbols))

    def convert_to_sent(self, indices, exclude=None):
        tokens = self.get_symbols(indices)
        return ' '.join(tokens).replace(' ##', '')

    def __len__(self):
        return len(self.tokenizer)