

# Special tokens. The WordLevel model is configured to emit UNK_TOKEN for any
# character it has no id for, so UNK_TOKEN must itself be placed in the
# vocabulary by the trainer; previously only "[PAD]" was registered.
# PAD_TOKEN is listed first so that it remains the first special token.
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
SPECIAL_TOKENS = [PAD_TOKEN, UNK_TOKEN]

# Path the trained tokenizer is written to and later reloaded from.
TOKENIZER_FILE = "unicode_char_tokenizer_ipa.json"

# Hand-picked symbols: apostrophe, Latin letters, accented letters, IPA
# letters, modifier letters and combining diacritics.
BASE_SYMBOLS = ["'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'á', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'ó', 'õ', 'ö', 'ø', 'ü', 'ġ', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɒ', 'ɔ', 'ɕ', 'ɘ', 'ə', 'ɛ', 'ɜ', 'ɝ', 'ɡ', 'ɦ', 'ɪ', 'ɱ', 'ɲ', 'ɸ', 'ɹ', 'ɾ', 'ʀ', 'ʁ', 'ʃ', 'ʊ', 'ʋ', 'ʌ', 'ʎ', 'ʏ', 'ʒ', 'ʔ', 'ʰ', 'ʲ', 'ʼ', 'ˀ', 'ː', 'ˑ', '̃', '̆', '̊', '̍', '̝', '̞', '̥', '̩', '̪', '̯', '͜', '͡', 'θ', 'χ', '‿']

# Additional bracket-like / marker symbols added to the vocabulary.
EXTRA_SYMBOLS = '˶˵˂˃«»‴‷⁽⁾˹˺'


def build_vocab():
    """Return the list of strings the tokenizer is trained on.

    The list is the concatenation, in this order, of:

    * ``BASE_SYMBOLS`` and the ``str.upper()`` form of each of them,
    * ``"ß"``,
    * every code point from 9 to 127 inclusive,
    * each character of ``EXTRA_SYMBOLS``,
    * every code point from U+0250 to U+02AF inclusive (IPA symbols),
    * each character of ``EXTRA_SYMBOLS`` a second time.

    Entries are intentionally not de-duplicated; the list is returned
    exactly as assembled.
    """
    vocab = list(BASE_SYMBOLS)
    vocab += [symbol.upper() for symbol in BASE_SYMBOLS]
    vocab.append("ß")
    vocab += [chr(code) for code in range(9, 128)]
    vocab += list(EXTRA_SYMBOLS)
    # Unicode characters for IPA symbols (592..687 decimal).
    vocab += [chr(code) for code in range(0x250, 0x2AF + 1)]
    # NOTE(review): EXTRA_SYMBOLS is appended a second time, exactly as the
    # original script did. It introduces no new symbols but does raise their
    # occurrence counts in the training data; confirm whether that is
    # intended before removing it, since it may influence id assignment.
    vocab += list(EXTRA_SYMBOLS)
    return vocab


def main():
    """Train, save and smoke-test the character-level IPA tokenizer.

    Trains a WordLevel tokenizer on the strings from ``build_vocab()``,
    writes it to ``TOKENIZER_FILE``, prints a sample encoding, then reloads
    the file through ``transformers.PreTrainedTokenizerFast`` and prints the
    encoding of a sample sentence.
    """
    # Third-party imports stay local so importing this module does not
    # require `tokenizers` / `transformers` to be installed.
    import tokenizers

    from tokenizers.models import WordLevel
    from tokenizers.trainers import WordLevelTrainer

    tokenizer = tokenizers.Tokenizer(WordLevel(unk_token=UNK_TOKEN))

    # Split the input at every match of "." and keep each match as its own
    # piece ("isolated"), so tokens are individual characters.
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Split(tokenizers.Regex("."), "isolated")

    vocab = build_vocab()
    print(vocab)

    # Register both special tokens so the unk_token configured on the model
    # actually exists in the trained vocabulary.
    tokenizer.train_from_iterator(vocab, WordLevelTrainer(special_tokens=SPECIAL_TOKENS))

    tokenizer.save(TOKENIZER_FILE)

    x = tokenizer.encode("baß")
    print(x.tokens)
    print(x.ids)

    from transformers import PreTrainedTokenizerFast

    # Tell the wrapper which vocabulary entries are the unknown and padding
    # tokens; it is not given this information otherwise.
    tok = PreTrainedTokenizerFast(
        tokenizer_file=TOKENIZER_FILE,
        unk_token=UNK_TOKEN,
        pad_token=PAD_TOKEN,
    )

    print(tok("This is a long text"))


if __name__ == "__main__":
    main()
