#!/usr/bin/env python3
# coding=utf-8
# Authors:
#   2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
#   2019.9 Jiayu DU
#
# requirements:
#   - python 3.X
# notes: python 2.X WILL fail or produce misleading results

import sys, os, argparse, codecs, string, re
from num2words import num2words
# ================================================================================ #
#                          different types of rewriters
# ================================================================================ #
class Cardinal:
    """
    Holder for a cardinal number and its Chinese-text form.

    Attributes:
        cardinal: the numeric form handed to ``num2chn``.
        chntext: the Chinese-text form handed to ``chn2num``.

    NOTE(review): ``chn2num`` and ``num2chn`` are not defined in the visible
    part of this file; both conversion methods rely on them being available
    at module level -- confirm before calling.
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal, self.chntext = cardinal, chntext

    def chntext2cardinal(self):
        """Return ``chn2num`` applied to the stored Chinese text."""
        return chn2num(self.chntext)

    def cardinal2chntext(self):
        """Return ``num2chn`` applied to the stored cardinal."""
        return num2chn(self.cardinal)

class Digit:
    """
    Plain-number rewriter.

    Attributes:
        digit: the number as a string (anything ``float()`` accepts).
        entext: optional English text; stored but not read by this class.
    """

    def __init__(self, digit=None, entext=None):
        self.digit, self.entext = digit, entext

    def digit2entext(self):
        """Return the number spelled out by ``num2words`` plus one trailing space.

        The stored string is parsed with ``float()`` first, so a ValueError is
        raised when it is not a valid float literal.
        """
        value = float(self.digit)
        spoken = num2words(value)
        return spoken + ' '
    
    
class Digit_sort:
    """
    Ordinal rewriter: turns a token such as ``"21st"`` into English words.

    Attributes:
        digit: the ordinal token, digits followed by a letter suffix.
        entext: optional English text; stored but not read by this class.
        dicts: maps ``"<cardinal word> <suffix>"`` to the ordinal word that
            replaces it (e.g. ``'one st' -> 'first'``).
    """

    def __init__(self, digit=None, entext=None):
        self.digit = digit
        self.entext = entext
        self.dicts = {
            'one st': 'first', 'two nd': 'second', 'three rd': 'third',
            'four th': 'fourth', 'five th': 'fifth', 'six th': 'sixth',
            'seven th': 'seventh', 'eight th': 'eighth', 'nine th': 'ninth',
            'ten th': 'tenth', 'eleven th': 'eleventh', 'twelve th': 'twelfth',
            'thirteen th': 'thirteenth', 'fourteen th': 'fourteenth',
            'fifteen th': 'fifteenth', 'sixteen th': 'sixteenth',
            'seventeen th': 'seventeenth', 'eighteen th': 'eighteenth',
            'nineteen th': 'nineteenth', 'twenty th': 'twentieth',
            'thirty th': 'thirtieth', 'forty th': 'fortieth',
            'fifty th': 'fiftieth', 'sixty th': 'sixtieth',
            'seventy th': 'seventieth', 'eighty th': 'eightieth',
            'ninety th': 'ninetieth', 'hundred th': 'hundredth',
            'thousand th': 'thousandth',
        }

    def _merge_suffix(self, words, letters):
        """Fuse cardinal ``words`` with the ordinal suffix ``letters``.

        Only the last word is looked up in ``self.dicts``: a substring match
        would corrupt "ten thousand th" through the 'ten th' entry.  When no
        entry applies, the suffix is appended directly ("one million" + "th").
        """
        entext = words + ' ' + letters
        for key, ordinal in self.dicts.items():
            if not entext.endswith(key):
                continue
            head = entext[:-len(key)]
            # The key must cover a whole trailing word, not the tail of one.
            if not head or head[-1] in ' -':
                return head + ordinal
        return words + letters

    def digit2entext(self):
        """Return the stored ordinal token spelled out in English.

        Raises:
            IndexError: if the token has no ASCII letters or no digits.
        """
        letters = re.findall('[a-zA-Z]+', self.digit)[0]
        numbers = re.findall('[0-9]+', self.digit)[0]
        return self._merge_suffix(num2words(int(numbers)), letters)
        

class Decade:
    """
    Date rewriter for decade tokens such as ``"1990s"`` or ``"50s"``.

    Attributes:
        decade: the token, digits followed by ``s``.
        entext: optional English text; stored but not read by this class.
        dicts: maps a cardinal word to its ordinal form (``'one' -> 'first'``).
    """

    def __init__(self, decade=None, entext=None):
        self.decade = decade
        self.entext = entext
        self.dicts = {
            'one': 'first', 'two': 'second', 'three': 'third',
            'four': 'fourth', 'five': 'fifth', 'six': 'sixth',
            'seven': 'seventh', 'eight': 'eighth', 'nine': 'ninth',
            'ten': 'tenth', 'eleven': 'eleventh', 'twelve': 'twelfth',
            'thirteen': 'thirteenth', 'fourteen': 'fourteenth',
            'fifteen': 'fifteenth', 'sixteen': 'sixteenth',
            'seventeen': 'seventeenth', 'eighteen': 'eighteenth',
            'nineteen': 'nineteenth', 'twenty': 'twentieth',
            'thirty': 'thirtieth', 'forty': 'fortieth', 'fifty': 'fiftieth',
            'sixty': 'sixtieth', 'seventy': 'seventieth',
            'eighty': 'eightieth', 'ninety': 'ninetieth',
        }

    def _ordinalize(self, cardinal_words):
        """Turn the last word of ``cardinal_words`` into its ordinal form.

        Words are separated by ``-`` or a space; a last word missing from
        ``self.dicts`` (e.g. "hundred") simply gets ``th`` appended.
        """
        cut = max(cardinal_words.rfind('-'), cardinal_words.rfind(' ')) + 1
        last = cardinal_words[cut:]
        return cardinal_words[:cut] + self.dicts.get(last, last + 'th')

    @staticmethod
    def _pluralize(cardinal_words):
        """Return the plural: "ninety" -> "nineties", otherwise append ``s``."""
        if cardinal_words.endswith('y'):
            return cardinal_words[:-1] + 'ies'
        return cardinal_words + 's'

    def decade2entext(self):
        """Return the stored decade token spelled out in English.

        Tokens longer than three characters are split into a century part
        (everything before the last three characters, plus one, as an ordinal
        followed by " century") and a decade part (the last three characters).
        A three-character decade part is pluralised unless its digits are
        ``00``.  The two parts are joined with " of ".  When neither part
        produces text, the token is returned unchanged.  ``self.decade`` is
        not modified.
        """
        decade = self.decade
        century_text = ''
        if len(decade) > 3:
            century = decade[:-3]
            decade = decade[-3:]
            century_text = self._ordinalize(num2words(int(century) + 1)) + ' century'

        years_string = ''
        if len(decade) == 3:
            years = decade[:-1]
            if years == '10':
                years_string = 'tens'
            elif years != '00':
                years_string = self._pluralize(num2words(int(years)))

        if years_string and century_text:
            return years_string + ' of ' + century_text
        return years_string or century_text or self.decade
    
    
class Year:
    """
    Date rewriter for year tokens.

    Attributes:
        year: the year as a string of digits.
        entext: optional English text; stored but not read by this class.
    """

    def __init__(self, year=None, entext=None):
        self.year, self.entext = year, entext

    def year2entext(self):
        """Return the stored year spelled out, or unchanged if not handled.

        Only four-character values starting with ``18``, ``19`` or ``20`` are
        converted: the first two and last two characters are each passed to
        ``num2words`` as integers and joined with a hyphen.  Anything else is
        returned as is.
        """
        year = self.year
        if len(year) != 4 or year[:2] not in ['18', '19', '20']:
            return year
        halves = (num2words(int(year[:2])), num2words(int(year[2:])))
        return halves[0] + '-' + halves[1]
    
class Time:
    """
    Date rewriter for clock times written as ``H:M``.

    Attributes:
        time: the time token, two integers separated by one colon.
        entext: optional English text; stored but not read by this class.
    """

    def __init__(self, time=None, entext=None):
        self.time, self.entext = time, entext

    def time2entext(self):
        """Return both fields spelled out by ``num2words``, space-separated.

        Raises:
            ValueError: if the token does not split into exactly two parts on
                ``:`` or a part is not an integer literal.
        """
        hour_part, minute_part = self.time.split(':')
        hour_words = num2words(int(hour_part))
        minute_words = num2words(int(minute_part))
        return hour_words + ' ' + minute_words

class Money:
    """
    Money rewriter for dollar amounts such as ``"$3.50"``.

    Attributes:
        money: the amount token; its first character is dropped before
            parsing.
        entext: optional English text; stored but not read by this class.
    """

    def __init__(self, money=None, entext=None):
        self.money, self.entext = money, entext

    def money2entext(self):
        """Return the amount spelled out, followed by "dollar" or "dollars".

        Everything after the first character is parsed with ``float()``; the
        singular unit is used only when the amount equals 1.
        """
        amount = float(self.money[1:])
        unit = 'dollar' if amount == 1 else 'dollars'
        return num2words(amount) + ' ' + unit


class Percentage:
    """
    Percentage rewriter for tokens such as ``"12.5%"``.

    Attributes:
        percentage: the percentage token; its last character is dropped
            before parsing.
        entext: optional English text; stored but not read by this class.
    """

    def __init__(self, percentage=None, entext=None):
        self.percentage, self.entext = percentage, entext

    def entext2percentage(self):
        """Return ``chn2num`` of the stripped Chinese text, followed by ``%``.

        NOTE(review): this reads ``self.chntext``, which ``__init__`` never
        sets, and calls ``chn2num``, which is not defined in the visible part
        of this file -- looks like a leftover from a Chinese normalizer;
        confirm before use.
        """
        return chn2num(self.chntext.strip().strip('百分之')) + '%'

    def percentage2entext(self):
        """Return the numeric part spelled out, followed by " percent"."""
        value = float(self.percentage[:-1])
        spoken = num2words(value)
        return spoken + ' ' + 'percent'


# ================================================================================ #
#                            NSW Normalizer
# ================================================================================ #
class EN_NSWNormalizer:
    """
    Rewrites numeric non-standard words in English text as spoken words.

    Attributes:
        raw_text: the input text, left untouched.
        norm_text: the result of the last ``normalize()`` call ('' before).

    Every stage rewrites each match at the position where it was found.
    Decimal points are matched literally, so separators such as ``/`` or
    ``-`` never end up inside a value handed to ``float()``.
    """

    # A comma or single space sitting directly between two digits.
    _SEPARATOR = re.compile(r'(?<=\d)[, ](?=\d)')
    # Ordinals and decades must end at a word boundary ("50seconds" is neither).
    _ORDINAL = re.compile(r'\d+(?:st|nd|rd|th)\b')
    _MONEY = re.compile(r'\$\d+(?:\.\d+)?')
    _PERCENT = re.compile(r'\d+(?:\.\d+)?%')
    _DECADE = re.compile(r'\d+s\b')
    _YEAR = re.compile(r'\d+')
    _TIME = re.compile(r'\d+:\d+')
    _RANGE = re.compile(r'(\d+)-(\d+)')
    _DIMENSION = re.compile(r'(\d+)x(\d+)')
    _NUMBER = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.norm_text = ''

    def normalize(self):
        """Normalize ``raw_text``, store the result in ``norm_text`` and return it.

        Stages run in this order: digit-group separators, ordinals, money,
        percentages, decades, years, clock times, ``a-b`` ranges, ``axb``
        dimensions, and finally any remaining plain numbers.
        """
        # Drop thousands separators first so "1,000" is seen as one number.
        text = self._SEPARATOR.sub('', self.raw_text)

        text = self._ORDINAL.sub(
            lambda m: Digit_sort(digit=m.group()).digit2entext(), text)
        text = self._MONEY.sub(
            lambda m: Money(money=m.group()).money2entext(), text)
        text = self._PERCENT.sub(
            lambda m: Percentage(percentage=m.group()).percentage2entext(), text)
        text = self._DECADE.sub(
            lambda m: Decade(decade=m.group()).decade2entext(), text)
        # Year leaves every digit run it does not recognise unchanged.
        text = self._YEAR.sub(
            lambda m: Year(year=m.group()).year2entext(), text)
        text = self._TIME.sub(
            lambda m: Time(time=m.group()).time2entext(), text)

        text = self._RANGE.sub(r'\1 to \2', text)
        text = self._DIMENSION.sub(r'\1 by \2', text)

        # Whatever digits remain are read as plain numbers.
        text = self._NUMBER.sub(
            lambda m: Digit(digit=m.group()).digit2entext(), text)

        self.norm_text = text
        return self.norm_text

class EnglishTextNormalizer:
    """
    Regex-based expander for English fillers, contractions and title
    abbreviations.

    Attributes:
        ignore_patterns: regex for filler words that are deleted.
        replacers: ordered mapping of regex -> replacement text; rules are
            applied in insertion order, so specific rules come before the
            general suffix rules.

    Matching is case-sensitive and all rules are written in lower case.
    """

    def __init__(self):
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"

        rules = {}
        # Whole-word contractions and colloquial forms.
        rules.update({
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
        })
        # Title and prefix abbreviations; each replacement ends with a space.
        rules.update({
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
        })
        # Perfect tenses for a few participles ("'s done" is ambiguous, so
        # only "'d done" is listed).
        rules.update({
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",
            r"'s got\b": " has got",
        })
        # Generic suffix contractions, tried last; "'s" is deliberately left
        # alone.
        rules.update({
            r"n't\b": " not",
            r"'re\b": " are",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        })
        self.replacers = rules

    def __call__(self, s: str):
        """Return ``s`` with fillers removed and contractions expanded.

        Fillers are deleted first, then whitespace in front of an apostrophe
        is removed, then every rule in ``replacers`` is applied in order.
        """
        text = re.sub(self.ignore_patterns, "", s)
        # Re-attach an apostrophe that was separated from its word.
        text = re.sub(r"\s+'", "'", text)
        for regex, substitute in self.replacers.items():
            text = re.sub(regex, substitute, text)
        return text