#!/usr/bin/env python3
# coding=utf-8
# Authors:
#   2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
#   2019.9 Jiayu DU
#
# requirements:
#   - python 3.X
# notes: python 2.X WILL fail or produce misleading results

import sys, os, argparse, codecs, string, re
from num2words import num2words
# ================================================================================ #
#                          different types of rewriters
# ================================================================================ #
class Cardinal:
    """
    CARDINAL class.

    Pairs a cardinal number (``cardinal``) with a text form (``chntext``)
    and converts between the two by delegating to the module-level helpers
    ``chn2num`` and ``num2chn``.

    NOTE(review): neither helper is defined in the visible part of this
    file; presumably they come from the Chinese normalizer this module was
    adapted from -- confirm they are in scope before calling these methods.
    """

    def __init__(self, cardinal=None, chntext=None):
        # Both fields are optional; each conversion only reads the one it needs.
        self.chntext = chntext
        self.cardinal = cardinal

    def chntext2cardinal(self):
        """Return ``chn2num`` applied to the stored ``chntext``."""
        source_text = self.chntext
        return chn2num(source_text)

    def cardinal2chntext(self):
        """Return ``num2chn`` applied to the stored ``cardinal``."""
        source_number = self.cardinal
        return num2chn(source_number)

class Digit:
    """
    DIGIT class.

    Wraps a numeric string (``digit``) and spells it out in Spanish with
    ``num2words``. ``estext`` is stored as given and is not read by the
    conversion.
    """

    def __init__(self, digit=None, estext=None):
        self.estext = estext
        self.digit = digit

    def digit2estext(self):
        """
        Return the Spanish wording of ``self.digit``.

        A string containing ``'.'`` is parsed as a float, anything else as
        an int; ``float()``/``int()`` raise ``ValueError`` on malformed
        input.
        """
        raw = self.digit
        value = float(raw) if '.' in raw else int(raw)
        return num2words(value, lang='es')
    
    
# class Digit_sort:
#     """
#     Sort类
#     """
#     def __init__(self, digit=None, entext=None):
#         self.digit = digit
#         self.entext = entext
#         self.dicts = {
#             'one st':'first','two nd':'second','three rd':'third','four th':'fourth','five th':'fifth',
#             'six th':'sixth','seven th':'seventh','eighe th':'eighth','nine th':'ninth','ten th':'tenth',
#             'eleven th':'eleventh','twelve th':'twelfth','thirteen th':'thirteenth','fourteen th':'fourteenth','fifteen th':'fifteenth',
#             'sixteen th':'sixteenth','seventeen th':'seventeenth','eighteen th':'eighteenth','nineteen th':'nineteenth','twenty th':'twentieth',
#             'thirty th':'thirty','forty th':'fortieth','fifty th':'fiftith','sixty th':'sixtieth','seventy th':'seventieth',
#             'eighty th':'eightieth','ninety th':'ninetieth','hundred th':'hundredth','thousand th':'thousandth'}
    
#     def digit2entext(self):
#         letters = re.findall('[a-zA-Z]+',self.digit)[0]
#         numbers = re.findall('[0-9]+',self.digit)[0]
#         words = num2words(int(numbers))
#         entext = num2words(int(numbers)) + ' ' + letters
#         new_entext = entext
#         for key in self.dicts.keys():
#             if key in entext:
#                 new_entext = entext.replace(key,self.dicts[key])
#                 break            
#         if new_entext == entext:
#             new_entext = words + letters
#         return new_entext
        

# class Decade:
#     """
#     DATE类 Decade
#     """

#     def __init__(self, decade=None, entext=None):
#         self.decade = decade
#         self.entext = entext
#         self.dicts ={
#             'one':'first','two':'second','three':'third','four':'fourth','five':'fifth',
#             'six':'sixth','seven':'seventh','eight':'eighth','nine':'ninth','ten':'tenth',
#             'eleven':'eleventh','twelve':'twelfth','thirteen':'thirteenth','fourteen':'fourteen','fifteen':'fifteenth',
#             'sixteen':'sixteenth','seventeen':'seventeenth','eighteen':'eighteenth','nineteen':'nineteenth','twenty':'twentieth',
#             'thirty':'thirty','forty':'fortieth','fifty':'fiftith','sixty':'sixtieth','seventy':'seventieth',
#             'eighty':'eightieth','ninety':'ninetieth'}
#     def decade2entext(self):
#         century_text = ''
#         # if len(self.decade)>5:
#         #     return self.decade
#         if len(self.decade) >3:
#             century = self.decade[:-3]
#             self.decade = self.decade[-3:]
#             century_string = num2words(int(century)+1)
#             if century_string in self.dicts.keys():
#                 century_text = f"{self.dicts[century_string]} century"
#             else:
#                 prefix, suffix = century_string.split('-')
#                 suffix = self.dicts[suffix]
#                 century_text = f"{prefix}-{suffix} century"
#         years_string = ''
#         if len(self.decade)==3:
#             years = self.decade[:-1]
#             if years not in ['00','10']:
#                 years_string = num2words(int(years))[:-2] + 'ies'
#             elif years =='10':
#                 years_string = 'tenes'
#             else:
#                 years_string = ''
#         if years_string and century_text:
#             return years_string + ' of ' +  century_text
#         elif years_string and not century_text:
#             return years_string
#         elif not years_string and century_text:
#             return century_text
#         else:
#             return self.decade
    
    
# class Year:
#     """
#     DATE类 Year
#     """
#     def __init__(self, year=None, entext=None):
#         self.year = year
#         self.entext = entext

#     def year2entext(self):
#         if len(self.year) == 4 and self.year[:2] in ['18','19','20']:
#             years_string = num2words(int(self.year[:2])) + '-' + num2words(int(self.year[2:]))
#         else:
#             years_string = self.year
#         return years_string
    
class Time:
    """
    DATE class: clock time.

    Wraps a ``"<hours>:<minutes>"`` string (``time``) and spells both parts
    out in Spanish with ``num2words``. ``estext`` is stored as given and is
    not read by the conversion.
    """

    def __init__(self, time=None, estext=None):
        self.time = time
        # Store the parameter actually received; referencing any other name
        # here would raise NameError on every construction.
        self.estext = estext

    def time2estext(self):
        """
        Return the hour and minute of ``self.time`` as Spanish words,
        separated by a single space.

        Raises ``ValueError`` if ``self.time`` does not consist of exactly
        two integer fields separated by one ``':'``.
        """
        clock, minute = self.time.split(':')
        hour_words = num2words(int(clock), lang='es')
        minute_words = num2words(int(minute), lang='es')
        return hour_words + ' ' + minute_words

class Money:
    """
    MONEY class ($ amounts).

    Wraps a string made of one leading currency character followed by a
    number (for example ``"$5"`` or ``"$5.50"``) and spells the amount out
    in Spanish with ``num2words``. ``entext`` is stored as given and is not
    read by the conversion.
    """

    def __init__(self, money=None, entext=None):
        self.money = money
        self.entext = entext

    def money2estext(self):
        """
        Return the amount in Spanish words followed by the currency word.

        The first character of ``self.money`` is dropped as the currency
        symbol. An amount containing ``'.'`` is parsed as a float, anything
        else as an int; ``float()``/``int()`` raise ``ValueError`` on
        malformed input.
        """
        amount = self.money[1:]
        num = float(amount) if '.' in amount else int(amount)
        # NOTE(review): the currency word is English while the number is
        # Spanish -- confirm whether 'dólar'/'dólares' is wanted downstream.
        unit = 'dollar' if num == 1 else 'dollars'
        return num2words(num, lang='es') + ' ' + unit

    # Historical name of the conversion, kept so existing callers still work.
    money2entext = money2estext


class Percentage:
    """
    PERCENTAGE class.

    Wraps a percentage string ending in one trailing character (for example
    ``"12%"`` or ``"12.5%"``) and spells it out in Spanish with
    ``num2words``. ``estext`` is stored as given and is not read by
    ``percentage2estext``.
    """

    def __init__(self, percentage=None, estext=None):
        self.percentage = percentage
        self.estext = estext

    def entext2percentage(self):
        """Return ``chn2num`` of the stripped text with ``'%'`` appended."""
        # NOTE(review): this reads ``self.chntext``, which ``__init__`` never
        # sets, and calls ``chn2num``, which is not defined in the visible
        # part of this file; it looks like a leftover from the Chinese
        # normalizer -- confirm before relying on it.
        return chn2num(self.chntext.strip().strip('百分之')) + '%'

    def percentage2estext(self):
        """
        Return the percentage in Spanish words followed by ``'porciento'``.

        The last character of ``self.percentage`` is dropped as the percent
        sign. A value containing ``'.'`` is parsed as a float, anything else
        as an int; ``float()``/``int()`` raise ``ValueError`` on malformed
        input.
        """
        amount = self.percentage[:-1]
        num = float(amount) if '.' in amount else int(amount)
        return num2words(num, lang='es') + ' ' + 'porciento'


# ================================================================================ #
#                            NSW Normalizer
# ================================================================================ #
class ES_NSWNormalizer:
    """
    Normalizer for non-standard words (NSW) in Spanish text.

    ``normalize`` rewrites the numeric expressions found in ``raw_text``
    by running a fixed sequence of regex passes, stores the result in
    ``norm_text`` and returns it. The passes run in this order:

    1. remove ``'.'`` and single spaces that sit directly between digits;
    2. turn a ``','`` between digits into ``'.'``;
    3. money (``$`` amounts), via ``Money``;
    4. percentages, via ``Percentage``;
    5. clock times (``H:M``), via ``Time``;
    6. ``-`` and ``x`` between digits become ``', '``;
    7. remaining plain numbers, via ``Digit``.
    """

    # All dots below are escaped on purpose: an unescaped '.' matches any
    # character, which made the separator loop spin forever on inputs such
    # as "100" and let strings like "5-3" reach int().
    _DIGIT_SEPARATOR = re.compile(r'(?<=\d)[. ](?=\d)')
    _DECIMAL_COMMA = re.compile(r'(?<=\d),(?=\d)')
    _MONEY = re.compile(r'\$\d+(?:\.\d+)?')
    _PERCENTAGE = re.compile(r'\d+(?:\.\d+)?%')
    _TIME = re.compile(r'\d+:\d+')
    _NUMBER_JOINER = re.compile(r'(?<=\d)[-x](?=\d)')
    _NUMBER = re.compile(r'\d+(?:\.\d+)?')

    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.norm_text = ''

    def normalize(self):
        """
        Normalize ``self.raw_text`` and return the rewritten string.

        The result is also stored in ``self.norm_text``. Text containing no
        digits is returned unchanged. Exceptions raised by the rewriter
        classes propagate to the caller.
        """
        text = self.raw_text

        # Strip '.' and single spaces between digits ("1.000" / "1 000" ->
        # "1000"). Lookarounds make a single pass sufficient: a separator is
        # removed only when both neighbours are digits, so removing one can
        # never expose another.
        text = self._DIGIT_SEPARATOR.sub('', text)

        # Standardize the decimal mark: "3,5" -> "3.5".
        text = self._DECIMAL_COMMA.sub('.', text)

        # The ordinal, decade and year passes of the upstream normalizer are
        # intentionally not applied here.

        # Money. ``money2entext`` is the conversion method Money exposes.
        text = self._MONEY.sub(
            lambda m: Money(money=m.group()).money2entext(), text)

        # Percentages.
        text = self._PERCENTAGE.sub(
            lambda m: Percentage(percentage=m.group()).percentage2estext(),
            text)

        # Clock times.
        text = self._TIME.sub(
            lambda m: Time(time=m.group()).time2estext(), text)

        # Ranges and dimensions: "1-2" / "3x4" -> "1, 2" / "3, 4". Every
        # joiner between digits is rewritten, including chains like "1-2-3".
        text = self._NUMBER_JOINER.sub(', ', text)

        # Remaining plain numbers, integer or decimal, matched as a whole so
        # "1000" is not split and "1.25" keeps all of its decimals.
        text = self._NUMBER.sub(
            lambda m: Digit(digit=m.group()).digit2estext(), text)

        self.norm_text = text
        return self.norm_text
