"""https://github.com/hschen0712/textgrid-parser/blob/master/parse_textgrid.py"""
from typing import List
from collections import OrderedDict
import re
import os
import json

# Directory roots used by the script entry point at the bottom of this file:
#   PRAAT_PATH is walked recursively for files ending in ".TextGrid";
#   JSON_PATH receives one ".json" file per input, in the same sub-directory layout.
#
# Alternative directory pair, kept commented out; swap it with the active pair to use it.
# PRAAT_PATH = "/data/librispeech/LibriSpeech-TextGrid/Praat/test-clean"
# JSON_PATH = "/data/librispeech/LibriSpeech-TextGrid/Json/test-clean"
PRAAT_PATH = "/data/librispeech/LibriSpeech-TextGrid/Praat/test-other"
JSON_PATH = "/data/librispeech/LibriSpeech-TextGrid/Json/test-other"


def remove_empty_lines(lines: List) -> List:
    """Strip every line and drop the ones that end up empty.

    Args:
        lines: Non-empty list of lines; each entry must provide ``strip()``.

    Returns:
        A new list holding the stripped lines, with every entry equal to
        ``""`` removed. The input list itself is not modified.

    Raises:
        AssertionError: If ``lines`` is empty or is not a ``list``.

    Note:
        The comparison is against the ``str`` literal ``""``, so blank
        ``bytes`` lines (``b""``) are kept, exactly as before. The
        ``TextGrid`` parser in this file advances three lines after the
        ``File type`` header, which presumably accounts for such a retained
        blank line when the file is read in binary mode -- verify that
        before extending this function to drop ``b""`` as well.
    """
    assert (len(lines) > 0) and isinstance(lines, list)
    stripped = [t.strip() for t in lines]
    # list.remove("") would delete only the first blank entry; filter them all.
    return [t for t in stripped if t != ""]


class TextGrid(object):
    """Sequential parser for the lines of a Praat TextGrid file.

    The constructor consumes ``lines`` front to back: file type, global
    ``xmin``/``xmax``, tier count, then each tier with its intervals. Only
    tiers whose class is ``"IntervalTier"`` are supported.

    Attributes set by the constructor:
        lines: The list of lines being parsed (as passed in).
        line_count: 0-based index of the next line to be parsed.
        file_type: Value of the ``File type = "..."`` header (str).
        xmin, xmax: Global time bounds, kept as the strings found in the file.
        size: Number of tiers (int).
        tier_list: One ``OrderedDict`` per tier with keys ``idx``, ``class``,
            ``name``, ``xmin``, ``xmax``, ``size`` and ``items``; ``items``
            is a list of ``OrderedDict`` with keys ``idx``, ``xmin``,
            ``xmax`` and ``text``. All of these values are strings.

    Args:
        lines: Already-stripped lines, either ``bytes`` (decoded as UTF-8 on
            the fly) or ``str``. The fixed line offsets used by the header
            parsing expect the blank line after ``Object class`` to still be
            present as an (empty) entry.

    Raises:
        ValueError: If a line does not match the expected pattern or the
            input ends before the structure is complete.
        NotImplementedError: If a tier is not an ``IntervalTier``.
    """

    def __init__(self, lines: List):
        self.lines = lines
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """Match ``pattern`` at the current line and advance the cursor.

        Args:
            pattern: Regular expression with one capture group, applied with
                ``re.match`` (i.e. anchored at the start of the line).
            inc: Number of lines to advance after a successful match.

        Returns:
            The text captured by group 1.

        Raises:
            ValueError: If the input is exhausted or the current line does
                not match; the cursor is left where it was.
        """
        if self.line_count >= len(self.lines):
            # Running off the end used to surface as IndexError, which the
            # ValueError-based recovery in _get_item_list could not catch.
            raise ValueError("File format error at line %d:%s" % (self.line_count, "<end of file>"))
        raw = self.lines[self.line_count]
        text = raw.decode("utf-8") if isinstance(raw, (bytes, bytearray)) else raw
        # Anything that is not text after decoding is reported as a format error.
        found = re.match(pattern, text) if isinstance(text, str) else None
        if found is None:
            raise ValueError("File format error at line %d:%s" % (self.line_count, raw))
        self.line_count += inc
        return found.group(1)

    def _get_type(self):
        """Read the file type; skips 3 lines to land on the global ``xmin``."""
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 3)

    def _get_time_intval(self):
        """Read the global ``xmin``/``xmax``; skips one extra line after ``xmax``."""
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        """Read the tier count; skips one extra line to land on the first tier."""
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Parse ``self.size`` tiers into ``self.tier_list``.

        Only supports IntervalTier currently.

        Raises:
            NotImplementedError: If a tier's class is not ``"IntervalTier"``.
            ValueError: If a tier header or a started interval is malformed.
        """
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)

            item_list = []
            for _ in range(int(tier_size)):
                item = OrderedDict()
                try:
                    item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
                except ValueError:
                    # Fewer intervals than declared (including end of input):
                    # keep what was read and move on to the next tier.
                    break
                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
                item_list.append(item)

            tier = OrderedDict()
            tier["idx"] = tier_idx
            tier["class"] = tier_class
            tier["name"] = tier_name
            tier["xmin"] = tier_xmin
            tier["xmax"] = tier_xmax
            # The declared size is kept as-is, even if fewer items were read.
            tier["size"] = tier_size
            tier["items"] = item_list
            self.tier_list.append(tier)

    def toJson(self):
        """Serialize the parsed grid.

        Returns:
            UTF-8 encoded ``bytes`` of a JSON object (2-space indent,
            non-ASCII characters left unescaped) with keys ``file_type``,
            ``xmin``, ``xmax``, ``size`` and ``tiers``, in that order.
        """
        _json = OrderedDict()
        _json["file_type"] = self.file_type
        _json["xmin"] = self.xmin
        _json["xmax"] = self.xmax
        _json["size"] = self.size
        _json["tiers"] = self.tier_list
        return json.dumps(_json, ensure_ascii=False, indent=2).encode("utf-8")


if __name__ == '__main__':
    # Pass 1: pair every *.TextGrid file under PRAAT_PATH with an output path
    # in the same relative location under JSON_PATH. Collecting the pairs
    # first lets the progress output below report a total.
    textgrid_suffix = ".TextGrid"
    praat_paths = []
    json_paths = []
    for root, dirs, files in os.walk(PRAAT_PATH):
        # Derive the mirrored directory from the relative path instead of a
        # substring replace, which would rewrite every occurrence of
        # PRAAT_PATH inside the path.
        j_root = os.path.normpath(os.path.join(JSON_PATH, os.path.relpath(root, PRAAT_PATH)))
        for f in files:
            if not f.endswith(textgrid_suffix):
                continue
            # Output directories are only created where there is something to write.
            os.makedirs(j_root, exist_ok=True)
            praat_paths.append(os.path.join(root, f))
            # Swap only the trailing suffix of the file name for ".json".
            json_paths.append(os.path.join(j_root, f[:-len(textgrid_suffix)] + ".json"))

    # Pass 2: parse each TextGrid and write its JSON rendering.
    num_files = len(praat_paths)
    for count, (p_, j_) in enumerate(zip(praat_paths, json_paths)):
        if count % 10 == 0:
            print(f"... {count} / {num_files} (p: {p_})")

        # Binary mode: TextGrid decodes each line as UTF-8 itself.
        with open(p_, "rb") as f:
            text_lines = f.readlines()
        if not text_lines:
            raise IOError(f"Empty TextGrid file: {p_}")

        text_grid = TextGrid(remove_empty_lines(text_lines))
        # toJson() returns UTF-8 bytes, hence the binary write.
        with open(j_, "wb") as tf:
            tf.write(text_grid.toJson())
