# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
import json

from fairseq import utils
from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder

# from examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader
from examples.textless_nlp.gslm.speech2unit.pretrained.hubert_feature_reader import HubertFeatureReader
from examples.hubert.simple_kmeans.dump_km_label import ApplyKmeans


# Hubert tokenizer
class HubertTokenizer:
    def __init__(
                self,
                hubert_path,
                hubert_layer,
                km_path,
                use_cuda=True,
            ):
        self.feature_extractor = HubertFeatureReader(hubert_path, hubert_layer, use_cuda=use_cuda)
        self.quantizer = ApplyKmeans(km_path)
        if not use_cuda:
            self.quantizer.C = self.quantizer.C.cpu()
            self.quantizer.Cnorm = self.quantizer.Cnorm.cpu()

    def wav2code(self, path, channel_id=1):
        feat = self.feature_extractor.get_feats(path, channel_id=channel_id)
        code = self.quantizer(feat)
        return ' '.join(map(str, code))

    def wav2codes(self, path):
        codes = [
            self.wav2code(path, channel_id=1),
            self.wav2code(path, channel_id=2)
        ]
        return codes


# Vocoder
class HifiganVocoder:
    def __init__(
                self,
                vocoder_path,
                vocoder_cfg_path,
                use_cuda=True,
            ):
        with open(vocoder_cfg_path) as f:
            cfg = json.load(f)
        self.vocoder = CodeHiFiGANVocoder(vocoder_path, cfg).eval()
        self.use_cuda = use_cuda
        if self.use_cuda:
            self.vocoder.cuda()

    def code2wav(self, code, speaker_id=0, pred_dur=False):
        if isinstance(code, str):
            code = list(map(int, code.split()))
        inp = {"code": torch.LongTensor(code).view(1, -1)}
        if self.vocoder.model.multispkr:
            inp["spkr"] = torch.LongTensor([speaker_id]).view(1, 1)
        if self.use_cuda:
            inp = utils.move_to_cuda(inp)
        return self.vocoder(inp, pred_dur).detach().cpu().numpy()

    def codes2wav(self, codes, speaker_ids=[0, 4], pred_dur=False):
        if isinstance(codes, dict):
            codes = list(codes.values())
        assert len(codes) == 2
        wav1 = self.code2wav(codes[0], speaker_ids[0], pred_dur)
        wav2 = self.code2wav(codes[1], speaker_ids[1], pred_dur)
        wav = np.stack([wav1, wav2])
        return wav
