import sys

sys.path.append("src/clap")

import os
import torch
import librosa
from open_clip import create_model
from training.data import get_audio_features
from training.data import int16_to_float32, float32_to_int16
from transformers import RobertaTokenizer

# Shared RoBERTa tokenizer instance, created once at import time from the
# "roberta-base" pretrained vocabulary and reused by `tokenizer()` below.
tokenize = RobertaTokenizer.from_pretrained("roberta-base")


def tokenizer(text):
    """Tokenize `text` with the module-level RoBERTa tokenizer.

    The input is padded/truncated to exactly 77 tokens and returned as
    PyTorch tensors.

    Args:
        text: The text to tokenize; it is forwarded unchanged to the
            tokenizer (the caller in this file passes a list of strings).

    Returns:
        A plain dict mapping every field produced by the tokenizer to its
        tensor, with `squeeze(0)` applied (the leading dimension is removed
        only when it has size 1).
    """
    encoded = tokenize(
        text,
        max_length=77,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    squeezed = {}
    for field, tensor in encoded.items():
        # squeeze(0) is a no-op unless the leading dimension has size 1.
        squeezed[field] = tensor.squeeze(0)
    return squeezed


# NOTE: both paths below are absolute, machine-specific locations; edit them
# to point at your own checkpoint / audio file before running this script.
# Checkpoint handed to `create_model` as the `pretrained` argument.
PRETRAINED_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/checkpoints/epoch_top_0_audioset_no_fusion.pt"
# Audio file loaded by `infer_audio` (librosa resamples it to 48 kHz on load).
WAVE_48k_PATH = "/mnt/fast/nobackup/users/hl01486/projects/contrastive_pretraining/CLAP/assets/audio/machine.wav"


def infer_text():
    """Build the model from PRETRAINED_PATH and print the text-embedding size.

    Two sample sentences are tokenized for RoBERTa, embedded with
    `model.get_text_embedding`, and the size of the resulting tensor is
    printed. Runs on "cuda:0" when CUDA is available, otherwise on the CPU.
    """
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"

    # The returned model config is not needed for text inference.
    model, _ = create_model(
        "HTSAT-tiny",  # audio encoder; 'PANN-14' is the other option
        "roberta",  # text encoder (the best one in the authors' training)
        PRETRAINED_PATH,
        precision="fp32",
        device=device,
        enable_fusion=False,  # keep False when the fusion model is not wanted
        fusion_type="aff_2d",
    )

    # A list of sentences acts as a batch.
    sentences = ["I love the contrastive learning", "I love the pretrain model"]

    # RoBERTa tokenization; for a different text encoder see data.py#L43-90.
    tokens = tokenizer(sentences)

    embeddings = model.get_text_embedding(tokens)
    print(embeddings.size())


def infer_audio():
    """Build the model from PRETRAINED_PATH and print the audio-embedding size.

    The waveform at WAVE_48k_PATH is loaded at 48 kHz, round-tripped through
    int16 to quantize it, converted into model features with
    `get_audio_features`, and embedded with `model.get_audio_embedding`.
    The size of the resulting tensor is printed. Runs on "cuda:0" when CUDA
    is available, otherwise on the CPU.

    The leftover interactive `ipdb` breakpoints have been removed so the
    function runs to completion unattended and no longer requires `ipdb`.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    precision = "fp32"
    amodel = "HTSAT-tiny"  # or 'PANN-14'
    tmodel = "roberta"  # the best text encoder in our training
    enable_fusion = False  # False if you do not want to use the fusion model
    fusion_type = "aff_2d"
    pretrained = PRETRAINED_PATH

    model, model_cfg = create_model(
        amodel,
        tmodel,
        pretrained,
        precision=precision,
        device=device,
        enable_fusion=enable_fusion,
        fusion_type=fusion_type,
    )

    # Load the waveform with shape (T,); librosa resamples it to 48000 Hz.
    audio_waveform, sr = librosa.load(WAVE_48k_PATH, sr=48000)
    # Quantize: float32 -> int16 -> float32 round trip.
    audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
    audio_waveform = torch.from_numpy(audio_waveform).float()
    audio_dict = {}

    # The 'fusion' truncate mode can be changed to 'rand_trunc' when running
    # without fusion.
    # 480000 samples = 10 s at 48 kHz; presumably the maximum clip length
    # expected by get_audio_features -- TODO confirm against training/data.py.
    audio_dict = get_audio_features(
        audio_dict,
        audio_waveform,
        480000,
        data_truncating="fusion",
        data_filling="repeatpad",
        audio_cfg=model_cfg["audio_cfg"],
    )
    # A list of feature dicts acts as a batch, so several audio tracks can be
    # embedded in one call.
    audio_embed = model.get_audio_embedding([audio_dict])
    print(audio_embed.size())


# Script entry point: run the text inference demo, then the audio one.
# Each call builds its own model instance from PRETRAINED_PATH.
if __name__ == "__main__":
    infer_text()
    infer_audio()
