import numpy as np
import torch
from tqdm import tqdm
import math
from einops import rearrange
import sys
sys.path.append(".")
from opensora.eval.flolpips.pwcnet import Network as PWCNet
from opensora.eval.flolpips.flolpips import FloLPIPS

# Module-level metric networks, built once at import time and shared by every
# call to calculate_flolpips (which moves them to the requested device).
# Both are switched to eval mode with gradients disabled on their parameters.
# NOTE(review): net='alex' presumably selects an AlexNet feature backbone and
# version='0.1' the matching LPIPS weights -- confirm against FloLPIPS.
loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
# PWCNet is used below to estimate flow between consecutive frames.
flownet = PWCNet().eval().requires_grad_(False)

def trans(x):
    """Return ``x`` unchanged.

    Identity pre-processing hook: ``calculate_flolpips`` passes both video
    batches through this function before scoring, so any input transform
    would be added here.
    """
    return x


def calculate_flolpips(videos1, videos2, device):
    """Compute per-frame FloLPIPS scores between two batches of videos.

    For every video, flow is estimated between consecutive frames of each
    clip with the module-level ``flownet``; the difference of the two flow
    fields is then passed, together with the frames, to the module-level
    ``loss_fn``.

    Args:
        videos1: Tensor indexed as (batch, time, channel, height, width).
            Its frames are used as the reconstructed/distorted side.
        videos2: Tensor with the same layout, used as the ground-truth side.
            If the two shapes differ, both are truncated to the shorter
            number of frames (only the time axis is reconciled).
        device: Device on which the networks and each video are placed.

    Returns:
        dict with keys:
            "value": ``{frame_index: mean score over the batch}``
            "value_std": ``{frame_index: std of the score over the batch}``
            "video_setting": shape of a single video (time, channel, ...)
            "video_setting_name": description of those axes
            "result": ``np.ndarray`` of all scores, one row per video
            "details": the same scores as nested Python lists
        For an empty batch the two per-frame dicts are empty.

    Note:
        ``normalize=True`` is passed to the loss; presumably inputs are
        expected in [0, 1] -- confirm against the FloLPIPS implementation.
    """
    global loss_fn, flownet

    print("calculate_flowlpips...")
    loss_fn = loss_fn.to(device)
    flownet = flownet.to(device)

    if videos1.shape != videos2.shape:
        print("Warning: the shape of videos are not equal.")
        min_frames = min(videos1.shape[1], videos2.shape[1])
        videos1 = videos1[:, :min_frames]
        videos2 = videos2[:, :min_frames]

    videos1 = trans(videos1)
    videos2 = trans(videos2)

    flolpips_results = []
    # Pure inference: without no_grad(), inputs that require grad would make
    # the final .numpy() call raise and would keep autograd state alive.
    with torch.no_grad():
        for video_num in tqdm(range(videos1.shape[0])):
            video1 = videos1[video_num].to(device)
            video2 = videos2[video_num].to(device)
            # Pair every frame with its successor for flow estimation.
            frames_rec, frames_rec_next = video1[:-1], video1[1:]
            frames_gt, frames_gt_next = video2[:-1], video2[1:]
            flow_gt = flownet(frames_gt, frames_gt_next)
            flow_dis = flownet(frames_rec, frames_rec_next)
            flow_diff = flow_gt - flow_dis
            scores = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True)
            flolpips_results.append(scores.cpu().numpy().tolist())

    flolpips_results = np.array(flolpips_results)  # [batch_size, num_frames]

    # An empty batch yields a 1-D empty array with no frame axis to walk.
    num_timestamps = flolpips_results.shape[1] if len(flolpips_results) else 0
    flolpips = {
        clip_timestamp: np.mean(flolpips_results[:, clip_timestamp], axis=-1)
        for clip_timestamp in range(num_timestamps)
    }
    flolpips_std = {
        clip_timestamp: np.std(flolpips_results[:, clip_timestamp], axis=-1)
        for clip_timestamp in range(num_timestamps)
    }

    result = {
        "value": flolpips,
        "value_std": flolpips_std,
        # Per-video shape; taken from the batch so it is defined even when
        # the loop above never ran.
        "video_setting": videos1.shape[1:],
        "video_setting_name": "time, channel, heigth, width",
        "result": flolpips_results,
        "details": flolpips_results.tolist()
    }

    return result

# test code / usage example

def main():
    """Run a smoke test of ``calculate_flolpips`` on all-zero videos.

    Builds two identical batches of zero-valued clips, scores them, and
    prints the result dictionary as JSON. Uses ``cuda:0`` when CUDA is
    available and falls back to the CPU otherwise.
    """
    import json

    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    def _to_jsonable(obj):
        # The result holds NumPy arrays/scalars, which json cannot encode
        # natively; convert them to plain Python containers and numbers.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    result = calculate_flolpips(videos1, videos2, device)
    print(json.dumps(result, indent=4, default=_to_jsonable))

# Run the usage example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()