import os
import os.path as osp
import numpy as np
from PIL import Image, ImageFile

import torch
from torch.utils.data import Dataset

from nerv.utils import glob_all, load_obj

#from .utils import BaseTransforms

from .langtable import LangtableInstDataset, LangtableSlotsDataset
from .utils import BaseTransforms

ImageFile.LOAD_TRUNCATED_IMAGES = True

class LangtableActionDataset(LangtableInstDataset):
    """Langtable dataset yielding frames, instructions and a single action.

    Extends ``LangtableInstDataset`` with per-episode actions loaded from
    ``<data_root>/actions/actions.npy``. Every sample carries the action at
    step ``start_idx + input_frames - 1`` of its episode.
    """

    def __init__(
        self,
        data_root,
        split,
        langtable_transform,
        n_sample_frames=16,
        frame_offset=None,
        video_len=50,
        input_frames=6,
    ):
        """Set up the parent dataset and load all episode actions.

        Args:
            data_root: dataset root; must contain ``actions/actions.npy``.
            split: forwarded unchanged to ``LangtableInstDataset``.
            langtable_transform: forwarded unchanged to the parent.
            n_sample_frames: forwarded unchanged to the parent.
            frame_offset: forwarded unchanged to the parent.
            video_len: forwarded unchanged to the parent.
            input_frames: offset (minus one) from a sample's start index
                used to pick the action returned for that sample.
        """
        super().__init__(
            data_root=data_root,
            split=split,
            langtable_transform=langtable_transform,
            n_sample_frames=n_sample_frames,
            frame_offset=frame_offset,
            video_len=video_len,
        )
        self.input_frames = input_frames

        # Actions for every episode live in one file under <data_root>/actions.
        self.act_root = osp.join(data_root, "actions")
        action_file = osp.join(self.act_root, "actions.npy")
        # SECURITY: allow_pickle=True can execute arbitrary code while
        # unpickling, so only load action files from a trusted source.
        # NOTE(review): presumably required because the file stores one array
        # per episode (object array) - confirm.
        per_episode = np.load(action_file, allow_pickle=True)
        self.actions = list(map(torch.from_numpy, per_episode))

    def _read_actions(self, idx):
        """Return the action at the last input step of sample ``idx``."""
        episode_dir, first_step = self.valid_idx[idx]
        # The episode folder name is used as the integer index into
        # self.actions.
        episode_id = int(osp.basename(episode_dir))
        last_input_step = first_step + self.input_frames - 1
        # Per the original comment, each entry is [Length, act_dim].
        return self.actions[episode_id][last_input_step]

    def _read_insts_raw(self, idx):
        """Load ``<inst_root>/<episode folder name>.npy`` as a tensor.

        ``self.inst_root`` is not assigned in this class; it is expected to
        be provided by the parent class.
        """
        episode_dir, _ = self.valid_idx[idx]
        inst_name = osp.basename(episode_dir) + ".npy"
        inst_path = osp.join(self.inst_root, inst_name)
        return torch.tensor(np.load(inst_path))

    def __getitem__(self, idx):
        """Assemble the sample dict for index ``idx``.

        Data dict:
            - data_idx: int, the requested index
            - img: frames from ``_read_frames`` ([T, 3, H, W] per the
              original docstring)
            - instruction: result of ``_read_insts``
            - actions: single action from ``_read_actions``
            - instruction_raw: tensor from ``_read_insts_raw``
        """
        # Dict values are evaluated top to bottom, so the readers run in the
        # order frames -> instruction -> actions -> raw instruction.
        return {
            'data_idx': idx,
            'img': self._read_frames(idx),
            'instruction': self._read_insts(idx),
            'actions': self._read_actions(idx),
            'instruction_raw': self._read_insts_raw(idx),
        }

class LangtableActionSlotsDataset(LangtableSlotsDataset):
    """Langtable dataset yielding frames, slots, instructions and actions.

    Extends ``LangtableSlotsDataset`` with per-episode actions loaded from
    ``<data_root>/actions/actions.npy``. Every sample carries the
    ``n_sample_frames`` consecutive actions starting at its start index.
    """

    def __init__(
        self,
        data_root,
        video_slots,
        split,
        langtable_transform,
        n_sample_frames=16,
        frame_offset=None,
        video_len=50,
        input_frames=6,
    ):
        """Set up the parent dataset and load all episode actions.

        Args:
            data_root: dataset root; must contain ``actions/actions.npy``.
            video_slots: forwarded unchanged to ``LangtableSlotsDataset``.
            split: forwarded unchanged to the parent.
            langtable_transform: forwarded unchanged to the parent.
            n_sample_frames: forwarded unchanged to the parent.
            frame_offset: forwarded unchanged to the parent.
            video_len: forwarded unchanged to the parent.
            input_frames: stored on the instance; not read by any method
                defined in this class.
        """
        super().__init__(
            data_root=data_root,
            video_slots=video_slots,
            split=split,
            langtable_transform=langtable_transform,
            n_sample_frames=n_sample_frames,
            frame_offset=frame_offset,
            video_len=video_len,
        )
        self.input_frames = input_frames

        # Actions for every episode live in one file under <data_root>/actions.
        self.act_root = osp.join(data_root, "actions")
        action_file = osp.join(self.act_root, "actions.npy")
        # SECURITY: allow_pickle=True can execute arbitrary code while
        # unpickling, so only load action files from a trusted source.
        # NOTE(review): presumably required because the file stores one array
        # per episode (object array) - confirm.
        per_episode = np.load(action_file, allow_pickle=True)
        self.actions = list(map(torch.from_numpy, per_episode))

    def _read_actions(self, idx):
        """Return the window of actions belonging to sample ``idx``."""
        episode_dir, first_step = self.valid_idx[idx]
        # The episode folder name is used as the integer index into
        # self.actions.
        episode_id = int(osp.basename(episode_dir))
        window_end = first_step + self.n_sample_frames
        # Per the original comments: episode is [Length, act_dim] and the
        # result is [n_sample_frames, act_dim].
        # NOTE(review): the slice uses stride 1 and does not involve
        # frame_offset - confirm this matches how the parent samples frames.
        return self.actions[episode_id][first_step:window_end]

    def _read_insts_raw(self, idx):
        """Load ``<inst_root>/<episode folder name>.npy`` as a tensor.

        ``self.inst_root`` is not assigned in this class; it is expected to
        be provided by the parent class.
        """
        episode_dir, _ = self.valid_idx[idx]
        inst_name = osp.basename(episode_dir) + ".npy"
        inst_path = osp.join(self.inst_root, inst_name)
        return torch.tensor(np.load(inst_path))

    def __getitem__(self, idx):
        """Assemble the sample dict for index ``idx``.

        Data dict:
            - data_idx: int, the requested index
            - img: frames from ``_read_frames`` ([T, 3, H, W] per the
              original docstring)
            - slots: result of ``_read_slots``
            - instruction: result of ``_read_insts_word``
            - actions: action window from ``_read_actions``
            - instruction_raw: tensor from ``_read_insts_raw``
        """
        # Dict values are evaluated top to bottom, so the readers run in the
        # order frames -> slots -> instruction -> actions -> raw instruction.
        # 'mask' and 'instruction_text' entries are currently not produced.
        return {
            'data_idx': idx,
            'img': self._read_frames(idx),
            'slots': self._read_slots(idx),
            'instruction': self._read_insts_word(idx),
            'actions': self._read_actions(idx),
            'instruction_raw': self._read_insts_raw(idx),
        }


def build_langtable_action_dataset(params, val_only=False):
    """Build the Langtable action dataset(s) described by ``params``.

    Args:
        params: config object providing ``model``, ``resolution``,
            ``data_root``, ``n_sample_frames``, ``frame_offset``,
            ``video_len`` and ``input_frames``.
        val_only: when true, build and return only the validation dataset.

    Returns:
        The validation ``LangtableActionDataset`` if ``val_only`` is true,
        otherwise a ``(train_dataset, val_dataset)`` tuple.
    """
    if params.model != 'RoboticsTransformer':
        frame_transform = BaseTransforms(params.resolution)
    else:
        # Only convert to a tensor and resize here; per the original note,
        # the remaining image processing is left to a HuggingFace processor.
        import torchvision.transforms as T
        frame_transform = T.Compose([
            T.ToTensor(),  # [3, H, W]
            T.Resize(params.resolution),
        ])

    # Everything except `split` is shared by both splits (including the
    # transform instance itself).
    shared_kwargs = {
        'data_root': params.data_root,
        'langtable_transform': frame_transform,
        'n_sample_frames': params.n_sample_frames,
        'frame_offset': params.frame_offset,
        'video_len': params.video_len,
        'input_frames': params.input_frames,
    }

    # The validation split is always built first.
    val_dataset = LangtableActionDataset(split='val', **shared_kwargs)
    if val_only:
        return val_dataset

    train_dataset = LangtableActionDataset(split='train', **shared_kwargs)
    return train_dataset, val_dataset

def build_langtable_action_slots_dataset(params, val_only=False):
    """Build the Langtable action dataset(s) with pre-computed slots.

    Args:
        params: config object providing ``slots_root``, ``data_root``,
            ``resolution``, ``n_sample_frames``, ``frame_offset`` and
            ``video_len``. ``input_frames`` is optional and falls back to 6,
            the dataset's own default.
        val_only: when true, build and return only the validation dataset.

    Returns:
        The validation ``LangtableActionSlotsDataset`` if ``val_only`` is
        true, otherwise a ``(train_dataset, val_dataset)`` tuple.
    """
    # The loaded object is indexed by split name ('val', and 'train' when
    # the training split is requested).
    slots = load_obj(params.slots_root)
    args = dict(
        data_root=params.data_root,
        video_slots=slots['val'],
        split='val',
        langtable_transform=BaseTransforms(params.resolution),
        n_sample_frames=params.n_sample_frames,
        frame_offset=params.frame_offset,
        video_len=params.video_len,
        # Forward the configured value, as build_langtable_action_dataset
        # does, instead of silently using the constructor default; configs
        # without the field keep the previous behaviour (6).
        input_frames=getattr(params, 'input_frames', 6),
    )
    val_dataset = LangtableActionSlotsDataset(**args)
    if val_only:
        return val_dataset

    # Reuse the same arguments for training, swapping only the split and
    # its slots; slots['train'] is touched only when actually needed.
    args['split'] = 'train'
    args['video_slots'] = slots['train']
    train_dataset = LangtableActionSlotsDataset(**args)
    return train_dataset, val_dataset

