"""
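Converts data from the BridgeData raw format into per-step JPEG images plus a
JSON index of conversation-style entries (image path, language instruction,
raw actions and raw states).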

Consider the following directory structure for the input data:
bridgedata_raw/
    rss/
        toykitchen2/
            set_table/
                00/
                    2022-01-01_00-00-00/
                        collection_metadata.json
                        config.json
                        diagnostics.png
                        raw/
                            traj_group0/
                                traj0/
                                    obs_dict.pkl
                                    policy_out.pkl
                                    agent_data.pkl
                                    images0/
                                        im_0.jpg
                                        im_1.jpg
                                        ...
                                ...
                            ...
                01/
                ...
The --depth parameter controls how much of the data to process at the
--input_path; for example, if --depth=5, then --input_path should be
"bridgedata_raw", and all data will be processed. If --depth=3, then
--input_path should be "bridgedata_raw/rss/toykitchen2", and only data
under "toykitchen2" will be processed.

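The same directory structure will be replicated under --output_path. For
example, in the second case, the output will be written to
"{output_path}/set_table/00/...".
NOTE: the code in this file does not currently read --output_path (the flag
is still required on the command line); images and the JSON index are written
to hard-coded locations set in process_dc() and main().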

Squashes images to --im_size x --im_size pixels (128x128 by default).

Can write directly to Google Cloud Storage, but not read from it.

Written by Kevin Black (kvablack@berkeley.edu).
"""
import copy
import glob
import os
import pickle
import random
from collections import defaultdict
from datetime import datetime
from functools import partial
from multiprocessing import Pool
import json

import numpy as np
import tensorflow as tf
import tqdm
from absl import app, flags, logging
from PIL import Image
import pprint

FLAGS = flags.FLAGS

# The five upper-case flags below double as mutable run-wide counters: the
# processing code increments them in place (e.g. ``FLAGS.NO_TRAJ += 1``) and
# main() prints them as a summary at the end of the run.
flags.DEFINE_integer(
    "EPISODE_CNT",
    0,
    "Running episode counter; also the index given to the next episode written",
    required=False,
)
flags.DEFINE_integer(
    "NO_INSTRUCTION",
    0,
    "Running count of trajectories without a language instruction",
    required=False,
)
flags.DEFINE_integer(
    "NO_IMAGE",
    0,
    "Running count of trajectories skipped because images were missing",
    required=False,
)
flags.DEFINE_integer(
    "NO_TRAJ",
    0,
    "Running count of dated directories skipped because no trajectories were found",
    required=False,
)
flags.DEFINE_integer(
    "ASSERT",
    0,
    "Running count of trajectories skipped because a consistency assertion failed",
    required=False,
)
flags.DEFINE_string("input_path", None, "Input path", required=True)
flags.DEFINE_string("output_path", None, "Output path", required=True)
flags.DEFINE_integer(
    "depth",
    5,
    # The trailing space keeps the two concatenated literals from running
    # together in the --help output.
    "Number of directories deep to traverse to the dated directory. Looks for "
    "{input_path}/dir_1/dir_2/.../dir_{depth-1}/2022-01-01_00-00-00/...",
)
flags.DEFINE_bool("overwrite", False, "Overwrite existing files")
flags.DEFINE_float(
    "train_proportion", 0.9, "Proportion of data to use for training (rather than val)"
)
flags.DEFINE_integer("num_workers", 8, "Number of threads to use")
flags.DEFINE_integer("im_size", 128, "Image size")

def squash(path):
    """Load the image at ``path`` and resize it to a square of side ``FLAGS.im_size``.

    The aspect ratio is not preserved (the original note on this function
    says the raw frames are 480x640).

    Args:
        path: Path of the image file to read.

    Returns:
        The resized image as a ``uint8`` numpy array.
    """
    # Open as a context manager so the underlying file handle is released
    # as soon as the pixels have been resampled, instead of whenever the
    # Image object happens to be garbage collected.
    with Image.open(path) as im:
        resized = im.resize((FLAGS.im_size, FLAGS.im_size), Image.Resampling.LANCZOS)
    return np.asarray(resized).astype(np.uint8)

def process_images(path):
    """Load and resize every (non-depth) camera stream of one trajectory.

    Camera directories are the entries of ``path`` whose name contains
    "images" but not "depth"; they are ordered by the integer that follows
    "images" in the name. The number of frames is taken from the first
    camera directory (count of ``im_*.jpg`` files) and frames are read as
    ``im_0.jpg`` ... ``im_{T-1}.jpg`` from every camera.

    Args:
        path: Trajectory directory.

    Returns:
        ``(obs, next_obs)``: two dicts keyed by camera directory name. For
        each camera, ``obs`` holds all frames but the last and ``next_obs``
        all frames but the first. ``(None, None)`` is returned when there is
        no camera directory or the first camera directory has no frames.
    """
    names = sorted(
        (x for x in os.listdir(path) if "images" in x and "depth" not in x),
        key=lambda x: int(x.split("images")[1]),
    )
    if not names:
        return None, None

    # Build the directories from the already-sorted names. Sorting the joined
    # paths with the same split-based key breaks as soon as a parent
    # directory contains the substring "images".
    image_dirs = [os.path.join(path, name) for name in names]

    tlen = len(glob.glob(os.path.join(image_dirs[0], "im_*.jpg")))
    if tlen == 0:
        # No frames at all: report "no images" rather than failing later on
        # a missing key.
        return None, None

    obs, next_obs = {}, {}
    for name, image_dir in zip(names, image_dirs):
        frames = [
            squash(os.path.join(image_dir, "im_{}.jpg".format(t)))
            for t in range(tlen)
        ]
        obs[name] = frames[:-1]
        next_obs[name] = frames[1:]
    return obs, next_obs


def process_state(path):
    """Return the (current, next) ``full_state`` sequences of a trajectory.

    Reads ``obs_dict.pkl`` under ``path`` and returns its ``"full_state"``
    entry twice: once without the last element and once without the first.
    """
    with open(os.path.join(path, "obs_dict.pkl"), "rb") as handle:
        full_state = pickle.load(handle)["full_state"]
    current, following = full_state[:-1], full_state[1:]
    return current, following

def process_time(path):
    """Return the (current, next) ``time_stamp`` sequences of a trajectory.

    Reads ``obs_dict.pkl`` under ``path`` and returns its ``"time_stamp"``
    entry twice: once without the last element and once without the first.
    """
    pkl_path = os.path.join(path, "obs_dict.pkl")
    with open(pkl_path, "rb") as stream:
        obs_dict = pickle.load(stream)
    stamps = obs_dict["time_stamp"]
    return stamps[:-1], stamps[1:]

def process_actions(path):
    """Load the action sequence of a trajectory from ``policy_out.pkl``.

    If the first element of the unpickled sequence is a dict, every element
    is replaced by its ``"actions"`` entry; otherwise the sequence is
    returned as loaded.

    Args:
        path: Trajectory directory containing ``policy_out.pkl``.

    Returns:
        The sequence of actions (possibly empty).
    """
    fp = os.path.join(path, "policy_out.pkl")
    # NOTE(review): pickle.load can execute arbitrary code; only run this on
    # trusted data.
    with open(fp, "rb") as f:
        act_list = pickle.load(f)
    # Guard the peek at element 0 so that an empty action log is returned
    # as-is instead of raising IndexError. len() is used rather than
    # truthiness so array-like containers are handled too.
    if len(act_list) > 0 and isinstance(act_list[0], dict):
        act_list = [x["actions"] for x in act_list]
    return act_list

# processes each data collection attempt
def process_dc(path, train_ratio=0.9):
    """Convert one dated data-collection directory into images and JSON entries.

    For every trajectory found under ``{path}/raw/traj_group*/traj*`` the
    frames of camera ``images0`` are saved as JPEG files in a per-episode
    directory, and one entry (image path, episode index, instruction prompt,
    raw action and raw state) is produced per step. The run-wide counters on
    ``FLAGS`` (``EPISODE_CNT``, ``NO_INSTRUCTION``, ``NO_IMAGE``, ``NO_TRAJ``,
    ``ASSERT``) are updated in place.

    Args:
        path: Dated directory whose base name has the form
            ``%Y-%m-%d_%H-%M-%S``.
        train_ratio: Unused; kept so existing callers keep working.

    Returns:
        A list of entry dicts, or ``None`` when the directory is skipped
        ("lmdb" in the path, a base name that is not a timestamp, or no
        trajectories found).
    """
    # a mystery left by the greats of the past
    if "lmdb" in path:
        logging.warning(f"Skipping {path} because uhhhh lmdb?")
        FLAGS.NO_TRAJ += 1
        return None

    # Data collected prior to 7-23 has a delay of 1, otherwise a delay of 0.
    # normpath/basename tolerates a trailing separator on ``path``.
    dated_name = os.path.basename(os.path.normpath(path))
    try:
        date_time = datetime.strptime(dated_name, "%Y-%m-%d_%H-%M-%S")
    except ValueError:
        # A stray, non-dated entry should not abort the whole conversion.
        logging.warning(f"Skipping {path} because its name is not a timestamp")
        FLAGS.NO_TRAJ += 1
        return None
    latency_shift = date_time < datetime(2021, 7, 23)

    search_path = os.path.join(path, "raw", "traj_group*", "traj*")
    all_traj = glob.glob(search_path)
    if not all_traj:
        logging.info(f"no trajs found in {search_path}")
        FLAGS.NO_TRAJ += 1
        return None

    # TODO(review): the output root is hard-coded; consider deriving it from
    # a flag.
    image_root = "/home/bridge_processed/bridge_imgs_singleview_bridgev2"

    entries = []
    for tp in tqdm.tqdm(all_traj):
        try:
            print(tp)
            print(f'Episode: {FLAGS.EPISODE_CNT}')
            ld = os.listdir(tp)

            # The assertions in this block are used as control flow: a
            # failure is caught below, counted, and the trajectory skipped.
            assert "obs_dict.pkl" in ld, tp + ":" + str(ld)
            assert "policy_out.pkl" in ld, tp + ":" + str(ld)
            obs, next_obs = process_images(tp)
            if obs is None:
                logging.warning(f"Skipping {tp} because no images")
                FLAGS.NO_IMAGE += 1
                continue
            acts = process_actions(tp)
            state, next_state = process_state(tp)
            time_stamp, next_time_stamp = process_time(tp)
            term = [0] * len(acts)

            # empty string is a placeholder for data with no language label;
            # it is also used when lang.txt exists but has no usable line.
            lang = [""]
            if "lang.txt" in ld:
                with open(os.path.join(tp, "lang.txt")) as f:
                    lang = [l.strip() for l in f if "confidence" not in l] or [""]

            obs["state"] = state
            obs["time_stamp"] = time_stamp
            next_obs["state"] = next_state
            next_obs["time_stamp"] = next_time_stamp

            # Turn the dict-of-sequences into a list of per-step dicts. zip
            # stops at the shortest sequence.
            observations = [dict(zip(obs, t)) for t in zip(*obs.values())]
            next_observations = [
                dict(zip(next_obs, t)) for t in zip(*next_obs.values())
            ]
            actions = acts
            terminals = term

            # shift the actions according to camera latency
            if latency_shift:
                observations = observations[1:]
                next_observations = next_observations[1:]
                actions = actions[:-1]
                terminals = terminals[:-1]

            # Not written out, but its length check below rejects
            # trajectories shorter than two steps, so it is kept.
            labeled_rew = list(terminals)
            labeled_rew[-2:] = [1, 1]

            traj_len = len(observations)
            assert len(next_observations) == traj_len
            assert len(actions) == traj_len
            assert len(terminals) == traj_len
            assert len(labeled_rew) == traj_len

            instruction = lang[0]
            if instruction == '':
                FLAGS.NO_INSTRUCTION += 1

            # The episode directory is the same for every step, so create it
            # once instead of re-checking it for each frame.
            episode_dir = f"{image_root}/episode_{FLAGS.EPISODE_CNT}"
            if not os.path.exists(episode_dir):
                os.makedirs(episode_dir, exist_ok=True)
                print(f"Directory '{episode_dir}' was created.")

            for i, (step_obs, action) in enumerate(zip(observations, actions)):
                img_dir_primary = f"{episode_dir}/step_{i}_primary.jpg"
                Image.fromarray(step_obs['images0']).save(img_dir_primary)

                entries.append(
                    {
                        "image": img_dir_primary,
                        "episode": FLAGS.EPISODE_CNT,
                        "conversations": [
                            {
                                "from": "human",
                                "value": f"What action should the robot take to `{instruction}`\n<image>",
                            },
                            {
                                "from": "gpt",
                                # np.asarray accepts both arrays and plain
                                # lists before conversion to JSON-friendly
                                # lists.
                                "raw_actions": np.asarray(action).tolist(),
                                "raw_states": np.asarray(step_obs['state']).tolist(),
                            },
                        ],
                    }
                )
            FLAGS.EPISODE_CNT += 1

        except FileNotFoundError as e:
            FLAGS.NO_IMAGE += 1
            logging.error(e)
        except AssertionError as e:
            FLAGS.ASSERT += 1
            logging.error(e)
    return entries


def make_entries(path, train_proportion):
    """Collect the JSON entries of every dated folder directly under ``path``.

    ``path`` is treated as a directory of dated folders only if at least one
    of its sub-directories contains ``raw/traj_group0/traj0``; otherwise
    nothing is processed.

    Args:
        path: Directory expected to contain dated data-collection folders.
        train_proportion: Forwarded to ``process_dc`` as ``train_ratio``.

    Returns:
        A flat list with the entries of all processed folders (empty when
        ``path`` is not a directory, is empty, or holds no trajectories).
    """
    # The glob in main() can also match plain files.
    if not os.path.isdir(path):
        return []

    # Only sub-directories can be dated folders. Sorting makes the episode
    # numbering independent of the arbitrary os.listdir order.
    dated_folders = sorted(
        name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))
    )

    # Probe every folder rather than whichever one happens to be listed
    # first, so one odd entry cannot cause the whole directory to be skipped.
    has_trajectories = any(
        os.path.isdir(os.path.join(path, name, "raw", "traj_group0", "traj0"))
        for name in dated_folders
    )
    if not has_trajectories:
        return []

    entries = []
    for dated_folder in dated_folders:
        ents = process_dc(
            os.path.join(path, dated_folder), train_ratio=train_proportion
        )
        # process_dc returns None for skipped folders.
        if ents:
            entries.extend(ents)
    return entries


def main(_):
    """Convert everything under ``--input_path`` and write the JSON index.

    Directories ``--depth - 1`` levels below ``--input_path`` are processed
    one after another with ``make_entries`` (``--num_workers`` is not used
    here), the skip counters are printed, and all entries are dumped to a
    hard-coded JSON file.

    Raises:
        ValueError: If ``--depth`` is smaller than 1.
    """
    # Explicit check instead of ``assert`` so validation survives ``python -O``.
    if FLAGS.depth < 1:
        raise ValueError("--depth must be >= 1")

    # TODO(review): the output file is hard-coded; consider deriving it from
    # a flag.
    output_name = 'data/bridge_singleview_bridgev2.json'
    entries = []
    # each path is a directory that contains dated directories
    paths = glob.glob(
        os.path.join(FLAGS.input_path, *(["*"] * (FLAGS.depth - 1)))
    )
    if not paths:
        logging.warning(
            f"No directories matched under {FLAGS.input_path} at depth {FLAGS.depth}"
        )

    for i, path in enumerate(paths):
        print(f'{i} / {len(paths)}')
        ents = make_entries(path, train_proportion=FLAGS.train_proportion)
        entries.extend(ents)
        print("entries length", len(entries))
        if len(entries) == 0:
            print(ents)
    print("entries after", len(entries))
    if not entries:
        logging.warning(f"No entries were produced from {FLAGS.input_path}")
    print(f'Total number of trajs with no instruction: {FLAGS.NO_INSTRUCTION}')
    print(f'Total number of trajs with no image: {FLAGS.NO_IMAGE}')
    print(f'Total number of trajs with no trajs: {FLAGS.NO_TRAJ}')
    print(f'Total number of trajs with assertion errors: {FLAGS.ASSERT}')

    # Make sure the target directory exists so a long run does not fail at
    # the very last step.
    out_dir = os.path.dirname(output_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_name, 'w') as file:
        json.dump(entries, file, indent=4)

# Only start the absl app when executed as a script, so that importing this
# module (e.g. to reuse the helper functions) does not kick off a conversion.
if __name__ == "__main__":
    app.run(main)
