"""
Convert TVQA into tfrecords
"""
import pickle
import sys
import csv

import argparse
import hashlib
import io
import json
import os
import random
import numpy as np
from tempfile import TemporaryDirectory
from copy import deepcopy

from PIL import Image, ImageDraw, ImageFont
import tensorflow as tf
from google.cloud import storage
# from sacremoses import MosesDetokenizer
import regex as re
from tqdm import tqdm
import pandas as pd
from finetune.common_data_utils import *
from collections import defaultdict
import colorsys
import hashlib
import tempfile
import subprocess
from scipy.io import wavfile
from mreserve.preprocess import make_spectrogram, invert_spectrogram
from mreserve.lowercase_encoder import START
import pysrt
from unidecode import unidecode
import ftfy
from dotenv import load_dotenv

load_dotenv('../../.env')


# Start from the shared base parser (create_base_parser comes from
# finetune.common_data_utils) and add the one option specific to this script.
# The default is read from the DATA_DIR environment variable when this module
# is loaded, so DATA_DIR must be set even if -data_dir is passed explicitly.
parser = create_base_parser()
_default_data_dir = os.environ["DATA_DIR"]
parser.add_argument('-data_dir', dest='data_dir', type=str, default=_default_data_dir, help='Image directory.')
"""
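Must set things up like this in the data_dir.
NOTE(review): the listing below is the raw TVQA download layout. The code in this
file instead reads train.csv / test.csv, durations.csv and align/<vid_name>.json
from $DATA_DIR, and frames/<vid_name>/ from -data_dir -- confirm which layout is current.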
drwxr-xr-x 1 rowan rowan    1155072 Aug 19  2018 tvqa_subtitles
drwxr-xr-x 1 rowan rowan       4096 Aug 27  2018 tvqa_qa_release
drwxr-xr-x 1 rowan rowan       4096 Jan 18  2020 tvqa_plus_annotations_with_test
drwxrwxr-x 1 rowan rowan       4096 Sep 14 08:06 tvqa_frames
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.aa
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ab
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ac
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ad
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ae
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.af
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ag
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ah
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ai
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.aj
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ak
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.al
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.am
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.an
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ao
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ap
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.aq
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ar
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.as
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.at
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.au
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.av
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.aw
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ax
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ay
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.az
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.ba
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bb
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bc
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bd
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.be
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bf
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bg
-rw-rw-r-- 1 rowan rowan 4294967296 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bh
-rw-rw-r-- 1 rowan rowan  761207798 Nov 12  2018 tvqa_video_frames_fps3_hq.tar.gz.bi
-rw-rw-r-- 1 rowan rowan       2450 Nov 25  2018 tvqa_video_frames_fps3_hq.checksum.txt
-rw-r--r-- 1 rowan rowan    4061313 Apr 22  2019 tvqa_plus_val.json
-rw-r--r-- 1 rowan rowan   31270388 Apr 22  2019 tvqa_plus_train.json
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.aa
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.ab
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.ac
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.ad
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.ae
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.af
-rw-rw-r-- 1 rowan rowan 4294967296 Apr 29  2019 tvqa_audios.tar.gz.ag
-rw-rw-r-- 1 rowan rowan 2750112255 Apr 29  2019 tvqa_audios.tar.gz.ah
-rw-rw-r-- 1 rowan rowan        448 Apr 29  2019 tvqa_audios.checksum.txt
-rw-rw-r-- 1 rowan rowan   15495443 Jul 23  2019 tvqa_subtitles.tar.gz
-rw-rw-r-- 1 rowan rowan   14474003 Jul 23  2019 tvqa_qa_release.tar.gz
-rw-rw-r-- 1 rowan rowan    6718915 Jul 23  2019 tvqa_plus_annotations.tar.gz
-rw-rw-r-- 1 rowan rowan       6899 Nov 11  2019 tvqa_dl_instructions.txt
-rw-rw-r-- 1 rowan rowan   47577821 Nov 11  2019 subs.pkl
-rw-rw-r-- 1 rowan rowan    7323926 Jan 19  2020 tvqa_plus_annotations_preproc_with_test.tar.gz
"""

# Parse the command line once at module load; the rest of the script reads
# `args` (split, fold, num_folds, seed, data_dir) as a module-level global.
# NOTE(review): split/fold/num_folds/seed are presumably declared by
# create_base_parser -- they are not added in this file.
args = parser.parse_args()
# args.seed = 1337
# Seed Python's `random` module so any use of it is reproducible per run.
random.seed(args.seed)

def load_pickle(pickle_file):
    """Load and return the object stored in ``pickle_file``.

    A ``UnicodeDecodeError`` on the first attempt triggers one retry with
    ``encoding='latin1'``. Any other exception from the first attempt is
    reported on stdout and re-raised unchanged.

    NOTE: unpickling executes code embedded in the file; only use this on
    trusted data.
    """
    def _read(**load_kwargs):
        # Open the file afresh for each attempt so the retry starts at byte 0.
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle, **load_kwargs)

    try:
        return _read()
    except UnicodeDecodeError:
        return _read(encoding='latin1')
    except Exception as err:
        print('Unable to load data ', pickle_file, ':', err)
        raise

# Face/body bounding-box features are enabled by setting FBBOX_PATH to a
# non-empty value; an unset variable is treated the same as an empty one.
using_face_bbox = os.environ.get("FBBOX_PATH", "") != ""

# Output shard name: <DATA_DIR>/finetune/<split><fold>of<num_folds>.tfrecord
out_fn = os.path.join(os.environ["DATA_DIR"], 'finetune', '{}{:03d}of{:03d}.tfrecord'.format(args.split, args.fold, args.num_folds))

# NOTE(review): 'val' and 'test' both map to test.csv -- confirm this is intended.
split_fn = {
    'train': 'train.csv',
    'val': 'test.csv',
    'test': 'test.csv',
}[args.split]
split_fn = os.path.join(os.environ["DATA_DIR"], split_fn)
duration_fn = os.path.join(os.environ["DATA_DIR"], 'durations.csv')

# Map each audio file name, minus its last four characters (presumably the
# extension, e.g. ".mp3"), to its duration.
durations = pd.read_csv(duration_fn)
audio_durations = {
    filename[:-4]: duration
    for filename, duration in zip(durations['filename'], durations['duration'])
}

# Collect the rows belonging to this fold (row position modulo num_folds).
# The frame is deliberately NOT bound to the name `csv`: doing so would shadow
# the stdlib `csv` module that read_bbox() relies on.
data = []
split_df = pd.read_csv(split_fn)
split_df = split_df.reset_index()
for idx, row in split_df.iterrows():
    if idx % args.num_folds != args.fold:
        continue
    item = {}
    duration = audio_durations[str(row['idx'])]
    # Annotated interval is the final 29 time units of the clip (or the whole
    # clip when it is shorter than that).
    item['ts'] = (max(0, duration - 29), duration)
    item['vid_name'] = str(row['idx'])
    item['label'] = row['label']
    data.append(item)

# Summary statistics over this fold's items (both raise on an empty fold).
ts_lens = [x['ts'][1] - x['ts'][0] for x in data]
max_end = max([x['ts'][1] for x in data])

def generate_mask_w_bb(image, box, fbbox):
    """Rasterise body boxes and face boxes into two mask images.

    :param image: PIL image the boxes refer to.
    :param box: sequence of body boxes, each passed directly to
        ``ImageDraw.rectangle`` and ``Image.crop``.
    :param fbbox: sequence of face boxes, paired with ``box`` by position.
        Each is (left, top, right, bottom) and is rescaled from a 224x224
        frame to the size of the cropped body box.
        NOTE(review): presumably faces were detected on 224x224 resized
        crops -- confirm against the detector.
    :return: ``(bmask, fmask, final_bbox, final_fbbox)`` -- two images with
        the same array shape as ``image`` (zeros, with the boxes filled
        "#ffffff"), plus the body boxes and the face boxes in full-image
        coordinates as numpy arrays.
    """
    pixels = np.asarray(image)
    bmask = Image.fromarray(np.zeros_like(pixels))
    fmask = Image.fromarray(np.zeros_like(pixels))
    final_bbox, final_fbbox = [], []

    if len(box) == 0:
        return bmask, fmask, final_bbox, final_fbbox

    body_draw = ImageDraw.Draw(bmask)
    face_draw = ImageDraw.Draw(fmask)
    ref_w, ref_h = 224, 224

    for body_box, face_box in zip(box, fbbox):
        body_draw.rectangle(body_box, fill="#ffffff")
        final_bbox.append(np.array(body_box))

        # Size of the body crop determines how far the face box is stretched.
        crop_h, crop_w = np.array(image.crop(body_box)).shape[:2]
        x_scale = crop_w / ref_w
        y_scale = crop_h / ref_h

        left, top, right, bottom = face_box
        dx0 = int(np.round(left * x_scale))
        dy0 = int(np.round(top * y_scale))
        dx1 = int(np.round(right * x_scale))
        dy1 = int(np.round(bottom * y_scale))

        # Shift from crop-relative to full-image coordinates.
        face_in_image = [body_box[0] + dx0, body_box[1] + dy0,
                         body_box[0] + dx1, body_box[1] + dy1]

        face_draw.rectangle(face_in_image, fill="#ffffff")
        final_fbbox.append(np.array(face_in_image))

    return bmask, fmask, final_bbox, final_fbbox

def read_bbox(filename):
    """Read a bounding-box CSV into a dict keyed by frame file name.

    The first row is treated as a header and skipped. For every other row,
    column 0 is the frame id, and columns 1 and 2 are JSON-encoded body and
    face boxes respectively.

    :param filename: path of the CSV-formatted file to read.
    :return: ``{'<frame id>.jpg': {'box': ..., 'fbox': ...}}``.
    :raises AssertionError: if the same frame id appears twice.
    """
    # Imported locally under an alias: module-level code in this file may
    # rebind the global name `csv`, which would break `csv.reader` here.
    import csv as csv_module

    im_face_dict = {}
    # newline='' is what the csv module requires for correct handling of
    # quoted fields that contain line breaks.
    with open(filename, mode='r', newline='') as file:
        reader = csv_module.reader(file)
        next(reader, None)  # skip the header row
        for fields in reader:
            if not fields:
                # Blank line: nothing to index.
                continue
            key = f'{fields[0]}.jpg'
            assert key not in im_face_dict
            im_face_dict[key] = {'box': json.loads(fields[1]), 'fbox': json.loads(fields[2])}
    return im_face_dict

def _load_waveform(audio_fn_mp3, sample_rate=22620, timeout=15.0):
    """Decode ``audio_fn_mp3`` to a mono float32 waveform using ffmpeg.

    The mp3 is converted to a WAV file inside a temporary directory, which is
    removed before returning -- including when an error is raised.

    :param audio_fn_mp3: path of the mp3 file to decode.
    :param sample_rate: value passed to ffmpeg's ``-ar`` flag.
    :param timeout: seconds to wait for ffmpeg before giving up.
    :return: ``(sr, waveform)``; the waveform is divided by max(peak, 1.0).
    :raises ValueError: if ffmpeg does not finish within ``timeout``.
    :raises FileNotFoundError: if ffmpeg exits without producing the WAV file.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_fn = os.path.join(temp_dir, 'audio.wav')
        ffmpeg_process = subprocess.Popen(
            ['ffmpeg', '-y', '-i', audio_fn_mp3, '-ac', '1', '-ar', str(sample_rate), audio_fn],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        try:
            stdout, stderr = ffmpeg_process.communicate(None, timeout=timeout)
        except subprocess.TimeoutExpired:
            ffmpeg_process.kill()
            # Reap the killed child and drain its pipes, as the subprocess
            # documentation recommends after a timeout.
            ffmpeg_process.communicate()
            raise ValueError("couldnt convert in time")
        except BaseException:  # e.g. KeyboardInterrupt: don't leave ffmpeg running
            ffmpeg_process.kill()
            raise
        if not os.path.exists(audio_fn):
            raise FileNotFoundError(
                f"ffmpeg did not produce {audio_fn} from {audio_fn_mp3}: {stderr}")
        # mmap=False reads the samples into memory, so the file can be deleted.
        sr, waveform = wavfile.read(audio_fn, mmap=False)

    waveform = waveform.astype('float32')
    waveform /= max(np.abs(waveform).max(), 1.0)
    return sr, waveform


def parse_item(item):
    """Turn one split row into frames, spectrograms and per-segment subtitles.

    Up to seven consecutive segments of ``segment_size`` seconds are laid out
    around the midpoint of the item's ``ts`` interval. For each segment whose
    middle frame exists on disk, the frame is loaded, the matching audio slice
    is converted to a spectrogram, and the aligned words whose midpoint falls
    in the segment are joined into its subtitle. All per-segment lists are
    then padded to length 7 by repeating the last entry.

    :param item: dict with ``vid_name``, ``label`` and ``ts``. NOTE: ``ts`` is
        popped, so ``item`` is mutated.
    :return: when ``using_face_bbox`` is false,
        ``(qa_item, frames, spectrograms, times_used)``; otherwise
        ``(qa_item, frames, frames_bmasks, frames_fmasks, bboxes, fbboxes,
        spectrograms, times_used)``.
    :raises ValueError: if no frame could be loaded, or ffmpeg times out.
    :raises FileNotFoundError: if ffmpeg produces no output file.
    """
    name = item['vid_name']
    frames_path = os.path.join(args.data_dir, 'frames', name)

    # if using face bounding boxes
    if using_face_bbox:
        frames_bbox_path = os.path.join(os.environ["FBBOX_PATH"], f'3fps_{name}_trimmed-out_bbox_w_faces.json')
        im_face_dict = read_bbox(frames_bbox_path)

    # Frame files end in "_<number>.<ext>"; the largest number bounds the clip.
    max_frame_no = max(int(x.split('_')[-1].split('.')[0]) for x in os.listdir(frames_path))
    max_time = (max_frame_no - 1) / 3.0

    ts0, ts1 = item.pop('ts')
    ts0 = max(ts0, 0)
    ts1 = min(ts1, max_time)
    segment_size = 4.6666667 # this differs a tiny bit from pretraining. basically i'm using denser frames here
                             # to avoid needing to cut off any audio

    # Midpoint will be the middle of the (middle) chunk, so round it to the nearest 1/3rd
    # because that's when frames were extracted
    midpoint = (ts0 + ts1) / 2.0
    midpoint = round(midpoint * 3) / 3

    t_start = midpoint - segment_size * 0.5
    t_end = midpoint + segment_size * 0.5

    # Grow outwards from the middle segment, alternating before/after, until
    # 7 segments are collected; segments entirely outside the clip are skipped.
    times_used0 = [{'start_time': t_start, 'end_time': t_end}]
    for i in range(6):
        for delta in [-segment_size, segment_size]:
            t0 = t_start + delta * (i+1)
            t1 = t_end + delta * (i+1)

            t0 = round(t0 * 3) / 3
            t1 = round(t1 * 3) / 3

            if t1 < 0:
                continue
            if t0 > max_time:
                continue
            if len(times_used0) < 7:
                times_used0.append({'start_time': t0, 'end_time': t1})
    times_used0 = sorted(times_used0, key=lambda x: x['start_time'])

    ###
    frames = []
    frames_bmasks = []
    frames_fmasks = []
    bboxes = []
    fbboxes = []
    times_used = []
    for trow in times_used0:
        # 1-based index of the frame at the segment midpoint, clamped to the clip.
        t_midframe = (trow['start_time'] + trow['end_time']) / 2.0
        t_mid_3ps_idx = int(round(t_midframe * 3.0)) + 1
        t_mid_3ps_idx = max(t_mid_3ps_idx, 1)
        t_mid_3ps_idx = min(t_mid_3ps_idx, max_frame_no)

        fn = os.path.join(frames_path, item['vid_name'] + f'_{t_mid_3ps_idx:03d}.jpg')

        if not os.path.exists(fn):
            print(f"{fn} doesn't exist")
            continue

        image = Image.open(fn)
        frames.append(image)
        times_used.append(trow)

        if using_face_bbox:
            face_key = f'{t_mid_3ps_idx:05d}.jpg'
            if face_key in im_face_dict:
                bbox = im_face_dict[face_key]['box']
                fbbox = im_face_dict[face_key]['fbox']
            else:
                bbox = []
                fbbox = []

            bmask, fmask, final_bbox, final_fbbox = generate_mask_w_bb(image, bbox, fbbox)
            # NOTE(review): the resized image is discarded -- `frames` holds the
            # un-resized frame while the masks ARE resized. Confirm intent.
            image = resize_image(image, shorter_size_trg=450, longer_size_max=800)
            bmask = resize_image(bmask, shorter_size_trg=450, longer_size_max=800)
            fmask = resize_image(fmask, shorter_size_trg=450, longer_size_max=800)
            frames_bmasks.append(bmask)
            frames_fmasks.append(fmask)
            bboxes.append(final_bbox)
            fbboxes.append(final_fbbox)

    if not times_used:
        raise ValueError(f"no usable frames found for {name} in {frames_path}")

    audio_fn_mp3 = os.path.join(os.environ["MP3_PATH"], item['vid_name'] + ".mp3")

    # Before we were sampling at 22050, and we had 188 mel windows for 5 sec.
    # now we want exactly 180 windows from 4.6667 sec.
    # 4.66667 * sr / 180 = 5 * 22050 / 188
    sr, waveform = _load_waveform(audio_fn_mp3, sample_rate=22620, timeout=15.0)

    # Pad to max time just in case
    desired_final_frame = int(sr * max([t['end_time'] for t in times_used]))
    if waveform.size < desired_final_frame:
        waveform = np.concatenate([waveform, np.zeros(desired_final_frame - waveform.size, dtype=np.float32)], 0)

    # Process each segment. here i'm always using a playback_speed of 1 (aka no fast forwarding).
    spectrograms = []
    for ts_group in times_used:
        start_idx = int(sr * ts_group['start_time'])
        end_idx = int(sr * ts_group['end_time'])

        if start_idx < 0:
            # i have to add 1 here because casting to int floors "up" rather than "down" if start time is negative.
            wav_ts = np.concatenate([np.zeros(1-start_idx, dtype=np.float32), waveform[:end_idx]], 0)
        else:
            wav_ts = waveform[start_idx:end_idx]
        # sr=22050 is passed on purpose even though the audio was resampled to
        # 22620 (see the window-count comment above).
        spectrograms.append(make_spectrogram(wav_ts, playback_speed=1, sr=22050, pad_size=0))

    # Get subtitles
    #############################################################
    for ts in times_used:
        ts['sub'] = []

    bounds = np.array([x['start_time'] for x in times_used] + [times_used[-1]['end_time']])
    aligned = {"word": [], "start": [], "end": []}
    ts_filename = os.path.join(os.environ['DATA_DIR'], 'align', item['vid_name'] + '.json')
    with open(ts_filename, 'r') as f:
        language_data = json.load(f)

    # Words whose alignment did not succeed get None timings for now.
    for w in language_data['words']:
        aligned['word'].append(w['word'])
        aligned['start'].append(w['start'] if w['case'] == 'success' else None)
        aligned['end'].append(w['end'] if w['case'] == 'success' else None)

    # For every word: index of the nearest successfully-aligned word at or
    # before it (0 when there is none yet).
    prev_success_offsets = []
    for i, start in enumerate(aligned['start']):
        if start is not None:
            prev_success_offsets.append(i)
        elif len(prev_success_offsets) == 0:
            prev_success_offsets.append(0)
        else:
            prev_success_offsets.append(prev_success_offsets[-1])

    # Same, but for the nearest successfully-aligned word at or after it
    # (-1 when there is none).
    prev_success_offsets_suffix = []
    for j in range(len(aligned['word']) - 1, -1, -1):
        if aligned['start'][j] is not None:
            prev_success_offsets_suffix.append(j)
        elif len(prev_success_offsets_suffix) == 0:
            prev_success_offsets_suffix.append(-1)
        else:
            prev_success_offsets_suffix.append(prev_success_offsets_suffix[-1])
    prev_success_offsets_suffix = prev_success_offsets_suffix[::-1]

    # Fill in missing timings by interpolating on character offsets between
    # the surrounding aligned words.
    for i in range(len(aligned['word'])):
        if aligned['start'][i] is None:
            try:
                prev_succ = prev_success_offsets[i]
                next_succ = prev_success_offsets_suffix[i]
                off_diff = language_data['words'][next_succ]['startOffset'] - language_data['words'][prev_succ]['endOffset']
                ts_diff = aligned['end'][next_succ] - aligned['start'][prev_succ]
                time_per_offset = ts_diff / off_diff
                start_offset = language_data['words'][i]['startOffset']
                end_offset = language_data['words'][i]['endOffset']
                start_offset_ts = aligned['end'][prev_succ] + time_per_offset * (start_offset - language_data['words'][prev_succ]['endOffset'])
                end_offset_ts = aligned['end'][prev_succ] + time_per_offset * (end_offset - language_data['words'][prev_succ]['endOffset'])
                aligned['start'][i] = start_offset_ts
                aligned['end'][i] = end_offset_ts
            except Exception:
                # Deliberate best-effort: when interpolation is impossible
                # (e.g. a neighbour has no timing, or a zero offset span),
                # collapse the word onto the end of the previous one.
                if i == 0:
                    aligned['start'][i] = 0
                    aligned['end'][i] = 0
                else:
                    aligned['start'][i] = aligned['end'][i-1]
                    aligned['end'][i] = aligned['end'][i-1]

    # Assign every word to the segment containing its temporal midpoint.
    for word, start_time, end_time in zip(aligned['word'], aligned['start'], aligned['end']):
        mid_time = (start_time + end_time) / 2.0
        pos = np.searchsorted(bounds, mid_time)
        if (pos > 0) and (pos <= len(times_used)):
            times_used[pos-1]['sub'].append(word)

    for ts in times_used:
        ts['sub'] = ' '.join(ts['sub'])
        ts['sub'] = unidecode(ftfy.ftfy(ts['sub'])).replace('\n', ' ')

    # Figure out the relative position of the annotation
    # NOTE(review): the end comes from times_used0 (all planned segments) while
    # the start comes from times_used (segments with a frame) -- confirm intent.
    my_duration = times_used0[-1]['end_time'] - times_used[0]['start_time']
    rel_localized_tstart = (ts0 - times_used[0]['start_time']) / my_duration
    rel_localized_tend = (ts1 - times_used[0]['start_time']) / my_duration
    qa_item = {'id': '{}'.format(item['vid_name'])}
    qa_item['rel_localization'] = (rel_localized_tstart, rel_localized_tend)

    qa_item['num_frames'] = len(frames)
    # Scale factor mapping the 99th-percentile spectrogram value to 255 (the
    # writer multiplies by it before casting to uint8).
    qa_item['magic_number'] = 255.0 / max(np.percentile(np.stack(spectrograms).reshape(-1, 65), 99), 1.0)
    qa_item['_mp3_fn'] = audio_fn_mp3
    qa_item['_frames_path'] = frames_path
    qa_item['_time_interval'] = [ts0, ts1]
    qa_item['label'] = item['label']

    # Pad to 7
    for _ in range(7 - len(frames)):
        frames.append(frames[-1])
        spectrograms.append(spectrograms[-1])
        if using_face_bbox:
            frames_bmasks.append(frames_bmasks[-1])
            frames_fmasks.append(frames_fmasks[-1])
            bboxes.append(bboxes[-1])
            fbboxes.append(fbboxes[-1])
        times_used.append({'start_time': -1, 'end_time': -1, 'sub': ''})

    if using_face_bbox:
        return qa_item, frames, frames_bmasks, frames_fmasks, bboxes, fbboxes, spectrograms, times_used
    else:
        return qa_item, frames, spectrograms, times_used

# Serialise every item of this fold into one tf.train.Example and write it to
# the output shard.
num_written = 0
# NOTE(review): max_len is never updated, so the final message always reports
# 0 -- confirm what it was meant to track.
max_len = 0
with GCSTFRecordWriter(out_fn, auto_close=False) as tfrecord_writer:
    for item in data:
        if using_face_bbox:
            qa_item, frames, frames_bmasks, frames_fmasks, bboxes, fbboxes, specs, subs = parse_item(item)
        else:
            qa_item, frames, specs, subs = parse_item(item)

        feature_dict = {
            'id': bytes_feature(qa_item['id'].encode('utf-8')),
            'magic_number': float_list_feature([qa_item['magic_number']]),
            'num_frames': int64_feature(qa_item['num_frames']),
            'label': int64_feature(qa_item['label']),
        }

        # One group of "cNN/..." features per chunk. parse_item pads every
        # per-chunk list to the same length, so the mask/box lists can be
        # indexed with the same position as frames/specs/subs.
        for i, (frame_i, spec_i, subs_i) in enumerate(zip(frames, specs, subs)):
            prefix = f'c{i:02d}'
            feature_dict[f'{prefix}/image_encoded'] = bytes_feature(pil_image_to_jpgstring(frame_i))

            if using_face_bbox:
                feature_dict[f'{prefix}/bmasks_encoded'] = bytes_feature(pil_image_to_jpgstring(frames_bmasks[i]))
                feature_dict[f'{prefix}/fmasks_encoded'] = bytes_feature(pil_image_to_jpgstring(frames_fmasks[i]))

                # Flatten the list of boxes into one flat list of coordinates.
                bbox_1d = [x for val in bboxes[i] for x in val]
                fbbox_1d = [x for val in fbboxes[i] for x in val]
                feature_dict[f'{prefix}/bbox'] = float_list_feature(bbox_1d)
                feature_dict[f'{prefix}/fbbox'] = float_list_feature(fbbox_1d)

            # Quantise the spectrogram to uint8 and store it as a JPEG.
            compressed = np.minimum(spec_i.reshape(-1, 65) * qa_item['magic_number'], 255.0).astype(np.uint8)
            assert compressed.shape == (180, 65)
            feature_dict[f'{prefix}/spec_encoded'] = bytes_feature(pil_image_to_jpgstring(Image.fromarray(compressed)))

            feature_dict[f'{prefix}/sub'] = int64_list_feature(encoder.encode(subs_i['sub']).ids)

        #
        # # Debug image
        # os.makedirs('debug', exist_ok=True)
        # for i in range(7):
        #     with open(f'debug/ex{num_written}_img{i}.jpg', 'wb') as f:
        #         f.write(feature_dict[f'c{i:02d}/image_encoded'].bytes_list.value[0])
        #
        #     jpgstr = feature_dict[f'c{i:02d}/spec_encoded'].bytes_list.value[0]
        #     inv = Image.open(io.BytesIO(jpgstr))
        #     inv_np = np.asarray(inv).astype(np.float32) / qa_item['magic_number']
        #     inv_np = inv_np[:, :64].reshape(3, 60, 64) # remove playback speed feature
        #     for ii, spec_ii in enumerate(inv_np):
        #         y = invert_spectrogram(spec_ii)
        #         wavfile.write(f'debug/ex{num_written}_audio{i}_{ii}.wav', rate=22050, data=y)
        #
        # # Get the ground truth
        # mp3_orig = qa_item['_mp3_fn']
        # print("time interval {}".format(qa_item['_time_interval']), flush=True)
        # os.system(f'cp {mp3_orig} debug/ex{num_written}_audio_raw.mp3')
        # frames_path = qa_item['_frames_path']
        # os.system(f'cp -r {frames_path} debug/ex{num_written}_frames')
        # # assert False

        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
        tfrecord_writer.write(example.SerializeToString())
        num_written += 1
        if num_written % 100 == 0:
            print("Have written {} / {}".format(num_written, len(data)), flush=True)
    # The writer was opened with auto_close=False, so close it explicitly.
    tfrecord_writer.close()

print(f'Finished writing {num_written} questions; max len = {max_len}', flush=True)