# -*- coding: utf-8 -*-
"""
Example code for turning the trace of one instance of COCO Localized Narratives 
val split to bounding boxes and visualizing the result.

This script is self-contained and does not require downloading the image file.
It will generate plots on a blank white background.
"""
import json
import collections
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os
from rdp import rdp
import json

# Provided annotation data for COCO val split image 322944
# COCO_VAL_JSON_DICT = {"dataset_id": "mscoco_train2017", "image_id": "208612", "annotator_id": 61, "caption": "In the middle of the image a woman is sitting on a chair and holding a mobile phone. Behind her there is a fencing.", "timed_caption": [{"utterance": "In", "start_time": 0.0, "end_time": 0.0}, {"utterance": "the", "start_time": 0.0, "end_time": 0.8}, {"utterance": "middle", "start_time": 0.8, "end_time": 1.1}, {"utterance": "of", "start_time": 1.1, "end_time": 1.2}, {"utterance": "the", "start_time": 1.2, "end_time": 1.3}, {"utterance": "image", "start_time": 1.3, "end_time": 1.8}, {"utterance": "a", "start_time": 1.8, "end_time": 1.8}, {"utterance": "woman", "start_time": 1.8, "end_time": 2.2}, {"utterance": "is", "start_time": 2.2, "end_time": 2.2}, {"utterance": "sitting", "start_time": 2.2, "end_time": 2.7}, {"utterance": "on", "start_time": 2.7, "end_time": 2.9}, {"utterance": "a", "start_time": 2.9, "end_time": 3.0}, {"utterance": "chair", "start_time": 3.0, "end_time": 3.1}, {"utterance": "and", "start_time": 3.1, "end_time": 3.4}, {"utterance": "holding", "start_time": 3.4, "end_time": 4.2}, {"utterance": "a", "start_time": 4.2, "end_time": 4.3}, {"utterance": "mobile", "start_time": 4.3, "end_time": 5.0}, {"utterance": "phone.", "start_time": 5.0, "end_time": 5.3}, {"utterance": "Behind", "start_time": 5.3, "end_time": 6.1}, {"utterance": "her", "start_time": 6.1, "end_time": 6.4}, {"utterance": "there", "start_time": 6.4, "end_time": 6.7}, {"utterance": "is", "start_time": 6.7, "end_time": 6.8}, {"utterance": "a", "start_time": 6.8, "end_time": 6.9}, {"utterance": "fencing.", "start_time": 6.9, "end_time": 7.6}], "traces": [[{"x": -0.0217, "y": -0.0682, "t": 0.08}, {"x": 0.0031, "y": -0.0324, "t": 0.096}, {"x": 0.031, "y": 0.0089, "t": 0.112}, {"x": 0.0578, "y": 0.0461, "t": 0.129}, {"x": 0.0806, "y": 0.0847, "t": 0.146}, {"x": 0.1012, "y": 0.1233, "t": 0.163}, {"x": 0.1209, "y": 0.1591, "t": 0.18}, {"x": 0.1415, "y": 0.199, "t": 0.196}, {"x": 0.1632, "y": 0.239, "t": 0.213}, {"x": 0.1849, "y": 0.2858, "t": 0.229}, {"x": 0.2056, "y": 0.334, "t": 0.245}, {"x": 0.2366, "y": 0.4043, "t": 0.263}, {"x": 0.2593, "y": 0.4484, "t": 0.279}, {"x": 0.282, "y": 0.4828, "t": 0.295}, {"x": 0.3048, "y": 0.5158, "t": 0.312}, {"x": 0.3254, "y": 0.5365, "t": 0.329}, {"x": 0.3399, "y": 0.5489, "t": 0.346}, {"x": 0.3512, "y": 0.553, "t": 0.363}, {"x": 0.3647, "y": 0.553, "t": 0.386}, {"x": 0.3709, "y": 0.5503, "t": 0.395}, {"x": 0.3812, "y": 0.5434, "t": 0.412}, {"x": 0.3905, "y": 0.531, "t": 0.429}, {"x": 0.3998, "y": 0.5131, "t": 0.446}, {"x": 0.4153, "y": 0.4745, "t": 0.464}, {"x": 0.4267, "y": 0.4497, "t": 0.481}, {"x": 0.437, "y": 0.4249, "t": 0.496}, {"x": 0.4473, "y": 0.4029, "t": 0.513}, {"x": 0.4566, "y": 0.3822, "t": 0.529}, {"x": 0.4659, "y": 0.3602, "t": 0.546}, {"x": 0.4742, "y": 0.3368, "t": 0.563}, {"x": 0.4824, "y": 0.312, "t": 0.579}, {"x": 0.4897, "y": 0.2858, "t": 0.596}, {"x": 0.4969, "y": 0.2638, "t": 0.613}, {"x": 0.5031, "y": 0.2445, "t": 0.63}, {"x": 0.5083, "y": 0.2224, "t": 0.647}, {"x": 0.5134, "y": 0.199, "t": 0.663}, {"x": 0.5165, "y": 0.1852, "t": 0.679}, {"x": 0.5176, "y": 0.1742, "t": 0.697}, {"x": 0.5186, "y": 0.1632, "t": 0.714}, {"x": 0.5196, "y": 0.1522, "t": 0.729}, {"x": 0.5207, "y": 0.1412, "t": 0.747}, {"x": 0.5217, "y": 0.1329, "t": 0.763}, {"x": 0.5217, "y": 0.1274, "t": 0.779}, {"x": 0.5217, "y": 0.1205, "t": 0.797}, {"x": 0.5217, "y": 0.1164, "t": 0.814}, {"x": 0.5217, "y": 0.1122, "t": 0.83}, {"x": 0.5217, "y": 0.1081, "t": 0.846}, {"x": 0.5207, "y": 0.1026, "t": 0.865}, {"x": 0.5196, "y": 0.0985, "t": 0.88}, {"x": 0.5186, "y": 0.0971, "t": 0.896}, {"x": 0.5165, "y": 0.093, "t": 0.914}, {"x": 0.5155, "y": 0.093, "t": 0.931}, {"x": 0.5134, "y": 0.0916, "t": 0.947}, {"x": 0.5124, "y": 0.0902, "t": 0.964}, {"x": 0.5083, "y": 0.0902, "t": 0.98}, {"x": 0.5031, "y": 0.0902, "t": 0.997}, {"x": 0.499, "y": 0.0902, "t": 1.013}, {"x": 0.4948, "y": 0.0902, "t": 1.03}, {"x": 0.4886, "y": 0.0902, "t": 1.047}, {"x": 0.4824, "y": 0.0943, "t": 1.063}, {"x": 0.4773, "y": 0.0957, "t": 1.079}, {"x": 0.4732, "y": 0.1012, "t": 1.097}, {"x": 0.4669, "y": 0.1054, "t": 1.113}, {"x": 0.4607, "y": 0.1122, "t": 1.13}, {"x": 0.4556, "y": 0.1191, "t": 1.147}, {"x": 0.4484, "y": 0.1246, "t": 1.163}, {"x": 0.4442, "y": 0.1301, "t": 1.18}, {"x": 0.4411, "y": 0.137, "t": 1.198}, {"x": 0.438, "y": 0.1439, "t": 1.212}, {"x": 0.4339, "y": 0.1522, "t": 1.229}, {"x": 0.4298, "y": 0.1632, "t": 1.251}, {"x": 0.4267, "y": 0.1729, "t": 1.263}, {"x": 0.4215, "y": 0.1825, "t": 1.281}, {"x": 0.4184, "y": 0.1935, "t": 1.296}, {"x": 0.4112, "y": 0.2059, "t": 1.315}, {"x": 0.405, "y": 0.2197, "t": 1.331}, {"x": 0.3843, "y": 0.2555, "t": 1.363}, {"x": 0.3709, "y": 0.2748, "t": 1.38}, {"x": 0.3616, "y": 0.2941, "t": 1.397}, {"x": 0.3533, "y": 0.3106, "t": 1.413}, {"x": 0.344, "y": 0.3299, "t": 1.43}, {"x": 0.3296, "y": 0.3574, "t": 1.446}, {"x": 0.3223, "y": 0.3753, "t": 1.464}, {"x": 0.3151, "y": 0.3932, "t": 1.48}, {"x": 0.3058, "y": 0.4139, "t": 1.496}, {"x": 0.2965, "y": 0.436, "t": 1.514}, {"x": 0.2882, "y": 0.4552, "t": 1.53}, {"x": 0.2831, "y": 0.4704, "t": 1.546}, {"x": 0.2779, "y": 0.4883, "t": 1.564}, {"x": 0.2758, "y": 0.5076, "t": 1.58}, {"x": 0.2738, "y": 0.5282, "t": 1.597}, {"x": 0.2727, "y": 0.5475, "t": 1.614}, {"x": 0.2727, "y": 0.5737, "t": 1.631}, {"x": 0.2738, "y": 0.5875, "t": 1.647}, {"x": 0.2769, "y": 0.6012, "t": 1.663}, {"x": 0.28, "y": 0.6123, "t": 1.68}, {"x": 0.2851, "y": 0.6233, "t": 1.697}, {"x": 0.2913, "y": 0.6343, "t": 1.714}, {"x": 0.3006, "y": 0.6439, "t": 1.73}, {"x": 0.3099, "y": 0.6508, "t": 1.747}, {"x": 0.3213, "y": 0.6563, "t": 1.763}, {"x": 0.3337, "y": 0.6646, "t": 1.781}, {"x": 0.3461, "y": 0.6674, "t": 1.797}, {"x": 0.3564, "y": 0.6729, "t": 1.814}, {"x": 0.3709, "y": 0.677, "t": 1.831}, {"x": 0.3781, "y": 0.677, "t": 1.847}, {"x": 0.3874, "y": 0.6784, "t": 1.865}, {"x": 0.3977, "y": 0.6811, "t": 1.881}, {"x": 0.4101, "y": 0.6825, "t": 1.897}, {"x": 0.4246, "y": 0.688, "t": 1.914}, {"x": 0.436, "y": 0.688, "t": 1.931}, {"x": 0.4432, "y": 0.688, "t": 1.947}, {"x": 0.4484, "y": 0.688, "t": 1.964}, {"x": 0.4546, "y": 0.688, "t": 1.981}, {"x": 0.4618, "y": 0.688, "t": 1.998}, {"x": 0.4711, "y": 0.688, "t": 2.014}, {"x": 0.4886, "y": 0.6866, "t": 2.031}, {"x": 0.5, "y": 0.6853, "t": 2.047}, {"x": 0.5103, "y": 0.6825, "t": 2.064}, {"x": 0.5196, "y": 0.6811, "t": 2.081}, {"x": 0.5289, "y": 0.6784, "t": 2.097}, {"x": 0.5362, "y": 0.6784, "t": 2.114}, {"x": 0.5424, "y": 0.6756, "t": 2.136}, {"x": 0.5455, "y": 0.6743, "t": 2.148}, {"x": 0.5506, "y": 0.6743, "t": 2.164}, {"x": 0.5527, "y": 0.6743, "t": 2.181}, {"x": 0.5548, "y": 0.6743, "t": 2.198}, {"x": 0.5558, "y": 0.6729, "t": 2.214}, {"x": 0.5579, "y": 0.6701, "t": 2.231}, {"x": 0.5589, "y": 0.6687, "t": 2.246}, {"x": 0.5599, "y": 0.6674, "t": 2.264}, {"x": 0.5599, "y": 0.666, "t": 2.747}, {"x": 0.5599, "y": 0.666, "t": 2.764}, {"x": 0.5599, "y": 0.6632, "t": 2.948}, {"x": 0.5599, "y": 0.6522, "t": 2.965}, {"x": 0.5599, "y": 0.6412, "t": 2.981}, {"x": 0.5599, "y": 0.6302, "t": 2.998}, {"x": 0.562, "y": 0.6109, "t": 3.015}, {"x": 0.563, "y": 0.5957, "t": 3.031}, {"x": 0.563, "y": 0.582, "t": 3.048}, {"x": 0.5641, "y": 0.5668, "t": 3.064}, {"x": 0.5672, "y": 0.5475, "t": 3.084}, {"x": 0.5672, "y": 0.5282, "t": 3.098}, {"x": 0.5661, "y": 0.5076, "t": 3.115}, {"x": 0.5661, "y": 0.4869, "t": 3.131}, {"x": 0.5651, "y": 0.4704, "t": 3.148}, {"x": 0.5641, "y": 0.4552, "t": 3.165}, {"x": 0.562, "y": 0.4428, "t": 3.181}, {"x": 0.5599, "y": 0.4291, "t": 3.197}, {"x": 0.5599, "y": 0.4084, "t": 3.215}, {"x": 0.5599, "y": 0.3932, "t": 3.231}, {"x": 0.5599, "y": 0.3767, "t": 3.248}, {"x": 0.5599, "y": 0.3574, "t": 3.265}, {"x": 0.5599, "y": 0.3409, "t": 3.282}, {"x": 0.5599, "y": 0.323, "t": 3.298}, {"x": 0.5579, "y": 0.3037, "t": 3.315}, {"x": 0.5579, "y": 0.2872, "t": 3.331}, {"x": 0.5589, "y": 0.2748, "t": 3.349}, {"x": 0.5589, "y": 0.2624, "t": 3.366}, {"x": 0.5589, "y": 0.2527, "t": 3.382}, {"x": 0.5599, "y": 0.239, "t": 3.399}, {"x": 0.562, "y": 0.228, "t": 3.415}, {"x": 0.563, "y": 0.2183, "t": 3.431}, {"x": 0.563, "y": 0.2087, "t": 3.448}, {"x": 0.5641, "y": 0.2004, "t": 3.466}, {"x": 0.5641, "y": 0.1949, "t": 3.481}, {"x": 0.5641, "y": 0.1908, "t": 3.499}, {"x": 0.5641, "y": 0.1852, "t": 3.515}, {"x": 0.5641, "y": 0.1811, "t": 3.532}, {"x": 0.5641, "y": 0.177, "t": 3.548}, {"x": 0.563, "y": 0.1715, "t": 3.565}, {"x": 0.561, "y": 0.1687, "t": 3.582}, {"x": 0.5599, "y": 0.1673, "t": 3.598}, {"x": 0.5568, "y": 0.1632, "t": 3.615}, {"x": 0.5548, "y": 0.1632, "t": 3.631}, {"x": 0.5527, "y": 0.1618, "t": 3.649}, {"x": 0.5486, "y": 0.1618, "t": 3.665}, {"x": 0.5465, "y": 0.1618, "t": 3.682}, {"x": 0.5444, "y": 0.1618, "t": 3.699}, {"x": 0.5434, "y": 0.1618, "t": 3.715}, {"x": 0.5424, "y": 0.1618, "t": 3.732}, {"x": 0.5403, "y": 0.1618, "t": 3.749}, {"x": 0.5393, "y": 0.1618, "t": 3.765}, {"x": 0.5362, "y": 0.1618, "t": 3.781}, {"x": 0.531, "y": 0.1618, "t": 3.799}, {"x": 0.5279, "y": 0.1618, "t": 3.815}, {"x": 0.5238, "y": 0.1618, "t": 3.832}, {"x": 0.5207, "y": 0.1646, "t": 3.849}, {"x": 0.5165, "y": 0.166, "t": 3.866}, {"x": 0.5114, "y": 0.1687, "t": 3.882}, {"x": 0.5072, "y": 0.1715, "t": 3.899}, {"x": 0.5021, "y": 0.1715, "t": 3.917}, {"x": 0.4969, "y": 0.1756, "t": 3.932}, {"x": 0.4938, "y": 0.1784, "t": 3.95}, {"x": 0.4897, "y": 0.1811, "t": 3.966}, {"x": 0.4845, "y": 0.1852, "t": 3.981}, {"x": 0.4804, "y": 0.1908, "t": 3.999}, {"x": 0.4763, "y": 0.1949, "t": 4.015}, {"x": 0.4742, "y": 0.2018, "t": 4.032}, {"x": 0.4701, "y": 0.2087, "t": 4.049}, {"x": 0.4669, "y": 0.2142, "t": 4.069}, {"x": 0.4649, "y": 0.2238, "t": 4.081}, {"x": 0.4628, "y": 0.2293, "t": 4.099}, {"x": 0.4597, "y": 0.2404, "t": 4.114}, {"x": 0.4566, "y": 0.25, "t": 4.132}, {"x": 0.4535, "y": 0.261, "t": 4.149}, {"x": 0.4515, "y": 0.272, "t": 4.166}, {"x": 0.4473, "y": 0.2886, "t": 4.183}, {"x": 0.4473, "y": 0.2913, "t": 4.198}, {"x": 0.4463, "y": 0.3051, "t": 4.215}, {"x": 0.4463, "y": 0.3092, "t": 4.233}, {"x": 0.4463, "y": 0.312, "t": 4.249}, {"x": 0.4463, "y": 0.3134, "t": 4.266}, {"x": 0.4463, "y": 0.3147, "t": 4.282}, {"x": 0.4484, "y": 0.3147, "t": 4.349}, {"x": 0.4484, "y": 0.3147, "t": 4.366}, {"x": 0.4515, "y": 0.3147, "t": 4.383}, {"x": 0.4525, "y": 0.3147, "t": 4.399}, {"x": 0.4546, "y": 0.3147, "t": 4.416}, {"x": 0.4577, "y": 0.3106, "t": 4.433}, {"x": 0.4597, "y": 0.3078, "t": 4.45}, {"x": 0.4618, "y": 0.3051, "t": 4.466}, {"x": 0.4628, "y": 0.3023, "t": 4.483}, {"x": 0.4649, "y": 0.2996, "t": 4.499}, {"x": 0.4669, "y": 0.2968, "t": 4.516}, {"x": 0.4701, "y": 0.2927, "t": 4.533}, {"x": 0.4721, "y": 0.2886, "t": 4.549}, {"x": 0.4763, "y": 0.2844, "t": 4.566}, {"x": 0.4793, "y": 0.2789, "t": 4.583}, {"x": 0.4824, "y": 0.2748, "t": 4.599}, {"x": 0.4845, "y": 0.2707, "t": 4.619}, {"x": 0.4886, "y": 0.2651, "t": 4.632}, {"x": 0.4928, "y": 0.2624, "t": 4.649}, {"x": 0.4959, "y": 0.2555, "t": 4.666}, {"x": 0.499, "y": 0.2514, "t": 4.683}, {"x": 0.501, "y": 0.2486, "t": 4.7}, {"x": 0.5041, "y": 0.2431, "t": 4.717}, {"x": 0.5072, "y": 0.239, "t": 4.733}, {"x": 0.5093, "y": 0.2362, "t": 4.749}, {"x": 0.5114, "y": 0.2321, "t": 4.767}, {"x": 0.5134, "y": 0.2266, "t": 4.783}, {"x": 0.5155, "y": 0.2224, "t": 4.799}, {"x": 0.5165, "y": 0.2197, "t": 4.816}, {"x": 0.5176, "y": 0.2183, "t": 4.837}, {"x": 0.5196, "y": 0.2142, "t": 4.857}, {"x": 0.5207, "y": 0.2114, "t": 4.865}, {"x": 0.5207, "y": 0.21, "t": 4.883}, {"x": 0.5217, "y": 0.2073, "t": 4.9}, {"x": 0.5227, "y": 0.2059, "t": 4.917}, {"x": 0.5238, "y": 0.2018, "t": 4.933}, {"x": 0.5238, "y": 0.199, "t": 4.949}, {"x": 0.5248, "y": 0.1976, "t": 4.967}, {"x": 0.5248, "y": 0.1963, "t": 4.983}, {"x": 0.5248, "y": 0.1949, "t": 4.999}, {"x": 0.5248, "y": 0.1935, "t": 5.017}, {"x": 0.5258, "y": 0.1908, "t": 5.033}, {"x": 0.5258, "y": 0.1908, "t": 5.05}, {"x": 0.5269, "y": 0.1894, "t": 5.067}, {"x": 0.5269, "y": 0.188, "t": 5.1}, {"x": 0.5279, "y": 0.1852, "t": 5.884}, {"x": 0.5341, "y": 0.1839, "t": 5.901}, {"x": 0.5465, "y": 0.1811, "t": 5.916}, {"x": 0.5651, "y": 0.1797, "t": 5.934}, {"x": 0.6106, "y": 0.1797, "t": 5.951}, {"x": 0.6488, "y": 0.1797, "t": 5.966}, {"x": 0.686, "y": 0.1797, "t": 5.983}, {"x": 0.7221, "y": 0.1797, "t": 6.0}, {"x": 0.7521, "y": 0.1797, "t": 6.017}, {"x": 0.781, "y": 0.1797, "t": 6.034}, {"x": 0.8099, "y": 0.1797, "t": 6.05}, {"x": 0.8399, "y": 0.1797, "t": 6.069}, {"x": 0.8709, "y": 0.1797, "t": 6.083}, {"x": 0.904, "y": 0.1742, "t": 6.101}, {"x": 0.9318, "y": 0.166, "t": 6.117}, {"x": 0.9608, "y": 0.1536, "t": 6.133}, {"x": 0.9949, "y": 0.1384, "t": 6.15}, {"x": 1.0124, "y": 0.126, "t": 6.167}, {"x": 1.0269, "y": 0.115, "t": 6.184}, {"x": 1.0413, "y": 0.0998, "t": 6.2}], [{"x": 0.1849, "y": -0.0613, "t": 6.867}, {"x": 0.1694, "y": -0.0517, "t": 6.885}, {"x": 0.1539, "y": -0.0393, "t": 6.902}, {"x": 0.1353, "y": -0.0269, "t": 6.918}, {"x": 0.1157, "y": -0.0076, "t": 6.936}, {"x": 0.1054, "y": 0.0062, "t": 6.951}, {"x": 0.0981, "y": 0.0172, "t": 6.968}, {"x": 0.093, "y": 0.0323, "t": 6.984}, {"x": 0.0857, "y": 0.0516, "t": 7.002}, {"x": 0.0806, "y": 0.0668, "t": 7.019}, {"x": 0.0754, "y": 0.0861, "t": 7.035}, {"x": 0.0692, "y": 0.1067, "t": 7.052}, {"x": 0.0661, "y": 0.1246, "t": 7.068}, {"x": 0.063, "y": 0.1453, "t": 7.086}, {"x": 0.0599, "y": 0.1632, "t": 7.102}, {"x": 0.0599, "y": 0.1825, "t": 7.118}, {"x": 0.0599, "y": 0.2059, "t": 7.135}, {"x": 0.0599, "y": 0.2445, "t": 7.151}, {"x": 0.0599, "y": 0.2707, "t": 7.169}, {"x": 0.063, "y": 0.2982, "t": 7.185}, {"x": 0.0661, "y": 0.3257, "t": 7.203}, {"x": 0.0671, "y": 0.3574, "t": 7.219}, {"x": 0.0671, "y": 0.3919, "t": 7.235}, {"x": 0.062, "y": 0.4249, "t": 7.251}, {"x": 0.0537, "y": 0.4566, "t": 7.269}, {"x": 0.0475, "y": 0.4924, "t": 7.285}, {"x": 0.0372, "y": 0.531, "t": 7.302}, {"x": 0.0289, "y": 0.5709, "t": 7.318}, {"x": 0.0165, "y": 0.6123, "t": 7.334}, {"x": 0.0021, "y": 0.6701, "t": 7.352}, {"x": -0.0103, "y": 0.7059, "t": 7.369}, {"x": -0.0217, "y": 0.7376, "t": 7.385}, {"x": -0.0351, "y": 0.7693, "t": 7.402}, {"x": -0.0475, "y": 0.801, "t": 7.418}]], "voice_recording": "coco_train/coco_train_208612_61.ogg"}
# Note: The full data from the prompt is used in the actual running script, 
# but truncated here for readability. The code below uses the full COCO_VAL_JSON_DICT.


# The full data is too large to display again, assuming it's loaded as `COCO_VAL_JSON_DICT`.



def load_jsonl_by_index(jsonl_path, selected_index=1):
    """
    读取jsonl文件，返回第selected_index条（从1开始计数）数据
    """
    # import ipdb;ipdb.set_trace()
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            if i == selected_index:
                return json.loads(line)
    raise IndexError(f"JSONL文件中没有第{selected_index}条数据")

def visualize_all_boxes(
    image_id, segmented_xs, segmented_ys, xmins, xmaxs, ymins, ymaxs,
    tokens, segmentation_method, image_base_path, dataset_id, full_caption, output_dir="."):
    """
    Visualizes segmented traces and their bounding boxes on a canvas.
    
    Args:
        image_id (str): The ID of the image for titling.
        segmented_xs (list of lists): List of segmented x-coordinate lists.
        segmented_ys (list of lists): List of segmented y-coordinate lists.
        xmins, xmaxs, ymins, ymaxs (list): Lists of bounding box coordinates.
        tokens (list): List of corresponding text tokens.
        segmentation_method (str): The method used for segmentation, for titling.
        image_base_path (str): The base directory where COCO images are存储.
        dataset_id (str): The dataset ID, e.g., 'mscoco_val2017'.
        full_caption (str): The entire caption for the image.
        output_dir (str): The directory to save the output images.
    """
    # Construct the path to the actual image file.
    # COCO 2017 has splits like 'val2017', 'train2017'. We parse this from the dataset_id.
    split_name = dataset_id.split('_')[1]
    image_filename = f"{int(image_id):012d}.jpg"
    full_image_path = os.path.join(image_base_path, split_name, image_filename)

    try:
        im = Image.open(full_image_path)
        imw, imh = im.size
        # Save the original image to the output directory
        original_image_output_path = os.path.join(output_dir, os.path.basename(full_image_path))
        im.save(original_image_output_path)
        print(f"Saved original image to: {original_image_output_path}")
    except FileNotFoundError:
        print(f"Image not found at {full_image_path}, using a blank canvas instead.")
        # Fallback to a blank canvas if image is not found.
        imw, imh = 640, 480
        im = np.ones((imh, imw, 3), dtype=np.uint8) * 255  # White background


    fig, ax = plt.subplots(figsize=(12, 9))
    ax.imshow(im)

    # Use a colormap to get distinct colors for each bounding box
    # Using plt.get_cmap to be explicit and avoid potential linter issues.
    colors = plt.get_cmap('gist_rainbow')(np.linspace(0, 1, len(xmins)))

    for i in range(len(xmins)):
        # Define a single color for the trace, box, and text of this segment
        segment_color = colors[i]
        
        # Plot the trace segment with the corresponding color
        # if segmented_xs[i]:  # Check if segment is not empty
        #     seg_xs_scaled = [x * imw for x in segmented_xs[i]]
        #     seg_ys_scaled = [y * imh for y in segmented_ys[i]]
        #     ax.plot(seg_xs_scaled, seg_ys_scaled, linewidth=2.5, color=segment_color, alpha=0.8)
            
        # Create a rectangle patch for the bounding box
        rect = patches.Rectangle(
            (xmins[i] * imw, ymins[i] * imh),
            imw * (xmaxs[i] - xmins[i]),
            imh * (ymaxs[i] - ymins[i]),
            linewidth=2.5,
            edgecolor=segment_color,
            facecolor='none'
        )
        ax.add_patch(rect)
        
        # Determine the token text. For uniform segmentation, there are no tokens.
        token_text = f"Seg {i}"
        if tokens and i < len(tokens):
            token_text = f"{i}: {tokens[i]}"

        # Add the token text near the box, with the same color as the box and trace
        ax.text(
            xmins[i] * imw, 
            ymins[i] * imh - 5,  # Position text slightly above the box
            token_text,
            fontsize=10,
            color=segment_color,
            bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=1)
        )

    ax.set_title(f"Trace Visualization for Image {image_id}\nMethod: {segmentation_method}", fontsize=16)
    
    # Add the full caption at the bottom of the figure with corresponding colors
    if tokens: # Only attempt to color captions if we have tokens (timestamp method)
        fig_width = fig.get_window_extent().width
        # Estimate the starting x-position to center the text
        # This is an approximation. A more robust way would involve text rendering introspection.
        total_caption_width_approx = sum([len(t) for t in tokens]) * 6 # Approx avg char width
        start_x = (fig_width - total_caption_width_approx) / (2 * fig_width)
        
        # Use a fixed y-position for the caption line
        y_pos = 0.03
        
        # Use renderer to get precise text widths for layout
        renderer = fig.canvas.get_renderer()
        x_offset = 0.5 # Start near the horizontal center

        # Calculate total text width to center it
        text_objs = [plt.text(0, 0, f"{t} ", fontsize=12, ha='left', va='bottom') for t in tokens]
        total_width_pixels = sum(t.get_window_extent(renderer).width for t in text_objs)
        # Clean up temporary text objects
        for t in text_objs:
            t.remove()

        start_x = 0.5 - (total_width_pixels / fig_width) / 2

        current_x = start_x
        for i, token in enumerate(tokens):
            color = colors[i]
            # Add a space for separation
            txt_obj = plt.text(current_x, y_pos, f"{token} ", transform=fig.transFigure,
                             ha='left', va='bottom', fontsize=12, color=color,
                             bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', pad=1))
            
            # Update current_x for the next word
            current_x += txt_obj.get_window_extent(renderer).width / fig_width

    else: # Fallback for methods without tokens
        plt.figtext(0.5, 0.01, full_caption, ha="center", fontsize=12, wrap=True,
                    bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', pad=2))

    plt.subplots_adjust(bottom=0.15) # Adjust layout to make room for caption
    
    plt.axis('off')
    output_filename = f"trace_visualization_{image_id}_{segmentation_method.replace(' ', '_').replace('/', '_')}.png"
    # Save the figure to the specified output directory
    os.makedirs(output_dir, exist_ok=True) # Ensure the output directory exists
    plt.savefig(os.path.join(output_dir, output_filename), bbox_inches='tight', dpi=300)
    plt.close(fig) # 关闭图形以释放内存


def get_json_anno_external(json_anno):
    """Get transcription and trace for each of its element from a json dict."""
    traces = json_anno['traces']
    all_traces = [point for trace in traces for point in trace]
    
    if not all_traces:
        return None, None, None, None, None, None

    xs = [max(0.0, min(1.0, p['x'])) for p in all_traces]
    ys = [max(0.0, min(1.0, p['y'])) for p in all_traces]
    ts = [p['t'] for p in all_traces]

    timed_caption = json_anno['timed_caption']
    # Removed .encode('utf-8') for Python 3 compatibility with matplotlib
    toks = [tcap['utterance'] for tcap in timed_caption]
    time_begins = [tcap['start_time'] for tcap in timed_caption]
    time_ends = [tcap['end_time'] for tcap in timed_caption]
    
    return xs, ys, ts, toks, time_begins, time_ends

def trace_segment_uniform_time_interval(
    trace_xs, trace_ys, trace_ts, time_interval, use_douglas_peucker=False, dp_epsilon=0.01):
    """Segment the traces uniformly given a specificed time interval."""
    if not trace_ts:
        return [], [], []
    
    tbins = (np.array(trace_ts) - trace_ts[0]) / time_interval
    tbins = tbins.astype(int)
    
    segmented_xs, segmented_ys, segmented_ts = [], [], []
    
    num_segments = tbins[-1] + 1 if len(tbins) > 0 else 0
    for i in range(num_segments):
        indices = np.where(tbins == i)[0]
        if len(indices) > 0:
            xs = [trace_xs[j] for j in indices]
            ys = [trace_ys[j] for j in indices]
            ts = [trace_ts[j] for j in indices]

            # Apply Douglas-Peucker simplification if requested
            if use_douglas_peucker and len(xs) > 2:
                points = np.array(list(zip(xs, ys)))
                simplified_points = rdp(points, epsilon=dp_epsilon)
                segmented_xs.append(simplified_points[:, 0].tolist())
                segmented_ys.append(simplified_points[:, 1].tolist())
            else:
                segmented_xs.append(xs)
                segmented_ys.append(ys)
            
            # Timestamps are not simplified, so we just append the original segment's timestamps.
            # The caller currently ignores this return value, so mismatched length is not an issue.
            segmented_ts.append(ts)
        else:
            segmented_xs.append([])
            segmented_ys.append([])
            segmented_ts.append([])

    return segmented_xs, segmented_ys, segmented_ts

def trace_segment_timestamp(trace_xs, trace_ys, trace_ts,
                            token_time_begins, token_time_ends, use_douglas_peucker=False, dp_epsilon=0.01):
    """Segment the traces based on 'ground-truth' timestamped tokens."""
    segmented_xs, segmented_ys, segmented_ts = [], [], []
    trace_arr = np.array(trace_ts)

    for time_begin, time_end in zip(token_time_begins, token_time_ends):
        # Find indices of trace points within the token's time window
        indices = np.where((trace_arr >= time_begin) & (trace_arr <= time_end))[0]
        if len(indices) > 0:
            xs = [trace_xs[j] for j in indices]
            ys = [trace_ys[j] for j in indices]
            ts = [trace_ts[j] for j in indices]

            # Apply Douglas-Peucker simplification if requested
            if use_douglas_peucker and len(xs) > 2:
                points = np.array(list(zip(xs, ys)))
                simplified_points = rdp(points, epsilon=dp_epsilon)
                segmented_xs.append(simplified_points[:, 0].tolist())
                segmented_ys.append(simplified_points[:, 1].tolist())
            else:
                segmented_xs.append(xs)
                segmented_ys.append(ys)

            # Timestamps are not simplified, so we just append the original segment's timestamps.
            segmented_ts.append(ts)
        else:
            segmented_xs.append([])
            segmented_ys.append([])
            segmented_ts.append([])
            
    return segmented_xs, segmented_ys, segmented_ts


def traces_to_bboxs(xs_list, ys_list):
    """Convert each trace of x and y coordinates to bounding boxes."""
    xmins, xmaxs, ymins, ymaxs = [], [], [], []
    for xs, ys in zip(xs_list, ys_list):
        if not xs:  # Handle empty segments
            xmin, xmax, ymin, ymax = 0.0, 1.0, 0.0, 1.0 # Default to full box for empty
        else:
            xmin = min(xs)
            xmax = max(xs)
            ymin = min(ys)
            ymax = max(ys)
        xmins.append(xmin)
        xmaxs.append(xmax)
        ymins.append(ymin)
        ymaxs.append(ymax)
    return xmins, xmaxs, ymins, ymaxs


def process_json(json_anno, trace_segmentation_method, image_base_path, visualize=False, use_douglas_peucker=False, dp_epsilon=0.01, output_dir="."):
    """
    Main processing function to segment traces and optionally visualize them.
    """
    image_id = json_anno['image_id']
    dataset_id = json_anno['dataset_id']
    full_caption = json_anno['caption']
    
    method_title = trace_segmentation_method
    if use_douglas_peucker:
        method_title += " + Douglas-Peucker"
    print(f"\n--- Processing Image: {image_id} using '{method_title}' method ---")
    
    xs, ys, ts, transcription, time_begins, time_ends = get_json_anno_external(json_anno)

    if not xs:
        print("Error: Empty or invalid trace data.")
        return

    # Segment traces into chunks based on the chosen method
    tokens_for_vis = None
    if trace_segmentation_method == 'uni_len_global':
        uniform_trace_segmentation_time_interval = 0.4
        segmented_xs, segmented_ys, _ = \
            trace_segment_uniform_time_interval(
                xs, ys, ts, uniform_trace_segmentation_time_interval,
                use_douglas_peucker=use_douglas_peucker, dp_epsilon=dp_epsilon)
        tokens_for_vis = transcription
    elif trace_segmentation_method == 'timestamp':
        segmented_xs, segmented_ys, _ = \
            trace_segment_timestamp(xs, ys, ts, time_begins, time_ends,
                                    use_douglas_peucker=use_douglas_peucker, dp_epsilon=dp_epsilon)
        tokens_for_vis = transcription
    else:
        raise ValueError("Unknown trace_segmentation_method")

    # Convert segmented trace chunks into bounding boxes from the original segments

    xmins, xmaxs, ymins, ymaxs = traces_to_bboxs(segmented_xs, segmented_ys)
    
    print(f"Generated {len(xmins)} bounding boxes.")
    
    # For visualization, apply Douglas-Peucker simplification if requested
    viz_segmented_xs = segmented_xs
    viz_segmented_ys = segmented_ys
    output_method_name = trace_segmentation_method
    if use_douglas_peucker:
        # import ipdb;ipdb.set_trace()
        print(f"Applying Douglas-Peucker simplification with epsilon={dp_epsilon}.")
        output_method_name += "_DP"
    
    # Trigger visualization if requested
    if visualize:
        visualize_all_boxes(
            image_id, viz_segmented_xs, viz_segmented_ys, xmins, xmaxs, ymins, ymaxs, 
            tokens_for_vis, output_method_name,
            image_base_path, dataset_id, full_caption, output_dir=output_dir)


if __name__ == '__main__':
    # Define the path to your COCO dataset images
    # SELECTED_INDEX = 11  # 设置为你想要的编号（1表示第一条，2表示第二条，以此类推）
    COCO_IMAGE_PATH = '/storage-root/datasets/yangfan/coco2017'
    JSONL_PATH = '/storage-root/datasets/yangfan/Seg_LLaVA_v2/datasets/Localized_Narratives/coco_train_localized_narratives-00000-of-00004.jsonl'

    # Create a unique output directory based on current timestamp
    import datetime
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_base_dir = f"output_visualizations_{timestamp}"
    os.makedirs(output_base_dir, exist_ok=True)
    print(f"Saving visualizations to: {os.path.abspath(output_base_dir)}")

    # Loop from 1 to 100 for SELECTED_INDEX
    for i in range(1, 1001):
        try:
            COCO_VAL_JSON_DICT = load_jsonl_by_index(JSONL_PATH, i)
            # Create a sub-directory for each index's output
            current_output_dir = os.path.join(output_base_dir, f"index_{i}")
            os.makedirs(current_output_dir, exist_ok=True)
            
            process_json(COCO_VAL_JSON_DICT, 
                         trace_segmentation_method='timestamp',
                         image_base_path=COCO_IMAGE_PATH,
                         visualize=True,
                         use_douglas_peucker=True,
                         dp_epsilon=0.01,
                         output_dir=current_output_dir) # Pass the output directory
        except IndexError as e:
            print(f"Skipping index {i}: {e}")
        except Exception as e:
            print(f"An error occurred while processing index {i}: {e}")