[{"id": 1, "image": "COCO_val2014_000000144305.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: keyboard, laptop, dining table, generate a short caption for this image."}]}, {"id": 7, "image": "COCO_val2014_000000164091.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bed, teddy bear, generate a short caption for this image."}]}, {"id": 13, "image": "COCO_val2014_000000308003.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, bench, dining table, generate a short caption for this image."}]}, {"id": 19, "image": "COCO_val2014_000000054870.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, sandwich, fork, generate a short caption for this image."}]}, {"id": 25, "image": "COCO_val2014_000000095908.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: mouse, laptop, dining table, keyboard, cup, generate a short caption for this image."}]}, {"id": 31, "image": "COCO_val2014_000000329614.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, bus, generate a short caption for this image."}]}, {"id": 37, "image": "COCO_val2014_000000324008.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, train, dining table, airplane, orange, cup, generate a short caption for this image."}]}, {"id": 43, "image": "COCO_val2014_000000052700.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, handbag, fork, dining table, cake, generate a short caption for this image."}]}, {"id": 49, "image": "COCO_val2014_000000501311.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, generate a short caption for this image."}]}, {"id": 55, "image": "COCO_val2014_000000247179.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, elephant, clock, generate a short caption for this image."}]}, {"id": 61, "image": "COCO_val2014_000000577912.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, toothbrush, toilet, bottle, generate a short caption for this image."}]}, {"id": 67, "image": "COCO_val2014_000000196341.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, handbag, person, generate a short caption for this image."}]}, {"id": 73, "image": "COCO_val2014_000000234433.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, bicycle, car, generate a short caption for this image."}]}, {"id": 79, "image": "COCO_val2014_000000455610.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, motorcycle, car, generate a short caption for this image."}]}, {"id": 85, "image": "COCO_val2014_000000039053.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bowl, dining table, apple, cake, cell phone, cup, generate a short caption for this image."}]}, {"id": 91, "image": "COCO_val2014_000000493862.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, tennis racket, generate a short caption for this image."}]}, {"id": 97, "image": "COCO_val2014_000000269873.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, car, bus, generate a short caption for this image."}]}, {"id": 103, "image": "COCO_val2014_000000524656.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, spoon, knife, sink, refrigerator, toaster, wine glass, bottle, microwave, oven, cup, generate a short caption for this image."}]}, {"id": 109, "image": "COCO_val2014_000000369547.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, cat, bird, generate a short caption for this image."}]}, {"id": 115, "image": "COCO_val2014_000000181446.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, bottle, sink, refrigerator, generate a short caption for this image."}]}, {"id": 121, "image": "COCO_val2014_000000128119.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, bicycle, cell phone, bus, generate a short caption for this image."}]}, {"id": 127, "image": "COCO_val2014_000000401027.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sandwich, fork, dining table, knife, cup, generate a short caption for this image."}]}, {"id": 133, "image": "COCO_val2014_000000253630.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, chair, potted plant, dining table, generate a short caption for this image."}]}, {"id": 139, "image": "COCO_val2014_000000326021.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, teddy bear, dining table, generate a short caption for this image."}]}, {"id": 145, "image": "COCO_val2014_000000317320.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: motorcycle, person, bicycle, generate a short caption for this image."}]}, {"id": 151, "image": "COCO_val2014_000000507171.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, surfboard, cat, generate a short caption for this image."}]}, {"id": 157, "image": "COCO_val2014_000000161227.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: couch, laptop, remote, chair, keyboard, cell phone, generate a short caption for this image."}]}, {"id": 163, "image": "COCO_val2014_000000025812.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: snowboard, person, skis, generate a short caption for this image."}]}, {"id": 169, "image": "COCO_val2014_000000267537.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tie, person, bench, umbrella, generate a short caption for this image."}]}, {"id": 175, "image": "COCO_val2014_000000195645.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, handbag, bottle, dining table, bench, generate a short caption for this image."}]}, {"id": 181, "image": "COCO_val2014_000000155671.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, truck, car, generate a short caption for this image."}]}, {"id": 187, "image": "COCO_val2014_000000412894.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, backpack, handbag, generate a short caption for this image."}]}, {"id": 193, "image": "COCO_val2014_000000032625.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, fork, dining table, book, cup, generate a short caption for this image."}]}, {"id": 199, "image": "COCO_val2014_000000018688.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 205, "image": "COCO_val2014_000000343534.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, potted plant, chair, book, teddy bear, generate a short caption for this image."}]}, {"id": 211, "image": "COCO_val2014_000000370657.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, handbag, generate a short caption for this image."}]}, {"id": 217, "image": "COCO_val2014_000000481596.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, hot dog, generate a short caption for this image."}]}, {"id": 223, "image": "COCO_val2014_000000460347.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, bus, generate a short caption for this image."}]}, {"id": 229, "image": "COCO_val2014_000000398753.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 235, "image": "COCO_val2014_000000067833.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, knife, wine glass, bottle, fork, pizza, cup, generate a short caption for this image."}]}, {"id": 241, "image": "COCO_val2014_000000272880.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, fork, dining table, knife, pizza, cup, generate a short caption for this image."}]}, {"id": 247, "image": "COCO_val2014_000000254368.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: pizza, person, bed, generate a short caption for this image."}]}, {"id": 253, "image": "COCO_val2014_000000193108.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 259, "image": "COCO_val2014_000000459141.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: motorcycle, person, bicycle, potted plant, generate a short caption for this image."}]}, {"id": 265, "image": "COCO_val2014_000000231534.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, handbag, suitcase, airplane, generate a short caption for this image."}]}, {"id": 271, "image": "COCO_val2014_000000227031.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, potted plant, dining table, knife, cake, tie, generate a short caption for this image."}]}, {"id": 277, "image": "COCO_val2014_000000209347.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, train, cell phone, tie, generate a short caption for this image."}]}, {"id": 283, "image": "COCO_val2014_000000523292.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, laptop, dining table, chair, cat, generate a short caption for this image."}]}, {"id": 289, "image": "COCO_val2014_000000012375.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: broccoli, carrot, dining table, generate a short caption for this image."}]}, {"id": 295, "image": "COCO_val2014_000000013550.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, elephant, handbag, generate a short caption for this image."}]}, {"id": 301, "image": "COCO_val2014_000000146837.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, skateboard, generate a short caption for this image."}]}, {"id": 307, "image": "COCO_val2014_000000067042.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cell phone, tie, generate a short caption for this image."}]}, {"id": 313, "image": "COCO_val2014_000000209925.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, generate a short caption for this image."}]}, {"id": 319, "image": "COCO_val2014_000000002154.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, car, umbrella, generate a short caption for this image."}]}, {"id": 325, "image": "COCO_val2014_000000274931.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, car, fire hydrant, generate a short caption for this image."}]}, {"id": 331, "image": "COCO_val2014_000000104326.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, dining table, chair, knife, sink, potted plant, microwave, oven, generate a short caption for this image."}]}, {"id": 337, "image": "COCO_val2014_000000233888.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, baseball glove, person, potted plant, baseball bat, generate a short caption for this image."}]}, {"id": 343, "image": "COCO_val2014_000000213905.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, backpack, person, tennis racket, generate a short caption for this image."}]}, {"id": 349, "image": "COCO_val2014_000000277479.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: refrigerator, bottle, person, generate a short caption for this image."}]}, {"id": 355, "image": "COCO_val2014_000000300784.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, couch, bench, generate a short caption for this image."}]}, {"id": 361, "image": "COCO_val2014_000000183766.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, fork, cake, dining table, generate a short caption for this image."}]}, {"id": 367, "image": "COCO_val2014_000000169169.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: car, traffic light, person, potted plant, generate a short caption for this image."}]}, {"id": 373, "image": "COCO_val2014_000000409346.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, spoon, dining table, cake, generate a short caption for this image."}]}, {"id": 379, "image": "COCO_val2014_000000522262.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, bicycle, car, generate a short caption for this image."}]}, {"id": 385, "image": "COCO_val2014_000000079014.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, orange, sports ball, generate a short caption for this image."}]}, {"id": 391, "image": "COCO_val2014_000000344942.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, clock, laptop, generate a short caption for this image."}]}, {"id": 397, "image": "COCO_val2014_000000275929.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, bowl, toilet, generate a short caption for this image."}]}, {"id": 403, "image": "COCO_val2014_000000465601.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: handbag, bench, person, umbrella, generate a short caption for this image."}]}, {"id": 409, "image": "COCO_val2014_000000300124.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, couch, microwave, dining table, generate a short caption for this image."}]}, {"id": 415, "image": "COCO_val2014_000000046919.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 421, "image": "COCO_val2014_000000434022.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, car, generate a short caption for this image."}]}, {"id": 427, "image": "COCO_val2014_000000490491.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, stop sign, generate a short caption for this image."}]}, {"id": 433, "image": "COCO_val2014_000000175878.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, cell phone, generate a short caption for this image."}]}, {"id": 439, "image": "COCO_val2014_000000012669.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: wine glass, person, vase, handbag, generate a short caption for this image."}]}, {"id": 445, "image": "COCO_val2014_000000192496.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, car, generate a short caption for this image."}]}, {"id": 451, "image": "COCO_val2014_000000133146.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, couch, chair, vase, book, generate a short caption for this image."}]}, {"id": 457, "image": "COCO_val2014_000000053744.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, sandwich, dining table, generate a short caption for this image."}]}, {"id": 463, "image": "COCO_val2014_000000375902.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 469, "image": "COCO_val2014_000000498747.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, bottle, cell phone, generate a short caption for this image."}]}, {"id": 475, "image": "COCO_val2014_000000052435.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, surfboard, umbrella, generate a short caption for this image."}]}, {"id": 481, "image": "COCO_val2014_000000301977.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: refrigerator, bottle, cow, generate a short caption for this image."}]}, {"id": 487, "image": "COCO_val2014_000000261097.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, bench, tennis racket, generate a short caption for this image."}]}, {"id": 493, "image": "COCO_val2014_000000227204.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, spoon, bowl, dining table, knife, bottle, cell phone, cup, generate a short caption for this image."}]}, {"id": 499, "image": "COCO_val2014_000000353981.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, airplane, generate a short caption for this image."}]}, {"id": 505, "image": "COCO_val2014_000000185327.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, horse, car, cow, bus, generate a short caption for this image."}]}, {"id": 511, "image": "COCO_val2014_000000348654.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, laptop, cell phone, dining table, generate a short caption for this image."}]}, {"id": 517, "image": "COCO_val2014_000000482432.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, toothbrush, bowl, generate a short caption for this image."}]}, {"id": 523, "image": "COCO_val2014_000000454161.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: pizza, wine glass, fork, dining table, generate a short caption for this image."}]}, {"id": 529, "image": "COCO_val2014_000000144723.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, truck, car, frisbee, generate a short caption for this image."}]}, {"id": 535, "image": "COCO_val2014_000000129707.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: book, person, cell phone, generate a short caption for this image."}]}, {"id": 541, "image": "COCO_val2014_000000379842.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: book, remote, dining table, generate a short caption for this image."}]}, {"id": 547, "image": "COCO_val2014_000000203110.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, train, car, orange, bus, generate a short caption for this image."}]}, {"id": 553, "image": "COCO_val2014_000000556478.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: pizza, bottle, oven, cup, generate a short caption for this image."}]}, {"id": 559, "image": "COCO_val2014_000000103815.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 565, "image": "COCO_val2014_000000451943.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, suitcase, bed, dining table, generate a short caption for this image."}]}, {"id": 571, "image": "COCO_val2014_000000268854.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, boat, handbag, generate a short caption for this image."}]}, {"id": 577, "image": "COCO_val2014_000000034464.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, surfboard, boat, generate a short caption for this image."}]}, {"id": 583, "image": "COCO_val2014_000000495513.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, fork, hot dog, generate a short caption for this image."}]}, {"id": 589, "image": "COCO_val2014_000000529256.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, bed, book, generate a short caption for this image."}]}, {"id": 595, "image": "COCO_val2014_000000037907.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, potted plant, cat, generate a short caption for this image."}]}, {"id": 601, "image": "COCO_val2014_000000506401.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, truck, person, car, cat, generate a short caption for this image."}]}, {"id": 607, "image": "COCO_val2014_000000406611.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, skis, person, generate a short caption for this image."}]}, {"id": 613, "image": "COCO_val2014_000000267225.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, truck, person, car, remote, generate a short caption for this image."}]}, {"id": 619, "image": "COCO_val2014_000000278365.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, dining table, chair, pizza, cell phone, cup, generate a short caption for this image."}]}, {"id": 625, "image": "COCO_val2014_000000069757.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, knife, dining table, chair, cake, cup, generate a short caption for this image."}]}, {"id": 631, "image": "COCO_val2014_000000432213.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, wine glass, sandwich, dining table, knife, generate a short caption for this image."}]}, {"id": 637, "image": "COCO_val2014_000000368367.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cell phone, cup, generate a short caption for this image."}]}, {"id": 643, "image": "COCO_val2014_000000201555.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, truck, car, generate a short caption for this image."}]}, {"id": 649, "image": "COCO_val2014_000000092131.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, bus, generate a short caption for this image."}]}, {"id": 655, "image": "COCO_val2014_000000021779.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, suitcase, generate a short caption for this image."}]}, {"id": 661, "image": "COCO_val2014_000000565148.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 667, "image": "COCO_val2014_000000160183.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, car, person, generate a short caption for this image."}]}, {"id": 673, "image": "COCO_val2014_000000555456.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: snowboard, person, skis, generate a short caption for this image."}]}, {"id": 679, "image": "COCO_val2014_000000235595.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, potted plant, parking meter, motorcycle, cup, generate a short caption for this image."}]}, {"id": 685, "image": "COCO_val2014_000000489909.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, bowl, orange, generate a short caption for this image."}]}, {"id": 691, "image": "COCO_val2014_000000501080.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, bowl, potted plant, dining table, generate a short caption for this image."}]}, {"id": 697, "image": "COCO_val2014_000000405183.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, remote, chair, tv, handbag, couch, potted plant, book, generate a short caption for this image."}]}, {"id": 703, "image": "COCO_val2014_000000409138.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, bus, generate a short caption for this image."}]}, {"id": 709, "image": "COCO_val2014_000000411840.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, sports ball, tennis racket, chair, generate a short caption for this image."}]}, {"id": 715, "image": "COCO_val2014_000000333712.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: pizza, spoon, dining table, generate a short caption for this image."}]}, {"id": 721, "image": "COCO_val2014_000000148977.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bottle, dining table, keyboard, book, teddy bear, bear, generate a short caption for this image."}]}, {"id": 727, "image": "COCO_val2014_000000399058.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, bicycle, orange, elephant, generate a short caption for this image."}]}, {"id": 733, "image": "COCO_val2014_000000183571.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, boat, generate a short caption for this image."}]}, {"id": 739, "image": "COCO_val2014_000000325836.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: handbag, motorcycle, person, bicycle, generate a short caption for this image."}]}, {"id": 745, "image": "COCO_val2014_000000437910.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, bottle, toilet, generate a short caption for this image."}]}, {"id": 751, "image": "COCO_val2014_000000155885.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, boat, generate a short caption for this image."}]}, {"id": 757, "image": "COCO_val2014_000000467887.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, mouse, couch, laptop, dining table, chair, generate a short caption for this image."}]}, {"id": 763, "image": "COCO_val2014_000000174402.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 769, "image": "COCO_val2014_000000352734.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, fork, dining table, cake, generate a short caption for this image."}]}, {"id": 775, "image": "COCO_val2014_000000346716.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, bus, generate a short caption for this image."}]}, {"id": 781, "image": "COCO_val2014_000000451144.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 787, "image": "COCO_val2014_000000295656.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, clock, car, chair, pizza, tie, generate a short caption for this image."}]}, {"id": 793, "image": "COCO_val2014_000000129886.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: snowboard, person, skis, generate a short caption for this image."}]}, {"id": 799, "image": "COCO_val2014_000000557998.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, bottle, dining table, chair, generate a short caption for this image."}]}, {"id": 805, "image": "COCO_val2014_000000401010.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, chair, sink, bed, generate a short caption for this image."}]}, {"id": 811, "image": "COCO_val2014_000000567562.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, bowl, broccoli, dining table, chair, bottle, fork, pizza, book, cup, generate a short caption for this image."}]}, {"id": 817, "image": "COCO_val2014_000000313928.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: wine glass, surfboard, bottle, dining table, chair, generate a short caption for this image."}]}, {"id": 823, "image": "COCO_val2014_000000490284.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, couch, generate a short caption for this image."}]}, {"id": 829, "image": "COCO_val2014_000000346940.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, bowl, oven, generate a short caption for this image."}]}, {"id": 835, "image": "COCO_val2014_000000222782.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, car, cell phone, person, generate a short caption for this image."}]}, {"id": 841, "image": "COCO_val2014_000000255934.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cow, elephant, handbag, generate a short caption for this image."}]}, {"id": 847, "image": "COCO_val2014_000000278627.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, dining table, chair, fork, bottle, potted plant, cup, generate a short caption for this image."}]}, {"id": 853, "image": "COCO_val2014_000000400851.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, sports ball, boat, chair, bench, umbrella, generate a short caption for this image."}]}, {"id": 859, "image": "COCO_val2014_000000472610.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, cow, generate a short caption for this image."}]}, {"id": 865, "image": "COCO_val2014_000000078565.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, boat, generate a short caption for this image."}]}, {"id": 871, "image": "COCO_val2014_000000388237.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, skateboard, bottle, generate a short caption for this image."}]}, {"id": 877, "image": "COCO_val2014_000000289640.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, frisbee, dining table, chair, orange, cup, generate a short caption for this image."}]}, {"id": 883, "image": "COCO_val2014_000000122602.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: scissors, bottle, dining table, generate a short caption for this image."}]}, {"id": 889, "image": "COCO_val2014_000000024104.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: laptop, dining table, bed, tv, mouse, keyboard, book, cup, generate a short caption for this image."}]}, {"id": 895, "image": "COCO_val2014_000000183709.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, car, handbag, generate a short caption for this image."}]}, {"id": 901, "image": "COCO_val2014_000000059710.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, refrigerator, dining table, generate a short caption for this image."}]}, {"id": 907, "image": "COCO_val2014_000000279407.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, bottle, bench, generate a short caption for this image."}]}, {"id": 913, "image": "COCO_val2014_000000314729.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 919, "image": "COCO_val2014_000000527497.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, mouse, laptop, dining table, keyboard, generate a short caption for this image."}]}, {"id": 925, "image": "COCO_val2014_000000411564.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bottle, dining table, chair, refrigerator, cell phone, generate a short caption for this image."}]}, {"id": 931, "image": "COCO_val2014_000000320350.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, skis, generate a short caption for this image."}]}, {"id": 937, "image": "COCO_val2014_000000401857.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, bench, baseball bat, cup, generate a short caption for this image."}]}, {"id": 943, "image": "COCO_val2014_000000285302.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, dining table, scissors, generate a short caption for this image."}]}, {"id": 949, "image": "COCO_val2014_000000061959.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, truck, car, bus, generate a short caption for this image."}]}, {"id": 955, "image": "COCO_val2014_000000229659.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, clock, bottle, laptop, dining table, chair, generate a short caption for this image."}]}, {"id": 961, "image": "COCO_val2014_000000473118.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bicycle, skateboard, generate a short caption for this image."}]}, {"id": 967, "image": "COCO_val2014_000000391222.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 973, "image": "COCO_val2014_000000416104.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, bottle, dining table, chair, umbrella, generate a short caption for this image."}]}, {"id": 979, "image": "COCO_val2014_000000511852.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 985, "image": "COCO_val2014_000000059386.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: giraffe, car, zebra, generate a short caption for this image."}]}, {"id": 991, "image": "COCO_val2014_000000540180.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, car, generate a short caption for this image."}]}, {"id": 997, "image": "COCO_val2014_000000153692.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, dog, bicycle, car, traffic light, generate a short caption for this image."}]}, {"id": 1003, "image": "COCO_val2014_000000265628.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, toilet, generate a short caption for this image."}]}, {"id": 1009, "image": "COCO_val2014_000000499594.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: broccoli, bottle, fork, dining table, knife, cup, generate a short caption for this image."}]}, {"id": 1015, "image": "COCO_val2014_000000005934.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, bench, sports ball, generate a short caption for this image."}]}, {"id": 1021, "image": "COCO_val2014_000000047828.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, boat, generate a short caption for this image."}]}, {"id": 1027, "image": "COCO_val2014_000000047916.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, truck, car, frisbee, bench, generate a short caption for this image."}]}, {"id": 1033, "image": "COCO_val2014_000000255483.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, dining table, book, scissors, generate a short caption for this image."}]}, {"id": 1039, "image": "COCO_val2014_000000131138.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, person, laptop, dining table, tv, mouse, potted plant, keyboard, cup, generate a short caption for this image."}]}, {"id": 1045, "image": "COCO_val2014_000000258209.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, book, bicycle, generate a short caption for this image."}]}, {"id": 1051, "image": "COCO_val2014_000000347453.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, car, bench, bus, generate a short caption for this image."}]}, {"id": 1057, "image": "COCO_val2014_000000285820.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, bowl, couch, dining table, book, bench, cell phone, generate a short caption for this image."}]}, {"id": 1063, "image": "COCO_val2014_000000245664.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: laptop, dining table, tv, wine glass, mouse, keyboard, book, cup, generate a short caption for this image."}]}, {"id": 1069, "image": "COCO_val2014_000000453731.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, bottle, cup, generate a short caption for this image."}]}, {"id": 1075, "image": "COCO_val2014_000000524601.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, remote, chair, refrigerator, book, generate a short caption for this image."}]}, {"id": 1081, "image": "COCO_val2014_000000253262.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, suitcase, airplane, tie, generate a short caption for this image."}]}, {"id": 1087, "image": "COCO_val2014_000000087052.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, bottle, vase, refrigerator, oven, cup, generate a short caption for this image."}]}, {"id": 1093, "image": "COCO_val2014_000000357578.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, traffic light, car, generate a short caption for this image."}]}, {"id": 1099, "image": "COCO_val2014_000000418692.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, remote, chair, book, cell phone, generate a short caption for this image."}]}, {"id": 1105, "image": "COCO_val2014_000000259345.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, umbrella, generate a short caption for this image."}]}, {"id": 1111, "image": "COCO_val2014_000000119709.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, donut, dining table, generate a short caption for this image."}]}, {"id": 1117, "image": "COCO_val2014_000000443299.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, bench, cell phone, generate a short caption for this image."}]}, {"id": 1123, "image": "COCO_val2014_000000120776.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tie, person, umbrella, generate a short caption for this image."}]}, {"id": 1129, "image": "COCO_val2014_000000411765.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1135, "image": "COCO_val2014_000000447991.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, truck, handbag, bicycle, car, cell phone, generate a short caption for this image."}]}, {"id": 1141, "image": "COCO_val2014_000000276719.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, truck, handbag, car, chair, motorcycle, generate a short caption for this image."}]}, {"id": 1147, "image": "COCO_val2014_000000422328.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, skis, horse, car, snowboard, generate a short caption for this image."}]}, {"id": 1153, "image": "COCO_val2014_000000354559.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, teddy bear, bear, generate a short caption for this image."}]}, {"id": 1159, "image": "COCO_val2014_000000438788.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, person, cake, dining table, generate a short caption for this image."}]}, {"id": 1165, "image": "COCO_val2014_000000468005.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, elephant, generate a short caption for this image."}]}, {"id": 1171, "image": "COCO_val2014_000000545594.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, backpack, person, orange, generate a short caption for this image."}]}, {"id": 1177, "image": "COCO_val2014_000000219546.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bowl, knife, fork, bottle, dining table, chair, generate a short caption for this image."}]}, {"id": 1183, "image": "COCO_val2014_000000386645.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, motorcycle, car, generate a short caption for this image."}]}, {"id": 1189, "image": "COCO_val2014_000000155131.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, bench, tie, generate a short caption for this image."}]}, {"id": 1195, "image": "COCO_val2014_000000188752.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, apple, generate a short caption for this image."}]}, {"id": 1201, "image": "COCO_val2014_000000228350.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, chair, umbrella, tie, wine glass, cup, generate a short caption for this image."}]}, {"id": 1207, "image": "COCO_val2014_000000437720.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, laptop, dining table, chair, vase, generate a short caption for this image."}]}, {"id": 1213, "image": "COCO_val2014_000000378831.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, fire hydrant, generate a short caption for this image."}]}, {"id": 1219, "image": "COCO_val2014_000000375503.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, clock, person, laptop, dining table, chair, generate a short caption for this image."}]}, {"id": 1225, "image": "COCO_val2014_000000330369.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, boat, generate a short caption for this image."}]}, {"id": 1231, "image": "COCO_val2014_000000288576.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bicycle, frisbee, generate a short caption for this image."}]}, {"id": 1237, "image": "COCO_val2014_000000032190.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, horse, generate a short caption for this image."}]}, {"id": 1243, "image": "COCO_val2014_000000535403.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, potted plant, bottle, sink, hair drier, toilet, generate a short caption for this image."}]}, {"id": 1249, "image": "COCO_val2014_000000449798.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: mouse, laptop, dining table, keyboard, book, cell phone, generate a short caption for this image."}]}, {"id": 1255, "image": "COCO_val2014_000000375769.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, dining table, chair, bicycle, car, hot dog, cup, umbrella, generate a short caption for this image."}]}, {"id": 1261, "image": "COCO_val2014_000000426172.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bottle, dining table, knife, chair, book, generate a short caption for this image."}]}, {"id": 1267, "image": "COCO_val2014_000000020650.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, broccoli, sandwich, fork, dining table, orange, generate a short caption for this image."}]}, {"id": 1273, "image": "COCO_val2014_000000138078.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, generate a short caption for this image."}]}, {"id": 1279, "image": "COCO_val2014_000000152499.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, surfboard, potted plant, dining table, chair, umbrella, generate a short caption for this image."}]}, {"id": 1285, "image": "COCO_val2014_000000180764.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bottle, microwave, donut, generate a short caption for this image."}]}, {"id": 1291, "image": "COCO_val2014_000000147762.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: fork, cake, dining table, generate a short caption for this image."}]}, {"id": 1297, "image": "COCO_val2014_000000113860.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, boat, umbrella, generate a short caption for this image."}]}, {"id": 1303, "image": "COCO_val2014_000000460866.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, dining table, sink, toothbrush, cup, generate a short caption for this image."}]}, {"id": 1309, "image": "COCO_val2014_000000177015.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, laptop, chair, refrigerator, apple, cat, generate a short caption for this image."}]}, {"id": 1315, "image": "COCO_val2014_000000333998.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, sink, refrigerator, suitcase, bottle, microwave, oven, cup, generate a short caption for this image."}]}, {"id": 1321, "image": "COCO_val2014_000000047131.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, car, generate a short caption for this image."}]}, {"id": 1327, "image": "COCO_val2014_000000425447.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, fire hydrant, generate a short caption for this image."}]}, {"id": 1333, "image": "COCO_val2014_000000295105.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, bottle, sink, refrigerator, apple, oven, generate a short caption for this image."}]}, {"id": 1339, "image": "COCO_val2014_000000443259.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, dining table, chair, sink, refrigerator, orange, potted plant, bottle, oven, generate a short caption for this image."}]}, {"id": 1345, "image": "COCO_val2014_000000537861.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, bench, generate a short caption for this image."}]}, {"id": 1351, "image": "COCO_val2014_000000145020.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, teddy bear, mouse, generate a short caption for this image."}]}, {"id": 1357, "image": "COCO_val2014_000000249969.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, spoon, broccoli, dining table, knife, wine glass, fork, carrot, generate a short caption for this image."}]}, {"id": 1363, "image": "COCO_val2014_000000146504.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, mouse, laptop, dining table, keyboard, cup, generate a short caption for this image."}]}, {"id": 1369, "image": "COCO_val2014_000000040037.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, car, person, generate a short caption for this image."}]}, {"id": 1375, "image": "COCO_val2014_000000343218.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: car, person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1381, "image": "COCO_val2014_000000301837.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, sink, refrigerator, apple, oven, generate a short caption for this image."}]}, {"id": 1387, "image": "COCO_val2014_000000309071.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, bicycle, potted plant, bench, generate a short caption for this image."}]}, {"id": 1393, "image": "COCO_val2014_000000240681.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, motorcycle, car, generate a short caption for this image."}]}, {"id": 1399, "image": "COCO_val2014_000000178753.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, cell phone, generate a short caption for this image."}]}, {"id": 1405, "image": "COCO_val2014_000000278890.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cell phone, dining table, generate a short caption for this image."}]}, {"id": 1411, "image": "COCO_val2014_000000047781.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, vase, bed, book, generate a short caption for this image."}]}, {"id": 1417, "image": "COCO_val2014_000000542792.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 1423, "image": "COCO_val2014_000000293703.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, chair, knife, bottle, fork, pizza, cup, generate a short caption for this image."}]}, {"id": 1429, "image": "COCO_val2014_000000318777.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, baseball glove, person, sports ball, generate a short caption for this image."}]}, {"id": 1435, "image": "COCO_val2014_000000014773.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, person, dining table, bottle, fork, hot dog, car, cup, generate a short caption for this image."}]}, {"id": 1441, "image": "COCO_val2014_000000521863.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bicycle, skateboard, generate a short caption for this image."}]}, {"id": 1447, "image": "COCO_val2014_000000250540.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, car, motorcycle, cell phone, generate a short caption for this image."}]}, {"id": 1453, "image": "COCO_val2014_000000211604.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1459, "image": "COCO_val2014_000000298067.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, cup, bottle, dining table, generate a short caption for this image."}]}, {"id": 1465, "image": "COCO_val2014_000000563938.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, bottle, bench, tie, generate a short caption for this image."}]}, {"id": 1471, "image": "COCO_val2014_000000461311.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, car, kite, generate a short caption for this image."}]}, {"id": 1477, "image": "COCO_val2014_000000047940.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: book, person, bed, laptop, generate a short caption for this image."}]}, {"id": 1483, "image": "COCO_val2014_000000261318.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, bench, chair, suitcase, cell phone, generate a short caption for this image."}]}, {"id": 1489, "image": "COCO_val2014_000000134768.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: keyboard, mouse, laptop, dining table, generate a short caption for this image."}]}, {"id": 1495, "image": "COCO_val2014_000000526414.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: scissors, bowl, laptop, remote, dining table, tv, couch, cat, book, cup, generate a short caption for this image."}]}, {"id": 1501, "image": "COCO_val2014_000000273132.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, handbag, generate a short caption for this image."}]}, {"id": 1507, "image": "COCO_val2014_000000505213.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sandwich, hot dog, generate a short caption for this image."}]}, {"id": 1513, "image": "COCO_val2014_000000192656.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, generate a short caption for this image."}]}, {"id": 1519, "image": "COCO_val2014_000000272728.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, vase, potted plant, dining table, generate a short caption for this image."}]}, {"id": 1525, "image": "COCO_val2014_000000509811.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, laptop, dining table, chair, apple, book, generate a short caption for this image."}]}, {"id": 1531, "image": "COCO_val2014_000000412407.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, person, cell phone, generate a short caption for this image."}]}, {"id": 1537, "image": "COCO_val2014_000000353658.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, clock, handbag, suitcase, carrot, umbrella, generate a short caption for this image."}]}, {"id": 1543, "image": "COCO_val2014_000000441363.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1549, "image": "COCO_val2014_000000331084.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, bowl, dining table, knife, pizza, carrot, generate a short caption for this image."}]}, {"id": 1555, "image": "COCO_val2014_000000282553.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, cell phone, generate a short caption for this image."}]}, {"id": 1561, "image": "COCO_val2014_000000328818.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bicycle, bottle, bench, generate a short caption for this image."}]}, {"id": 1567, "image": "COCO_val2014_000000003103.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, skateboard, person, generate a short caption for this image."}]}, {"id": 1573, "image": "COCO_val2014_000000188136.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, dining table, sink, orange, carrot, generate a short caption for this image."}]}, {"id": 1579, "image": "COCO_val2014_000000449675.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, dining table, knife, sandwich, fork, bottle, carrot, cup, generate a short caption for this image."}]}, {"id": 1585, "image": "COCO_val2014_000000191691.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: toothbrush, bottle, person, generate a short caption for this image."}]}, {"id": 1591, "image": "COCO_val2014_000000089378.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, motorcycle, bicycle, generate a short caption for this image."}]}, {"id": 1597, "image": "COCO_val2014_000000099874.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, bottle, dining table, cat, cell phone, generate a short caption for this image."}]}, {"id": 1603, "image": "COCO_val2014_000000437895.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, potted plant, bed, generate a short caption for this image."}]}, {"id": 1609, "image": "COCO_val2014_000000557447.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, fork, dining table, knife, tie, generate a short caption for this image."}]}, {"id": 1615, "image": "COCO_val2014_000000159979.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, boat, umbrella, generate a short caption for this image."}]}, {"id": 1621, "image": "COCO_val2014_000000000357.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, baseball bat, generate a short caption for this image."}]}, {"id": 1627, "image": "COCO_val2014_000000538891.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1633, "image": "COCO_val2014_000000561967.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, train, car, generate a short caption for this image."}]}, {"id": 1639, "image": "COCO_val2014_000000565778.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, train, car, generate a short caption for this image."}]}, {"id": 1645, "image": "COCO_val2014_000000007125.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, dog, bicycle, bottle, generate a short caption for this image."}]}, {"id": 1651, "image": "COCO_val2014_000000221350.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, dining table, chair, tie, generate a short caption for this image."}]}, {"id": 1657, "image": "COCO_val2014_000000280930.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: refrigerator, person, bottle, oven, generate a short caption for this image."}]}, {"id": 1663, "image": "COCO_val2014_000000151081.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, airplane, generate a short caption for this image."}]}, {"id": 1669, "image": "COCO_val2014_000000122232.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1675, "image": "COCO_val2014_000000134223.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, broccoli, hot dog, dining table, generate a short caption for this image."}]}, {"id": 1681, "image": "COCO_val2014_000000198163.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, generate a short caption for this image."}]}, {"id": 1687, "image": "COCO_val2014_000000003742.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, bowl, apple, dining table, generate a short caption for this image."}]}, {"id": 1693, "image": "COCO_val2014_000000217593.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, handbag, chair, baseball bat, generate a short caption for this image."}]}, {"id": 1699, "image": "COCO_val2014_000000279078.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: refrigerator, bottle, person, generate a short caption for this image."}]}, {"id": 1705, "image": "COCO_val2014_000000572081.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, sports ball, tennis racket, chair, generate a short caption for this image."}]}, {"id": 1711, "image": "COCO_val2014_000000477016.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, potted plant, sink, oven, generate a short caption for this image."}]}, {"id": 1717, "image": "COCO_val2014_000000301266.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, fork, dining table, sink, book, cup, generate a short caption for this image."}]}, {"id": 1723, "image": "COCO_val2014_000000475813.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, remote, generate a short caption for this image."}]}, {"id": 1729, "image": "COCO_val2014_000000558784.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, frisbee, generate a short caption for this image."}]}, {"id": 1735, "image": "COCO_val2014_000000308543.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, spoon, person, generate a short caption for this image."}]}, {"id": 1741, "image": "COCO_val2014_000000359164.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, dog, car, dining table, generate a short caption for this image."}]}, {"id": 1747, "image": "COCO_val2014_000000539951.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: refrigerator, sink, clock, oven, generate a short caption for this image."}]}, {"id": 1753, "image": "COCO_val2014_000000143479.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, hot dog, generate a short caption for this image."}]}, {"id": 1759, "image": "COCO_val2014_000000475024.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, orange, potted plant, dining table, generate a short caption for this image."}]}, {"id": 1765, "image": "COCO_val2014_000000238065.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, skateboard, generate a short caption for this image."}]}, {"id": 1771, "image": "COCO_val2014_000000314016.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, book, bottle, toilet, generate a short caption for this image."}]}, {"id": 1777, "image": "COCO_val2014_000000322163.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, remote, dining table, bottle, potted plant, pizza, microwave, oven, generate a short caption for this image."}]}, {"id": 1783, "image": "COCO_val2014_000000230983.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, skateboard, car, generate a short caption for this image."}]}, {"id": 1789, "image": "COCO_val2014_000000564629.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, refrigerator, microwave, oven, generate a short caption for this image."}]}, {"id": 1795, "image": "COCO_val2014_000000053431.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, laptop, generate a short caption for this image."}]}, {"id": 1801, "image": "COCO_val2014_000000441840.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, truck, person, car, baseball bat, generate a short caption for this image."}]}, {"id": 1807, "image": "COCO_val2014_000000053465.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, bowl, fork, dining table, cake, cup, generate a short caption for this image."}]}, {"id": 1813, "image": "COCO_val2014_000000381031.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bottle, dining table, donut, apple, microwave, cup, generate a short caption for this image."}]}, {"id": 1819, "image": "COCO_val2014_000000530520.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, clock, laptop, motorcycle, generate a short caption for this image."}]}, {"id": 1825, "image": "COCO_val2014_000000539555.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, skateboard, frisbee, generate a short caption for this image."}]}, {"id": 1831, "image": "COCO_val2014_000000279521.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, train, car, bus, generate a short caption for this image."}]}, {"id": 1837, "image": "COCO_val2014_000000560439.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, handbag, snowboard, generate a short caption for this image."}]}, {"id": 1843, "image": "COCO_val2014_000000248353.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: snowboard, person, skis, generate a short caption for this image."}]}, {"id": 1849, "image": "COCO_val2014_000000009628.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, surfboard, boat, generate a short caption for this image."}]}, {"id": 1855, "image": "COCO_val2014_000000330050.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, tennis racket, generate a short caption for this image."}]}, {"id": 1861, "image": "COCO_val2014_000000254161.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, kite, handbag, generate a short caption for this image."}]}, {"id": 1867, "image": "COCO_val2014_000000275749.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, laptop, dining table, chair, tv, mouse, bottle, cup, generate a short caption for this image."}]}, {"id": 1873, "image": "COCO_val2014_000000163518.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, chair, oven, generate a short caption for this image."}]}, {"id": 1879, "image": "COCO_val2014_000000156696.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, car, handbag, generate a short caption for this image."}]}, {"id": 1885, "image": "COCO_val2014_000000525202.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, dining table, chair, knife, bottle, fork, pizza, cup, generate a short caption for this image."}]}, {"id": 1891, "image": "COCO_val2014_000000423984.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: broccoli, carrot, dining table, generate a short caption for this image."}]}, {"id": 1897, "image": "COCO_val2014_000000499486.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bed, cat, book, generate a short caption for this image."}]}, {"id": 1903, "image": "COCO_val2014_000000118124.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, tie, generate a short caption for this image."}]}, {"id": 1909, "image": "COCO_val2014_000000288729.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, elephant, cake, teddy bear, bear, generate a short caption for this image."}]}, {"id": 1915, "image": "COCO_val2014_000000177721.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, skateboard, dining table, generate a short caption for this image."}]}, {"id": 1921, "image": "COCO_val2014_000000372220.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, horse, handbag, cell phone, generate a short caption for this image."}]}, {"id": 1927, "image": "COCO_val2014_000000252625.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, bicycle, potted plant, generate a short caption for this image."}]}, {"id": 1933, "image": "COCO_val2014_000000493610.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, car, person, bus, generate a short caption for this image."}]}, {"id": 1939, "image": "COCO_val2014_000000302789.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, handbag, bottle, sink, refrigerator, suitcase, bed, generate a short caption for this image."}]}, {"id": 1945, "image": "COCO_val2014_000000030667.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, boat, generate a short caption for this image."}]}, {"id": 1951, "image": "COCO_val2014_000000114108.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, dining table, chair, knife, fork, pizza, cup, generate a short caption for this image."}]}, {"id": 1957, "image": "COCO_val2014_000000328110.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, bench, handbag, generate a short caption for this image."}]}, {"id": 1963, "image": "COCO_val2014_000000366111.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, bicycle, car, generate a short caption for this image."}]}, {"id": 1969, "image": "COCO_val2014_000000532481.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, kite, surfboard, generate a short caption for this image."}]}, {"id": 1975, "image": "COCO_val2014_000000291634.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: motorcycle, person, bicycle, generate a short caption for this image."}]}, {"id": 1981, "image": "COCO_val2014_000000463678.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 1987, "image": "COCO_val2014_000000070815.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, dog, person, bicycle, generate a short caption for this image."}]}, {"id": 1993, "image": "COCO_val2014_000000367872.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, umbrella, generate a short caption for this image."}]}, {"id": 1999, "image": "COCO_val2014_000000353898.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, backpack, bench, cell phone, generate a short caption for this image."}]}, {"id": 2005, "image": "COCO_val2014_000000088269.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, spoon, sandwich, generate a short caption for this image."}]}, {"id": 2011, "image": "COCO_val2014_000000463211.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, bus, generate a short caption for this image."}]}, {"id": 2017, "image": "COCO_val2014_000000396269.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, vase, dining table, generate a short caption for this image."}]}, {"id": 2023, "image": "COCO_val2014_000000098979.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, microwave, oven, generate a short caption for this image."}]}, {"id": 2029, "image": "COCO_val2014_000000244455.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, bicycle, car, motorcycle, generate a short caption for this image."}]}, {"id": 2035, "image": "COCO_val2014_000000379443.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: scissors, orange, dining table, generate a short caption for this image."}]}, {"id": 2041, "image": "COCO_val2014_000000536154.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cat, teddy bear, generate a short caption for this image."}]}, {"id": 2047, "image": "COCO_val2014_000000374061.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: couch, bed, cat, teddy bear, bear, generate a short caption for this image."}]}, {"id": 2053, "image": "COCO_val2014_000000560018.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, surfboard, kite, generate a short caption for this image."}]}, {"id": 2059, "image": "COCO_val2014_000000353320.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, suitcase, handbag, generate a short caption for this image."}]}, {"id": 2065, "image": "COCO_val2014_000000256628.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, dog, bowl, generate a short caption for this image."}]}, {"id": 2071, "image": "COCO_val2014_000000158806.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, sandwich, generate a short caption for this image."}]}, {"id": 2077, "image": "COCO_val2014_000000424270.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, bed, generate a short caption for this image."}]}, {"id": 2083, "image": "COCO_val2014_000000368335.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: car, truck, horse, generate a short caption for this image."}]}, {"id": 2089, "image": "COCO_val2014_000000483227.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 2095, "image": "COCO_val2014_000000228306.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, bus, generate a short caption for this image."}]}, {"id": 2101, "image": "COCO_val2014_000000416384.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, cell phone, generate a short caption for this image."}]}, {"id": 2107, "image": "COCO_val2014_000000364783.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: potted plant, bottle, chair, sink, toothbrush, toilet, cup, generate a short caption for this image."}]}, {"id": 2113, "image": "COCO_val2014_000000335041.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, skis, generate a short caption for this image."}]}, {"id": 2119, "image": "COCO_val2014_000000531327.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, broccoli, wine glass, fork, dining table, knife, carrot, generate a short caption for this image."}]}, {"id": 2125, "image": "COCO_val2014_000000451228.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, bicycle, bus, generate a short caption for this image."}]}, {"id": 2131, "image": "COCO_val2014_000000256365.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: cup, cake, dining table, generate a short caption for this image."}]}, {"id": 2137, "image": "COCO_val2014_000000502936.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 2143, "image": "COCO_val2014_000000451449.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, bed, book, generate a short caption for this image."}]}, {"id": 2149, "image": "COCO_val2014_000000051054.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bottle, bed, cell phone, cup, generate a short caption for this image."}]}, {"id": 2155, "image": "COCO_val2014_000000001268.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, backpack, handbag, boat, cell phone, generate a short caption for this image."}]}, {"id": 2161, "image": "COCO_val2014_000000062692.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 2167, "image": "COCO_val2014_000000542771.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, potted plant, dining table, chair, vase, book, generate a short caption for this image."}]}, {"id": 2173, "image": "COCO_val2014_000000575406.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, potted plant, bus, generate a short caption for this image."}]}, {"id": 2179, "image": "COCO_val2014_000000244834.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, clock, handbag, bicycle, potted plant, car, umbrella, generate a short caption for this image."}]}, {"id": 2185, "image": "COCO_val2014_000000151159.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, skateboard, generate a short caption for this image."}]}, {"id": 2191, "image": "COCO_val2014_000000277073.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, bicycle, car, motorcycle, generate a short caption for this image."}]}, {"id": 2197, "image": "COCO_val2014_000000377879.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, train, bed, generate a short caption for this image."}]}, {"id": 2203, "image": "COCO_val2014_000000436814.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, mouse, laptop, dining table, generate a short caption for this image."}]}, {"id": 2209, "image": "COCO_val2014_000000063934.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, skateboard, car, generate a short caption for this image."}]}, {"id": 2215, "image": "COCO_val2014_000000275611.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, pizza, dining table, generate a short caption for this image."}]}, {"id": 2221, "image": "COCO_val2014_000000516248.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, mouse, laptop, dining table, keyboard, generate a short caption for this image."}]}, {"id": 2227, "image": "COCO_val2014_000000478380.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, sheep, generate a short caption for this image."}]}, {"id": 2233, "image": "COCO_val2014_000000422326.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, bicycle, car, motorcycle, generate a short caption for this image."}]}, {"id": 2239, "image": "COCO_val2014_000000110449.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, remote, laptop, dining table, generate a short caption for this image."}]}, {"id": 2245, "image": "COCO_val2014_000000045550.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, clock, sandwich, person, generate a short caption for this image."}]}, {"id": 2251, "image": "COCO_val2014_000000444913.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, frisbee, generate a short caption for this image."}]}, {"id": 2257, "image": "COCO_val2014_000000082709.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, backpack, bottle, frisbee, cup, generate a short caption for this image."}]}, {"id": 2263, "image": "COCO_val2014_000000132796.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, chair, bench, elephant, generate a short caption for this image."}]}, {"id": 2269, "image": "COCO_val2014_000000303099.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bottle, cat, toilet, generate a short caption for this image."}]}, {"id": 2275, "image": "COCO_val2014_000000271565.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, motorcycle, car, generate a short caption for this image."}]}, {"id": 2281, "image": "COCO_val2014_000000205193.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, toaster, microwave, oven, generate a short caption for this image."}]}, {"id": 2287, "image": "COCO_val2014_000000272566.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, apple, cup, generate a short caption for this image."}]}, {"id": 2293, "image": "COCO_val2014_000000170099.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, orange, tie, generate a short caption for this image."}]}, {"id": 2299, "image": "COCO_val2014_000000014940.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, couch, chair, orange, cat, generate a short caption for this image."}]}, {"id": 2305, "image": "COCO_val2014_000000408449.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, truck, skateboard, handbag, car, bus, generate a short caption for this image."}]}, {"id": 2311, "image": "COCO_val2014_000000140702.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, person, elephant, generate a short caption for this image."}]}, {"id": 2317, "image": "COCO_val2014_000000354540.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, toaster, laptop, chair, microwave, cell phone, oven, generate a short caption for this image."}]}, {"id": 2323, "image": "COCO_val2014_000000019763.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, knife, fork, dining table, chair, pizza, cup, generate a short caption for this image."}]}, {"id": 2329, "image": "COCO_val2014_000000191301.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, spoon, dining table, cake, cup, generate a short caption for this image."}]}, {"id": 2335, "image": "COCO_val2014_000000124236.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bowl, oven, dining table, toaster, generate a short caption for this image."}]}, {"id": 2341, "image": "COCO_val2014_000000344930.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: orange, car, fire hydrant, generate a short caption for this image."}]}, {"id": 2347, "image": "COCO_val2014_000000014285.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, remote, chair, bed, cat, book, generate a short caption for this image."}]}, {"id": 2353, "image": "COCO_val2014_000000082488.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, remote, dining table, orange, bed, book, generate a short caption for this image."}]}, {"id": 2359, "image": "COCO_val2014_000000348251.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 2365, "image": "COCO_val2014_000000150703.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dining table, sandwich, hot dog, cup, generate a short caption for this image."}]}, {"id": 2371, "image": "COCO_val2014_000000381615.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: potted plant, chair, vase, book, cup, generate a short caption for this image."}]}, {"id": 2377, "image": "COCO_val2014_000000352703.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, spoon, dining table, handbag, bottle, oven, cup, generate a short caption for this image."}]}, {"id": 2383, "image": "COCO_val2014_000000111109.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, chair, motorcycle, tie, generate a short caption for this image."}]}, {"id": 2389, "image": "COCO_val2014_000000226802.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, car, bus, generate a short caption for this image."}]}, {"id": 2395, "image": "COCO_val2014_000000049005.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, baseball bat, generate a short caption for this image."}]}, {"id": 2401, "image": "COCO_val2014_000000410114.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, person, horse, cow, generate a short caption for this image."}]}, {"id": 2407, "image": "COCO_val2014_000000100274.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, train, car, generate a short caption for this image."}]}, {"id": 2413, "image": "COCO_val2014_000000317254.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, car, generate a short caption for this image."}]}, {"id": 2419, "image": "COCO_val2014_000000425569.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, bowl, oven, generate a short caption for this image."}]}, {"id": 2425, "image": "COCO_val2014_000000028842.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, skateboard, generate a short caption for this image."}]}, {"id": 2431, "image": "COCO_val2014_000000319607.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, backpack, bicycle, car, bus, generate a short caption for this image."}]}, {"id": 2437, "image": "COCO_val2014_000000210372.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, refrigerator, bottle, generate a short caption for this image."}]}, {"id": 2443, "image": "COCO_val2014_000000156915.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, potted plant, dining table, generate a short caption for this image."}]}, {"id": 2449, "image": "COCO_val2014_000000349642.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, bench, generate a short caption for this image."}]}, {"id": 2455, "image": "COCO_val2014_000000423008.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, boat, suitcase, umbrella, generate a short caption for this image."}]}, {"id": 2461, "image": "COCO_val2014_000000045435.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cell phone, cup, generate a short caption for this image."}]}, {"id": 2467, "image": "COCO_val2014_000000035827.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, bottle, laptop, dining table, chair, cup, generate a short caption for this image."}]}, {"id": 2473, "image": "COCO_val2014_000000072656.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, chair, pizza, suitcase, generate a short caption for this image."}]}, {"id": 2479, "image": "COCO_val2014_000000372087.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, train, suitcase, bench, generate a short caption for this image."}]}, {"id": 2485, "image": "COCO_val2014_000000336209.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bench, skateboard, generate a short caption for this image."}]}, {"id": 2491, "image": "COCO_val2014_000000046031.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, mouse, laptop, dining table, keyboard, cell phone, generate a short caption for this image."}]}, {"id": 2497, "image": "COCO_val2014_000000283118.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, dining table, chair, backpack, handbag, car, bottle, pizza, cell phone, generate a short caption for this image."}]}, {"id": 2503, "image": "COCO_val2014_000000429598.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, refrigerator, bottle, oven, generate a short caption for this image."}]}, {"id": 2509, "image": "COCO_val2014_000000333365.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, airplane, car, person, generate a short caption for this image."}]}, {"id": 2515, "image": "COCO_val2014_000000424174.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, bowl, dining table, orange, apple, generate a short caption for this image."}]}, {"id": 2521, "image": "COCO_val2014_000000003035.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, airplane, car, person, generate a short caption for this image."}]}, {"id": 2527, "image": "COCO_val2014_000000177213.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, fork, dining table, knife, pizza, cup, generate a short caption for this image."}]}, {"id": 2533, "image": "COCO_val2014_000000438535.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, clock, car, person, generate a short caption for this image."}]}, {"id": 2539, "image": "COCO_val2014_000000490711.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: sink, refrigerator, banana, generate a short caption for this image."}]}, {"id": 2545, "image": "COCO_val2014_000000163728.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: mouse, laptop, dining table, cat, cup, generate a short caption for this image."}]}, {"id": 2551, "image": "COCO_val2014_000000456756.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: giraffe, person, zebra, generate a short caption for this image."}]}, {"id": 2557, "image": "COCO_val2014_000000281028.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, train, car, generate a short caption for this image."}]}, {"id": 2563, "image": "COCO_val2014_000000343937.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: snowboard, person, skis, generate a short caption for this image."}]}, {"id": 2569, "image": "COCO_val2014_000000277051.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bird, knife, bottle, dining table, chair, generate a short caption for this image."}]}, {"id": 2575, "image": "COCO_val2014_000000145666.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, broccoli, carrot, dining table, generate a short caption for this image."}]}, {"id": 2581, "image": "COCO_val2014_000000304984.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, wine glass, sandwich, fork, orange, cake, generate a short caption for this image."}]}, {"id": 2587, "image": "COCO_val2014_000000122467.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, couch, remote, dining table, generate a short caption for this image."}]}, {"id": 2593, "image": "COCO_val2014_000000518287.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dining table, toothbrush, orange, cup, generate a short caption for this image."}]}, {"id": 2599, "image": "COCO_val2014_000000145448.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, dining table, chair, motorcycle, cat, generate a short caption for this image."}]}, {"id": 2605, "image": "COCO_val2014_000000352900.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, bowl, broccoli, fork, generate a short caption for this image."}]}, {"id": 2611, "image": "COCO_val2014_000000153607.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, bowl, orange, dining table, generate a short caption for this image."}]}, {"id": 2617, "image": "COCO_val2014_000000157581.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, bottle, dining table, chair, cup, generate a short caption for this image."}]}, {"id": 2623, "image": "COCO_val2014_000000227438.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, clock, couch, bed, generate a short caption for this image."}]}, {"id": 2629, "image": "COCO_val2014_000000116037.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, baseball bat, generate a short caption for this image."}]}, {"id": 2635, "image": "COCO_val2014_000000202658.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, bowl, toilet, generate a short caption for this image."}]}, {"id": 2641, "image": "COCO_val2014_000000018150.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, couch, bottle, pizza, generate a short caption for this image."}]}, {"id": 2647, "image": "COCO_val2014_000000418711.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, truck, person, bicycle, car, generate a short caption for this image."}]}, {"id": 2653, "image": "COCO_val2014_000000092107.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bowl, broccoli, dining table, generate a short caption for this image."}]}, {"id": 2659, "image": "COCO_val2014_000000491000.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, handbag, car, motorcycle, generate a short caption for this image."}]}, {"id": 2665, "image": "COCO_val2014_000000401163.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, bench, generate a short caption for this image."}]}, {"id": 2671, "image": "COCO_val2014_000000039663.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, scissors, laptop, cell phone, book, generate a short caption for this image."}]}, {"id": 2677, "image": "COCO_val2014_000000268293.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 2683, "image": "COCO_val2014_000000474499.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, spoon, clock, bottle, sink, apple, oven, generate a short caption for this image."}]}, {"id": 2689, "image": "COCO_val2014_000000358086.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, bottle, frisbee, chair, generate a short caption for this image."}]}, {"id": 2695, "image": "COCO_val2014_000000089541.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, bowl, cake, dining table, generate a short caption for this image."}]}, {"id": 2701, "image": "COCO_val2014_000000239445.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, frisbee, generate a short caption for this image."}]}, {"id": 2707, "image": "COCO_val2014_000000105782.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, cat, car, kite, generate a short caption for this image."}]}, {"id": 2713, "image": "COCO_val2014_000000487990.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, spoon, dining table, knife, donut, cake, fork, bottle, cup, generate a short caption for this image."}]}, {"id": 2719, "image": "COCO_val2014_000000138834.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, skis, generate a short caption for this image."}]}, {"id": 2725, "image": "COCO_val2014_000000492057.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, cell phone, tie, generate a short caption for this image."}]}, {"id": 2731, "image": "COCO_val2014_000000388215.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, tennis racket, generate a short caption for this image."}]}, {"id": 2737, "image": "COCO_val2014_000000111409.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, bicycle, generate a short caption for this image."}]}, {"id": 2743, "image": "COCO_val2014_000000451391.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, broccoli, dining table, generate a short caption for this image."}]}, {"id": 2749, "image": "COCO_val2014_000000092462.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, car, cow, generate a short caption for this image."}]}, {"id": 2755, "image": "COCO_val2014_000000397268.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, bicycle, potted plant, bench, generate a short caption for this image."}]}, {"id": 2761, "image": "COCO_val2014_000000127659.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, person, train, bench, generate a short caption for this image."}]}, {"id": 2767, "image": "COCO_val2014_000000251857.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, skateboard, car, generate a short caption for this image."}]}, {"id": 2773, "image": "COCO_val2014_000000243212.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, truck, person, car, frisbee, generate a short caption for this image."}]}, {"id": 2779, "image": "COCO_val2014_000000444304.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, person, bed, remote, generate a short caption for this image."}]}, {"id": 2785, "image": "COCO_val2014_000000305000.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, apple, generate a short caption for this image."}]}, {"id": 2791, "image": "COCO_val2014_000000044136.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, wine glass, bottle, dining table, tie, generate a short caption for this image."}]}, {"id": 2797, "image": "COCO_val2014_000000154846.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: pizza, person, dining table, generate a short caption for this image."}]}, {"id": 2803, "image": "COCO_val2014_000000567254.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, vase, person, bottle, generate a short caption for this image."}]}, {"id": 2809, "image": "COCO_val2014_000000079261.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, clock, handbag, person, generate a short caption for this image."}]}, {"id": 2815, "image": "COCO_val2014_000000301402.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, potted plant, bottle, laptop, dining table, chair, generate a short caption for this image."}]}, {"id": 2821, "image": "COCO_val2014_000000415828.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: traffic light, person, handbag, backpack, car, stop sign, umbrella, generate a short caption for this image."}]}, {"id": 2827, "image": "COCO_val2014_000000543231.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, remote, bed, book, generate a short caption for this image."}]}, {"id": 2833, "image": "COCO_val2014_000000495357.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, handbag, motorcycle, person, generate a short caption for this image."}]}, {"id": 2839, "image": "COCO_val2014_000000523175.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: spoon, bowl, carrot, dining table, generate a short caption for this image."}]}, {"id": 2845, "image": "COCO_val2014_000000184945.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: motorcycle, bicycle, potted plant, generate a short caption for this image."}]}, {"id": 2851, "image": "COCO_val2014_000000049942.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tie, person, umbrella, generate a short caption for this image."}]}, {"id": 2857, "image": "COCO_val2014_000000223019.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, handbag, train, car, suitcase, generate a short caption for this image."}]}, {"id": 2863, "image": "COCO_val2014_000000042889.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: keyboard, teddy bear, bear, generate a short caption for this image."}]}, {"id": 2869, "image": "COCO_val2014_000000022112.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: vase, bowl, potted plant, dining table, generate a short caption for this image."}]}, {"id": 2875, "image": "COCO_val2014_000000247587.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, car, surfboard, generate a short caption for this image."}]}, {"id": 2881, "image": "COCO_val2014_000000171962.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bicycle, car, bus, generate a short caption for this image."}]}, {"id": 2887, "image": "COCO_val2014_000000545056.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: dog, car, cow, sheep, generate a short caption for this image."}]}, {"id": 2893, "image": "COCO_val2014_000000401411.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, bowl, dining table, book, carrot, oven, cup, generate a short caption for this image."}]}, {"id": 2899, "image": "COCO_val2014_000000002867.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: backpack, person, skis, generate a short caption for this image."}]}, {"id": 2905, "image": "COCO_val2014_000000246963.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, motorcycle, stop sign, generate a short caption for this image."}]}, {"id": 2911, "image": "COCO_val2014_000000498225.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, handbag, donut, cake, cell phone, generate a short caption for this image."}]}, {"id": 2917, "image": "COCO_val2014_000000124601.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: laptop, dining table, sink, wine glass, bottle, keyboard, microwave, cup, generate a short caption for this image."}]}, {"id": 2923, "image": "COCO_val2014_000000581749.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: broccoli, orange, dining table, generate a short caption for this image."}]}, {"id": 2929, "image": "COCO_val2014_000000227960.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, spoon, wine glass, carrot, cup, generate a short caption for this image."}]}, {"id": 2935, "image": "COCO_val2014_000000286503.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: person, sports ball, elephant, generate a short caption for this image."}]}, {"id": 2941, "image": "COCO_val2014_000000315740.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, chair, sink, tv, couch, potted plant, microwave, oven, generate a short caption for this image."}]}, {"id": 2947, "image": "COCO_val2014_000000142108.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: banana, person, dining table, generate a short caption for this image."}]}, {"id": 2953, "image": "COCO_val2014_000000542933.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: baseball glove, person, sports ball, chair, bench, baseball bat, generate a short caption for this image."}]}, {"id": 2959, "image": "COCO_val2014_000000475043.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, person, spoon, couch, laptop, bottle, generate a short caption for this image."}]}, {"id": 2965, "image": "COCO_val2014_000000456784.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: knife, person, cake, generate a short caption for this image."}]}, {"id": 2971, "image": "COCO_val2014_000000033753.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: tv, chair, bench, remote, generate a short caption for this image."}]}, {"id": 2977, "image": "COCO_val2014_000000448694.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: chair, person, frisbee, generate a short caption for this image."}]}, {"id": 2983, "image": "COCO_val2014_000000099053.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: bowl, broccoli, fork, dining table, carrot, generate a short caption for this image."}]}, {"id": 2989, "image": "COCO_val2014_000000503210.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: clock, couch, potted plant, chair, vase, orange, book, generate a short caption for this image."}]}, {"id": 2995, "image": "COCO_val2014_000000142257.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image. Generate a short caption of the image."}, {"from": "gpt", "value": "Focusing on the visible elements in this image: truck, traffic light, person, car, generate a short caption for this image."}]}]