[{"id": 1, "image": "COCO_val2014_000000310196.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 7, "image": "COCO_val2014_000000210789.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, umbrella."}]}, {"id": 13, "image": "COCO_val2014_000000429109.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, car, train, bus."}]}, {"id": 19, "image": "COCO_val2014_000000211674.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, potted plant, car, bus."}]}, {"id": 25, "image": "COCO_val2014_000000458338.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, car, clock."}]}, {"id": 31, "image": "COCO_val2014_000000283412.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, dining table, bed, cup."}]}, {"id": 37, "image": "COCO_val2014_000000265719.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, spoon, fork, donut, cake."}]}, {"id": 43, "image": "COCO_val2014_000000461331.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, toaster, microwave, oven."}]}, {"id": 49, "image": "COCO_val2014_000000544456.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 55, "image": "COCO_val2014_000000017708.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, person, boat."}]}, {"id": 61, "image": "COCO_val2014_000000574692.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, bottle."}]}, {"id": 67, "image": "COCO_val2014_000000353180.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, bus."}]}, {"id": 73, "image": "COCO_val2014_000000239444.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, chair, pizza, microwave, oven."}]}, {"id": 79, "image": "COCO_val2014_000000569839.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sandwich, hot dog, dining table, chair, cup."}]}, {"id": 85, "image": "COCO_val2014_000000219622.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, frisbee."}]}, {"id": 91, "image": "COCO_val2014_000000300368.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: knife, person, cake, dining table."}]}, {"id": 97, "image": "COCO_val2014_000000482476.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, remote, handbag, cell phone."}]}, {"id": 103, "image": "COCO_val2014_000000131115.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, baseball bat."}]}, {"id": 109, "image": "COCO_val2014_000000157084.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, bench, toilet."}]}, {"id": 115, "image": "COCO_val2014_000000381895.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, teddy bear, bear."}]}, {"id": 121, "image": "COCO_val2014_000000336872.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, spoon, sandwich, dining table."}]}, {"id": 127, "image": "COCO_val2014_000000075591.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bed, cat, book."}]}, {"id": 133, "image": "COCO_val2014_000000516916.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, dining table, chair, keyboard, cell phone."}]}, {"id": 139, "image": "COCO_val2014_000000542145.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, toothbrush, bottle, toilet."}]}, {"id": 145, "image": "COCO_val2014_000000218224.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, airplane, car."}]}, {"id": 151, "image": "COCO_val2014_000000297078.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 157, "image": "COCO_val2014_000000033270.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, couch, refrigerator, orange, book."}]}, {"id": 163, "image": "COCO_val2014_000000140583.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, cow, sheep."}]}, {"id": 169, "image": "COCO_val2014_000000421455.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, truck, person, car, cell phone."}]}, {"id": 175, "image": "COCO_val2014_000000288639.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, toothbrush, bottle, cat."}]}, {"id": 181, "image": "COCO_val2014_000000291936.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, umbrella."}]}, {"id": 187, "image": "COCO_val2014_000000063953.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, skateboard."}]}, {"id": 193, "image": "COCO_val2014_000000526321.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: laptop, remote, dining table, cell phone, tv, mouse, keyboard, book."}]}, {"id": 199, "image": "COCO_val2014_000000042190.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, surfboard."}]}, {"id": 205, "image": "COCO_val2014_000000553165.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, person, boat."}]}, {"id": 211, "image": "COCO_val2014_000000170517.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, car, umbrella."}]}, {"id": 217, "image": "COCO_val2014_000000498759.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, skateboard, car."}]}, {"id": 223, "image": "COCO_val2014_000000360600.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, fork, knife, pizza, cup."}]}, {"id": 229, "image": "COCO_val2014_000000031773.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, mouse, potted plant, laptop, dining table, keyboard."}]}, {"id": 235, "image": "COCO_val2014_000000500257.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, backpack, person, suitcase."}]}, {"id": 241, "image": "COCO_val2014_000000574057.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, train, bed."}]}, {"id": 247, "image": "COCO_val2014_000000456178.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, truck, clock, person, car."}]}, {"id": 253, "image": "COCO_val2014_000000565941.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, dining table, chair, cake, cup."}]}, {"id": 259, "image": "COCO_val2014_000000485564.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, person, sports ball."}]}, {"id": 265, "image": "COCO_val2014_000000454642.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, motorcycle, car."}]}, {"id": 271, "image": "COCO_val2014_000000205729.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 277, "image": "COCO_val2014_000000424792.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: couch, chair, vase, bed, cat."}]}, {"id": 283, "image": "COCO_val2014_000000329717.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, suitcase, handbag."}]}, {"id": 289, "image": "COCO_val2014_000000012333.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, horse."}]}, {"id": 295, "image": "COCO_val2014_000000480122.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, spoon, bowl, dining table, chair, knife, sink, refrigerator, bottle, oven."}]}, {"id": 301, "image": "COCO_val2014_000000515904.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, sink, potted plant, toilet."}]}, {"id": 307, "image": "COCO_val2014_000000437347.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball."}]}, {"id": 313, "image": "COCO_val2014_000000354229.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, bus."}]}, {"id": 319, "image": "COCO_val2014_000000538236.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, handbag, hot dog."}]}, {"id": 325, "image": "COCO_val2014_000000236865.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, car, frisbee."}]}, {"id": 331, "image": "COCO_val2014_000000217397.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, bottle, fork, dining table, cell phone."}]}, {"id": 337, "image": "COCO_val2014_000000060213.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, bed, book."}]}, {"id": 343, "image": "COCO_val2014_000000054025.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, backpack, handbag, car, bus."}]}, {"id": 349, "image": "COCO_val2014_000000084447.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, bicycle."}]}, {"id": 355, "image": "COCO_val2014_000000192660.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, refrigerator, bench, orange."}]}, {"id": 361, "image": "COCO_val2014_000000575755.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sandwich, carrot, broccoli, dining table."}]}, {"id": 367, "image": "COCO_val2014_000000354088.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, dining table, orange, apple, bus, umbrella."}]}, {"id": 373, "image": "COCO_val2014_000000311327.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, boat, handbag."}]}, {"id": 379, "image": "COCO_val2014_000000456552.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, skateboard."}]}, {"id": 385, "image": "COCO_val2014_000000350898.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, bottle, refrigerator, oven, cup."}]}, {"id": 391, "image": "COCO_val2014_000000170365.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, potted plant, toilet."}]}, {"id": 397, "image": "COCO_val2014_000000021645.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, clock, couch, dining table, chair, book."}]}, {"id": 403, "image": "COCO_val2014_000000528905.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, frisbee."}]}, {"id": 409, "image": "COCO_val2014_000000239347.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bed, tie."}]}, {"id": 415, "image": "COCO_val2014_000000007320.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, remote, dining table."}]}, {"id": 421, "image": "COCO_val2014_000000249715.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, remote."}]}, {"id": 427, "image": "COCO_val2014_000000080022.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 433, "image": "COCO_val2014_000000564336.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, truck, laptop, dining table, chair, parking meter, bottle, car."}]}, {"id": 439, "image": "COCO_val2014_000000231589.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, couch, bed."}]}, {"id": 445, "image": "COCO_val2014_000000465275.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, clock, couch, remote, vase."}]}, {"id": 451, "image": "COCO_val2014_000000083275.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 457, "image": "COCO_val2014_000000406403.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 463, "image": "COCO_val2014_000000131018.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, scissors, spoon, bottle, sink, cup."}]}, {"id": 469, "image": "COCO_val2014_000000332625.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, airplane, tv."}]}, {"id": 475, "image": "COCO_val2014_000000332025.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, backpack, skis, snowboard, cell phone."}]}, {"id": 481, "image": "COCO_val2014_000000318204.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, horse, sheep, umbrella."}]}, {"id": 487, "image": "COCO_val2014_000000455157.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, laptop, dining table, bench, umbrella."}]}, {"id": 493, "image": "COCO_val2014_000000069189.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, person, handbag, bicycle, car, umbrella."}]}, {"id": 499, "image": "COCO_val2014_000000199764.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, truck, person, handbag, dining table, chair, bench."}]}, {"id": 505, "image": "COCO_val2014_000000148766.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, bowl, toilet."}]}, {"id": 511, "image": "COCO_val2014_000000175506.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, frisbee."}]}, {"id": 517, "image": "COCO_val2014_000000353096.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, dining table, keyboard, apple."}]}, {"id": 523, "image": "COCO_val2014_000000207205.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, broccoli, carrot."}]}, {"id": 529, "image": "COCO_val2014_000000427113.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, clock."}]}, {"id": 535, "image": "COCO_val2014_000000498374.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, person, toilet."}]}, {"id": 541, "image": "COCO_val2014_000000013348.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, airplane."}]}, {"id": 547, "image": "COCO_val2014_000000081336.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, truck, person, car, stop sign."}]}, {"id": 553, "image": "COCO_val2014_000000190185.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, bowl, sandwich, hot dog."}]}, {"id": 559, "image": "COCO_val2014_000000209755.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, bowl, person, carrot, oven."}]}, {"id": 565, "image": "COCO_val2014_000000227227.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, car."}]}, {"id": 571, "image": "COCO_val2014_000000397705.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bed, cat, book, cup."}]}, {"id": 577, "image": "COCO_val2014_000000287305.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, bus."}]}, {"id": 583, "image": "COCO_val2014_000000257327.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, toilet, bottle."}]}, {"id": 589, "image": "COCO_val2014_000000557016.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, car, cow, motorcycle."}]}, {"id": 595, "image": "COCO_val2014_000000105732.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 601, "image": "COCO_val2014_000000534121.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, couch, boat, chair, book, cup."}]}, {"id": 607, "image": "COCO_val2014_000000520524.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: motorcycle, bicycle, car, dining table, orange."}]}, {"id": 613, "image": "COCO_val2014_000000580294.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, spoon, sink, cake, toothbrush, oven, cup."}]}, {"id": 619, "image": "COCO_val2014_000000374061.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: couch, bed, cat, teddy bear, bear."}]}, {"id": 625, "image": "COCO_val2014_000000094944.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, backpack, person, skis."}]}, {"id": 631, "image": "COCO_val2014_000000572075.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, toilet, cat."}]}, {"id": 637, "image": "COCO_val2014_000000387098.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, laptop, remote, dining table, tv, mouse, bottle, keyboard, book, cup."}]}, {"id": 643, "image": "COCO_val2014_000000382670.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, potted plant, boat, cow."}]}, {"id": 649, "image": "COCO_val2014_000000414516.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, chair, bench, elephant."}]}, {"id": 655, "image": "COCO_val2014_000000204360.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, backpack, bicycle, car, bus."}]}, {"id": 661, "image": "COCO_val2014_000000245642.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, sheep."}]}, {"id": 667, "image": "COCO_val2014_000000237767.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, remote, dining table, apple, cell phone, tv, couch, book."}]}, {"id": 673, "image": "COCO_val2014_000000370900.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tie, bear, teddy bear, dining table."}]}, {"id": 679, "image": "COCO_val2014_000000094501.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: cake, bottle, fork, dining table."}]}, {"id": 685, "image": "COCO_val2014_000000069196.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, airplane, handbag."}]}, {"id": 691, "image": "COCO_val2014_000000366141.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, couch, dining table, chair, cat."}]}, {"id": 697, "image": "COCO_val2014_000000093946.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, fork, dining table, chair, pizza."}]}, {"id": 703, "image": "COCO_val2014_000000032610.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, bench, laptop, dining table."}]}, {"id": 709, "image": "COCO_val2014_000000239130.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, car, cell phone."}]}, {"id": 715, "image": "COCO_val2014_000000226097.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, bowl, bottle, sink, refrigerator, oven, cup."}]}, {"id": 721, "image": "COCO_val2014_000000538054.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, person, handbag."}]}, {"id": 727, "image": "COCO_val2014_000000114710.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, sheep."}]}, {"id": 733, "image": "COCO_val2014_000000113701.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, clock, bed, book."}]}, {"id": 739, "image": "COCO_val2014_000000065001.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, car, bicycle."}]}, {"id": 745, "image": "COCO_val2014_000000482275.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, knife, vase, cup, cake, tie."}]}, {"id": 751, "image": "COCO_val2014_000000140983.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: toothbrush, couch, person."}]}, {"id": 757, "image": "COCO_val2014_000000044520.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, train, stop sign."}]}, {"id": 763, "image": "COCO_val2014_000000518177.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: toaster, person, microwave, oven."}]}, {"id": 769, "image": "COCO_val2014_000000203479.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, dining table, teddy bear, cup."}]}, {"id": 775, "image": "COCO_val2014_000000551881.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: handbag, potted plant, dining table, chair, refrigerator, book."}]}, {"id": 781, "image": "COCO_val2014_000000369541.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, frisbee."}]}, {"id": 787, "image": "COCO_val2014_000000007795.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, bed, remote."}]}, {"id": 793, "image": "COCO_val2014_000000303099.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bottle, cat, toilet."}]}, {"id": 799, "image": "COCO_val2014_000000147289.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: fire hydrant, car, stop sign."}]}, {"id": 805, "image": "COCO_val2014_000000102439.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, remote."}]}, {"id": 811, "image": "COCO_val2014_000000183757.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: boat, dog, cat."}]}, {"id": 817, "image": "COCO_val2014_000000500473.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, broccoli, handbag, carrot, dining table, orange, apple."}]}, {"id": 823, "image": "COCO_val2014_000000327532.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, sports ball, tennis racket."}]}, {"id": 829, "image": "COCO_val2014_000000167110.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, person, skateboard."}]}, {"id": 835, "image": "COCO_val2014_000000300876.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, spoon, bowl, dining table, knife, chair, cake, handbag, bottle, cup."}]}, {"id": 841, "image": "COCO_val2014_000000443240.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, chair, couch."}]}, {"id": 847, "image": "COCO_val2014_000000288042.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, umbrella."}]}, {"id": 853, "image": "COCO_val2014_000000270609.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, chair, orange, cup."}]}, {"id": 859, "image": "COCO_val2014_000000208028.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: keyboard, person, laptop."}]}, {"id": 865, "image": "COCO_val2014_000000314992.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, cell phone."}]}, {"id": 871, "image": "COCO_val2014_000000424642.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, frisbee."}]}, {"id": 877, "image": "COCO_val2014_000000467176.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, couch, remote, chair, book, cup."}]}, {"id": 883, "image": "COCO_val2014_000000087435.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: motorcycle, person, car, bench, bus."}]}, {"id": 889, "image": "COCO_val2014_000000079446.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, bottle, toilet, cup."}]}, {"id": 895, "image": "COCO_val2014_000000354398.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, truck, person, car, tie."}]}, {"id": 901, "image": "COCO_val2014_000000430052.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, bird, bottle, dining table."}]}, {"id": 907, "image": "COCO_val2014_000000567886.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: teddy bear, person, book."}]}, {"id": 913, "image": "COCO_val2014_000000539251.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, person, potted plant."}]}, {"id": 919, "image": "COCO_val2014_000000551908.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: broccoli, carrot, bed, dining table."}]}, {"id": 925, "image": "COCO_val2014_000000102906.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, car, fire hydrant."}]}, {"id": 931, "image": "COCO_val2014_000000327771.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, bed, dining table."}]}, {"id": 937, "image": "COCO_val2014_000000280734.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, person, dining table."}]}, {"id": 943, "image": "COCO_val2014_000000111817.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, cup, cake, dining table."}]}, {"id": 949, "image": "COCO_val2014_000000379162.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, bottle, tennis racket, chair."}]}, {"id": 955, "image": "COCO_val2014_000000489728.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, person, laptop, orange, bed, tv, couch, book."}]}, {"id": 961, "image": "COCO_val2014_000000419453.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, umbrella."}]}, {"id": 967, "image": "COCO_val2014_000000358255.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: teddy bear, person, bed, book."}]}, {"id": 973, "image": "COCO_val2014_000000178078.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: motorcycle, car, bicycle."}]}, {"id": 979, "image": "COCO_val2014_000000197219.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, dining table, chair, pizza."}]}, {"id": 985, "image": "COCO_val2014_000000460931.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: donut, cake, person."}]}, {"id": 991, "image": "COCO_val2014_000000506483.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, car, bench, cat."}]}, {"id": 997, "image": "COCO_val2014_000000468934.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, person, fork, dining table."}]}, {"id": 1003, "image": "COCO_val2014_000000189694.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, person, bottle, microwave."}]}, {"id": 1009, "image": "COCO_val2014_000000273450.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, car, parking meter."}]}, {"id": 1015, "image": "COCO_val2014_000000278226.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, laptop, cup."}]}, {"id": 1021, "image": "COCO_val2014_000000449432.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, handbag, train, boat, suitcase, bus."}]}, {"id": 1027, "image": "COCO_val2014_000000153300.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, bowl, person, dining table, knife, cake, fork, bottle, cup."}]}, {"id": 1033, "image": "COCO_val2014_000000506178.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bed, remote."}]}, {"id": 1039, "image": "COCO_val2014_000000249720.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, traffic light, bicycle, car, stop sign, bench."}]}, {"id": 1045, "image": "COCO_val2014_000000205206.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, train, car, bus."}]}, {"id": 1051, "image": "COCO_val2014_000000018150.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, couch, bottle, pizza."}]}, {"id": 1057, "image": "COCO_val2014_000000303534.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, backpack, chair, bed, book, cup."}]}, {"id": 1063, "image": "COCO_val2014_000000278771.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, bottle, bench, baseball bat."}]}, {"id": 1069, "image": "COCO_val2014_000000134075.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, person, bottle, dining table, cell phone, cup."}]}, {"id": 1075, "image": "COCO_val2014_000000227204.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, person, spoon, bowl, dining table, knife, bottle, cell phone, cup."}]}, {"id": 1081, "image": "COCO_val2014_000000124629.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 1087, "image": "COCO_val2014_000000142774.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, bus."}]}, {"id": 1093, "image": "COCO_val2014_000000199959.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, train, handbag."}]}, {"id": 1099, "image": "COCO_val2014_000000316700.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, broccoli, dining table, knife, orange, sandwich, fork, cup."}]}, {"id": 1105, "image": "COCO_val2014_000000040361.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, baseball bat."}]}, {"id": 1111, "image": "COCO_val2014_000000378873.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, bowl, person, handbag, dining table, orange, carrot."}]}, {"id": 1117, "image": "COCO_val2014_000000574790.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, truck, car."}]}, {"id": 1123, "image": "COCO_val2014_000000011241.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, spoon, sandwich, bottle, knife, donut, cup."}]}, {"id": 1129, "image": "COCO_val2014_000000525667.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, dining table, bench, cup."}]}, {"id": 1135, "image": "COCO_val2014_000000246145.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, clock, dining table."}]}, {"id": 1141, "image": "COCO_val2014_000000313162.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, cup, dining table."}]}, {"id": 1147, "image": "COCO_val2014_000000529668.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, cell phone."}]}, {"id": 1153, "image": "COCO_val2014_000000376959.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, cell phone, dining table."}]}, {"id": 1159, "image": "COCO_val2014_000000275863.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, bench, baseball bat."}]}, {"id": 1165, "image": "COCO_val2014_000000327038.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, spoon, bowl, fork, bottle, knife."}]}, {"id": 1171, "image": "COCO_val2014_000000513136.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, cup, dining table."}]}, {"id": 1177, "image": "COCO_val2014_000000184338.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, bed, person."}]}, {"id": 1183, "image": "COCO_val2014_000000175437.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, donut."}]}, {"id": 1189, "image": "COCO_val2014_000000498100.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, person, dining table."}]}, {"id": 1195, "image": "COCO_val2014_000000375909.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: refrigerator, couch, bed."}]}, {"id": 1201, "image": "COCO_val2014_000000396338.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, truck, person, handbag, car, motorcycle."}]}, {"id": 1207, "image": "COCO_val2014_000000124930.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, spoon, wine glass, knife, cup."}]}, {"id": 1213, "image": "COCO_val2014_000000112664.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, suitcase, bench, tie."}]}, {"id": 1219, "image": "COCO_val2014_000000377879.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, train, bed."}]}, {"id": 1225, "image": "COCO_val2014_000000377401.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, clock, car."}]}, {"id": 1231, "image": "COCO_val2014_000000049473.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, book, dining table."}]}, {"id": 1237, "image": "COCO_val2014_000000153865.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, kite."}]}, {"id": 1243, "image": "COCO_val2014_000000383185.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, spoon, bowl, bottle, cup."}]}, {"id": 1249, "image": "COCO_val2014_000000388983.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, sandwich, hot dog, dining table, cup."}]}, {"id": 1255, "image": "COCO_val2014_000000295377.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 1261, "image": "COCO_val2014_000000034773.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, dog, couch, potted plant."}]}, {"id": 1267, "image": "COCO_val2014_000000244455.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, bicycle, car, motorcycle."}]}, {"id": 1273, "image": "COCO_val2014_000000303652.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, dining table, knife, cake, cup."}]}, {"id": 1279, "image": "COCO_val2014_000000098493.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, clock, person, car, chair."}]}, {"id": 1285, "image": "COCO_val2014_000000201148.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, train."}]}, {"id": 1291, "image": "COCO_val2014_000000307166.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, bottle, oven, dining table."}]}, {"id": 1297, "image": "COCO_val2014_000000313034.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, person, bottle, fork, vase, cake, cup."}]}, {"id": 1303, "image": "COCO_val2014_000000355342.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, umbrella."}]}, {"id": 1309, "image": "COCO_val2014_000000333237.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, bed, dining table."}]}, {"id": 1315, "image": "COCO_val2014_000000526368.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: laptop, dining table, chair, keyboard, book."}]}, {"id": 1321, "image": "COCO_val2014_000000429033.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, laptop."}]}, {"id": 1327, "image": "COCO_val2014_000000287035.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, dining table, chair, keyboard, cell phone."}]}, {"id": 1333, "image": "COCO_val2014_000000299074.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, cell phone, bus."}]}, {"id": 1339, "image": "COCO_val2014_000000120648.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: handbag, dining table, suitcase, bed, book."}]}, {"id": 1345, "image": "COCO_val2014_000000186709.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, truck, orange, apple."}]}, {"id": 1351, "image": "COCO_val2014_000000025972.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, couch, car, bottle, bicycle, stop sign, chair."}]}, {"id": 1357, "image": "COCO_val2014_000000505542.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, train, car."}]}, {"id": 1363, "image": "COCO_val2014_000000115636.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, horse, cow."}]}, {"id": 1369, "image": "COCO_val2014_000000280810.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: broccoli, fork, laptop, dining table, carrot."}]}, {"id": 1375, "image": "COCO_val2014_000000037900.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, handbag, knife, book, cell phone."}]}, {"id": 1381, "image": "COCO_val2014_000000534516.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, tie."}]}, {"id": 1387, "image": "COCO_val2014_000000123017.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, dining table, chair, vase, bench."}]}, {"id": 1393, "image": "COCO_val2014_000000316237.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, cell phone."}]}, {"id": 1399, "image": "COCO_val2014_000000463522.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, dog, horse, bicycle."}]}, {"id": 1405, "image": "COCO_val2014_000000511341.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, handbag, car, umbrella."}]}, {"id": 1411, "image": "COCO_val2014_000000200583.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bottle, boat, bench, dining table, suitcase."}]}, {"id": 1417, "image": "COCO_val2014_000000424585.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, car, person, bed."}]}, {"id": 1423, "image": "COCO_val2014_000000100215.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, bed, bottle, teddy bear."}]}, {"id": 1429, "image": "COCO_val2014_000000017379.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, sink, person."}]}, {"id": 1435, "image": "COCO_val2014_000000511622.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, scissors, person, spoon, cup."}]}, {"id": 1441, "image": "COCO_val2014_000000407386.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, laptop, dining table, keyboard, cat."}]}, {"id": 1447, "image": "COCO_val2014_000000039516.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, orange, cake."}]}, {"id": 1453, "image": "COCO_val2014_000000018918.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, bowl, oven."}]}, {"id": 1459, "image": "COCO_val2014_000000115626.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, parking meter."}]}, {"id": 1465, "image": "COCO_val2014_000000391735.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, toothbrush, person."}]}, {"id": 1471, "image": "COCO_val2014_000000236370.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, book, tie."}]}, {"id": 1477, "image": "COCO_val2014_000000463542.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, skis, person."}]}, {"id": 1483, "image": "COCO_val2014_000000377352.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, car, bicycle."}]}, {"id": 1489, "image": "COCO_val2014_000000302405.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, backpack, person."}]}, {"id": 1495, "image": "COCO_val2014_000000554328.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball."}]}, {"id": 1501, "image": "COCO_val2014_000000384040.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, dining table, knife, donut, sandwich, bottle, fork, cup."}]}, {"id": 1507, "image": "COCO_val2014_000000023731.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, dining table, chair, orange, cat."}]}, {"id": 1513, "image": "COCO_val2014_000000463640.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, dining table, knife, cake, tie."}]}, {"id": 1519, "image": "COCO_val2014_000000365822.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: potted plant, bottle, chair, vase, oven."}]}, {"id": 1525, "image": "COCO_val2014_000000524979.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, dog."}]}, {"id": 1531, "image": "COCO_val2014_000000070813.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: handbag, bed, bottle, dining table."}]}, {"id": 1537, "image": "COCO_val2014_000000318209.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sandwich, fork, dining table, refrigerator, cat."}]}, {"id": 1543, "image": "COCO_val2014_000000016775.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, truck, person, dog, handbag."}]}, {"id": 1549, "image": "COCO_val2014_000000015338.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, sports ball, car, bus."}]}, {"id": 1555, "image": "COCO_val2014_000000441156.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, elephant, handbag."}]}, {"id": 1561, "image": "COCO_val2014_000000408757.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, kite, umbrella."}]}, {"id": 1567, "image": "COCO_val2014_000000550691.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, bus, tie."}]}, {"id": 1573, "image": "COCO_val2014_000000204100.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, person, bowl, bench."}]}, {"id": 1579, "image": "COCO_val2014_000000468169.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, sports ball, baseball bat."}]}, {"id": 1585, "image": "COCO_val2014_000000198312.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, couch, potted plant, dining table, chair, vase."}]}, {"id": 1591, "image": "COCO_val2014_000000485485.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, elephant."}]}, {"id": 1597, "image": "COCO_val2014_000000309371.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, bench, cup."}]}, {"id": 1603, "image": "COCO_val2014_000000472143.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, person, dining table, chair, bench, backpack, handbag, car, fork."}]}, {"id": 1609, "image": "COCO_val2014_000000236023.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, clock, bottle, knife, refrigerator, oven, cup."}]}, {"id": 1615, "image": "COCO_val2014_000000214244.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 1621, "image": "COCO_val2014_000000397665.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: motorcycle, person, handbag, bicycle, car, orange, bus."}]}, {"id": 1627, "image": "COCO_val2014_000000183965.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, sandwich, hot dog, dining table, donut, cup."}]}, {"id": 1633, "image": "COCO_val2014_000000499105.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 1639, "image": "COCO_val2014_000000259755.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball."}]}, {"id": 1645, "image": "COCO_val2014_000000281028.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, train, car."}]}, {"id": 1651, "image": "COCO_val2014_000000199122.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, chair, baseball bat."}]}, {"id": 1657, "image": "COCO_val2014_000000442809.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bottle, cake, cup."}]}, {"id": 1663, "image": "COCO_val2014_000000482829.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 1669, "image": "COCO_val2014_000000097994.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, laptop, dining table, tv, mouse, bottle, keyboard, book, cup."}]}, {"id": 1675, "image": "COCO_val2014_000000246999.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, clock, couch, potted plant, remote, chair, vase."}]}, {"id": 1681, "image": "COCO_val2014_000000338291.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, backpack, person, bench."}]}, {"id": 1687, "image": "COCO_val2014_000000331366.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, bicycle, car, umbrella."}]}, {"id": 1693, "image": "COCO_val2014_000000348524.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 1699, "image": "COCO_val2014_000000108189.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sandwich, fork, dining table, knife, cup."}]}, {"id": 1705, "image": "COCO_val2014_000000301575.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, couch, potted plant, chair, cat, book."}]}, {"id": 1711, "image": "COCO_val2014_000000239773.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 1717, "image": "COCO_val2014_000000354976.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, clock, laptop, dining table, chair, tv, mouse, cell phone."}]}, {"id": 1723, "image": "COCO_val2014_000000473199.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, potted plant, dining table, chair, umbrella."}]}, {"id": 1729, "image": "COCO_val2014_000000311759.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, teddy bear, bear."}]}, {"id": 1735, "image": "COCO_val2014_000000431615.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, boat."}]}, {"id": 1741, "image": "COCO_val2014_000000343401.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, dog, person, horse."}]}, {"id": 1747, "image": "COCO_val2014_000000246199.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, orange, dining table."}]}, {"id": 1753, "image": "COCO_val2014_000000372009.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, bed, teddy bear."}]}, {"id": 1759, "image": "COCO_val2014_000000346707.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, bowl, person."}]}, {"id": 1765, "image": "COCO_val2014_000000501898.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket, bottle, bench."}]}, {"id": 1771, "image": "COCO_val2014_000000281766.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, handbag."}]}, {"id": 1777, "image": "COCO_val2014_000000245448.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, motorcycle."}]}, {"id": 1783, "image": "COCO_val2014_000000510138.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, couch, bottle, laptop, dining table, cell phone."}]}, {"id": 1789, "image": "COCO_val2014_000000001171.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, train, car, bed."}]}, {"id": 1795, "image": "COCO_val2014_000000100238.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, frisbee, handbag."}]}, {"id": 1801, "image": "COCO_val2014_000000465418.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, bus."}]}, {"id": 1807, "image": "COCO_val2014_000000392364.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, traffic light, person, horse."}]}, {"id": 1813, "image": "COCO_val2014_000000188958.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 1819, "image": "COCO_val2014_000000436127.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, person, horse, car, bus."}]}, {"id": 1825, "image": "COCO_val2014_000000198397.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, sports ball, tennis racket."}]}, {"id": 1831, "image": "COCO_val2014_000000349936.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: oven, dining table, chair, cake, tv, couch, book, cup."}]}, {"id": 1837, "image": "COCO_val2014_000000333756.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 1843, "image": "COCO_val2014_000000474741.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, clock, person, handbag, dining table, vase, tie."}]}, {"id": 1849, "image": "COCO_val2014_000000286342.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: keyboard, mouse, laptop, dining table."}]}, {"id": 1855, "image": "COCO_val2014_000000093948.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, dining table, keyboard, cat."}]}, {"id": 1861, "image": "COCO_val2014_000000405740.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: wine glass, person, laptop, cup."}]}, {"id": 1867, "image": "COCO_val2014_000000304387.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, surfboard, kite."}]}, {"id": 1873, "image": "COCO_val2014_000000147165.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: sink, person, spoon, oven."}]}, {"id": 1879, "image": "COCO_val2014_000000495311.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, airplane."}]}, {"id": 1885, "image": "COCO_val2014_000000312406.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, bowl, cup, dining table."}]}, {"id": 1891, "image": "COCO_val2014_000000515820.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, clock, person, train, bench."}]}, {"id": 1897, "image": "COCO_val2014_000000031971.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: orange, bed, book."}]}, {"id": 1903, "image": "COCO_val2014_000000418680.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 1909, "image": "COCO_val2014_000000459680.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, orange, umbrella."}]}, {"id": 1915, "image": "COCO_val2014_000000196462.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, fork, cup."}]}, {"id": 1921, "image": "COCO_val2014_000000276693.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, skis."}]}, {"id": 1927, "image": "COCO_val2014_000000356424.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, bottle, dining table, chair, cup."}]}, {"id": 1933, "image": "COCO_val2014_000000505335.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, umbrella, tie."}]}, {"id": 1939, "image": "COCO_val2014_000000172342.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, spoon, bowl, clock, dining table, bottle, fork, book, cup."}]}, {"id": 1945, "image": "COCO_val2014_000000384970.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, car, motorcycle, cell phone."}]}, {"id": 1951, "image": "COCO_val2014_000000572260.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: wine glass, teddy bear, apple, book, cup."}]}, {"id": 1957, "image": "COCO_val2014_000000418471.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, chair, pizza, cup."}]}, {"id": 1963, "image": "COCO_val2014_000000379404.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, sandwich, dining table."}]}, {"id": 1969, "image": "COCO_val2014_000000044801.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, truck, car, motorcycle, bus."}]}, {"id": 1975, "image": "COCO_val2014_000000293564.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, surfboard, umbrella."}]}, {"id": 1981, "image": "COCO_val2014_000000452297.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, spoon, bowl, cup."}]}, {"id": 1987, "image": "COCO_val2014_000000543393.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: fire hydrant, car, stop sign."}]}, {"id": 1993, "image": "COCO_val2014_000000172648.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, car, person, bed."}]}, {"id": 1999, "image": "COCO_val2014_000000308907.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, bicycle, potted plant."}]}, {"id": 2005, "image": "COCO_val2014_000000263594.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, truck, car."}]}, {"id": 2011, "image": "COCO_val2014_000000501294.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, couch."}]}, {"id": 2017, "image": "COCO_val2014_000000500680.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, dining table."}]}, {"id": 2023, "image": "COCO_val2014_000000091954.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, train, car."}]}, {"id": 2029, "image": "COCO_val2014_000000454607.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: book, person, cell phone, apple."}]}, {"id": 2035, "image": "COCO_val2014_000000363908.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2041, "image": "COCO_val2014_000000560064.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, potted plant, donut."}]}, {"id": 2047, "image": "COCO_val2014_000000348469.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bird, person, baseball bat."}]}, {"id": 2053, "image": "COCO_val2014_000000167724.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: oven, spoon, bottle, person."}]}, {"id": 2059, "image": "COCO_val2014_000000262736.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, truck, car, frisbee."}]}, {"id": 2065, "image": "COCO_val2014_000000076416.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, cell phone, bus."}]}, {"id": 2071, "image": "COCO_val2014_000000541783.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, motorcycle."}]}, {"id": 2077, "image": "COCO_val2014_000000445200.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, broccoli, dining table, knife, carrot, cup."}]}, {"id": 2083, "image": "COCO_val2014_000000433574.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, dining table, chair, cell phone."}]}, {"id": 2089, "image": "COCO_val2014_000000212241.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, train, car."}]}, {"id": 2095, "image": "COCO_val2014_000000114941.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, train, car."}]}, {"id": 2101, "image": "COCO_val2014_000000493753.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: remote, laptop, chair, dining table, tv, mouse, bottle, keyboard, cup."}]}, {"id": 2107, "image": "COCO_val2014_000000410597.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, dining table, cup, cake, tie."}]}, {"id": 2113, "image": "COCO_val2014_000000382617.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: mouse, dining table, book, cup."}]}, {"id": 2119, "image": "COCO_val2014_000000467887.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, mouse, couch, laptop, dining table, chair."}]}, {"id": 2125, "image": "COCO_val2014_000000120792.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, remote, chair, refrigerator, tv, couch, bottle, book, cup."}]}, {"id": 2131, "image": "COCO_val2014_000000125524.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bus, fire hydrant."}]}, {"id": 2137, "image": "COCO_val2014_000000196053.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2143, "image": "COCO_val2014_000000156704.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, surfboard, umbrella."}]}, {"id": 2149, "image": "COCO_val2014_000000088507.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, orange, skis."}]}, {"id": 2155, "image": "COCO_val2014_000000279499.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bus, umbrella."}]}, {"id": 2161, "image": "COCO_val2014_000000155131.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket, bench, tie."}]}, {"id": 2167, "image": "COCO_val2014_000000252135.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, couch, remote."}]}, {"id": 2173, "image": "COCO_val2014_000000501652.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, car, cell phone."}]}, {"id": 2179, "image": "COCO_val2014_000000238691.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, handbag, bottle, suitcase."}]}, {"id": 2185, "image": "COCO_val2014_000000332908.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, bottle, oven."}]}, {"id": 2191, "image": "COCO_val2014_000000372819.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, bench, dining table."}]}, {"id": 2197, "image": "COCO_val2014_000000470699.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, surfboard."}]}, {"id": 2203, "image": "COCO_val2014_000000023084.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2209, "image": "COCO_val2014_000000545353.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, person, sink, dining table."}]}, {"id": 2215, "image": "COCO_val2014_000000394535.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, handbag, surfboard, car, bus."}]}, {"id": 2221, "image": "COCO_val2014_000000579415.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, handbag, bus, umbrella."}]}, {"id": 2227, "image": "COCO_val2014_000000233521.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, bottle, remote, vase, book."}]}, {"id": 2233, "image": "COCO_val2014_000000355256.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, chair, refrigerator, microwave, donut."}]}, {"id": 2239, "image": "COCO_val2014_000000304819.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, cat, laptop, dining table."}]}, {"id": 2245, "image": "COCO_val2014_000000240434.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, person, remote, chair, refrigerator, book, tv, handbag, couch, cell phone."}]}, {"id": 2251, "image": "COCO_val2014_000000199940.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, bowl, bottle, refrigerator, cat, oven."}]}, {"id": 2257, "image": "COCO_val2014_000000549390.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, bicycle."}]}, {"id": 2263, "image": "COCO_val2014_000000235541.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: giraffe, person, tie."}]}, {"id": 2269, "image": "COCO_val2014_000000068418.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, keyboard, apple."}]}, {"id": 2275, "image": "COCO_val2014_000000368541.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, truck, person, handbag, suitcase, bottle, airplane."}]}, {"id": 2281, "image": "COCO_val2014_000000006033.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, suitcase, bus, sheep."}]}, {"id": 2287, "image": "COCO_val2014_000000021327.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, pizza, dining table, cup."}]}, {"id": 2293, "image": "COCO_val2014_000000390157.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: wine glass, person, bottle, dining table."}]}, {"id": 2299, "image": "COCO_val2014_000000298633.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2305, "image": "COCO_val2014_000000579277.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, carrot."}]}, {"id": 2311, "image": "COCO_val2014_000000042685.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, pizza, bed, cell phone, cup."}]}, {"id": 2317, "image": "COCO_val2014_000000553992.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, remote, chair, tie."}]}, {"id": 2323, "image": "COCO_val2014_000000243158.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, couch, bottle, dining table."}]}, {"id": 2329, "image": "COCO_val2014_000000303971.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, remote."}]}, {"id": 2335, "image": "COCO_val2014_000000405762.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, truck, sports ball, person."}]}, {"id": 2341, "image": "COCO_val2014_000000343967.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: clock, couch, potted plant, chair, vase."}]}, {"id": 2347, "image": "COCO_val2014_000000477598.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, frisbee."}]}, {"id": 2353, "image": "COCO_val2014_000000044993.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, bowl, orange, apple."}]}, {"id": 2359, "image": "COCO_val2014_000000381195.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, broccoli, dining table."}]}, {"id": 2365, "image": "COCO_val2014_000000337502.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, broccoli, oven."}]}, {"id": 2371, "image": "COCO_val2014_000000325347.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, sports ball, tennis racket, chair."}]}, {"id": 2377, "image": "COCO_val2014_000000256906.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, person, truck, handbag, car, bus, tie."}]}, {"id": 2383, "image": "COCO_val2014_000000061507.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, suitcase, kite, umbrella."}]}, {"id": 2389, "image": "COCO_val2014_000000084410.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, wine glass, cup."}]}, {"id": 2395, "image": "COCO_val2014_000000288576.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bicycle, frisbee."}]}, {"id": 2401, "image": "COCO_val2014_000000328301.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, boat, fire hydrant."}]}, {"id": 2407, "image": "COCO_val2014_000000307262.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, fork, bottle, pizza, orange."}]}, {"id": 2413, "image": "COCO_val2014_000000573796.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: knife, apple, dining table."}]}, {"id": 2419, "image": "COCO_val2014_000000154846.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, person, dining table."}]}, {"id": 2425, "image": "COCO_val2014_000000472375.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, motorcycle, cup."}]}, {"id": 2431, "image": "COCO_val2014_000000446651.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, car."}]}, {"id": 2437, "image": "COCO_val2014_000000267684.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 2443, "image": "COCO_val2014_000000238029.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, fork, dining table, knife, pizza."}]}, {"id": 2449, "image": "COCO_val2014_000000294475.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: potted plant, bottle, dining table, chair, pizza, cup."}]}, {"id": 2455, "image": "COCO_val2014_000000149592.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, boat."}]}, {"id": 2461, "image": "COCO_val2014_000000447787.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bowl, bottle, carrot, microwave, cake, oven."}]}, {"id": 2467, "image": "COCO_val2014_000000458325.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, traffic light, person, skateboard."}]}, {"id": 2473, "image": "COCO_val2014_000000388237.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, skateboard, bottle."}]}, {"id": 2479, "image": "COCO_val2014_000000038645.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, cup, potted plant, umbrella."}]}, {"id": 2485, "image": "COCO_val2014_000000071738.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, motorcycle, tie."}]}, {"id": 2491, "image": "COCO_val2014_000000170077.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, handbag, dining table, chair, umbrella."}]}, {"id": 2497, "image": "COCO_val2014_000000378751.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, couch, remote, chair, cat."}]}, {"id": 2503, "image": "COCO_val2014_000000073182.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, handbag, bicycle, car, motorcycle."}]}, {"id": 2509, "image": "COCO_val2014_000000246928.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, car, bed, boat."}]}, {"id": 2515, "image": "COCO_val2014_000000528136.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, car, bus."}]}, {"id": 2521, "image": "COCO_val2014_000000209290.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, fork, dining table, chair, cup."}]}, {"id": 2527, "image": "COCO_val2014_000000564511.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, wine glass, bottle, pizza, oven, cup."}]}, {"id": 2533, "image": "COCO_val2014_000000200739.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, horse, car, suitcase."}]}, {"id": 2539, "image": "COCO_val2014_000000462805.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 2545, "image": "COCO_val2014_000000163575.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, bed."}]}, {"id": 2551, "image": "COCO_val2014_000000059383.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: oven, bottle, sink, microwave, cell phone, cup."}]}, {"id": 2557, "image": "COCO_val2014_000000478736.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, car, stop sign."}]}, {"id": 2563, "image": "COCO_val2014_000000156282.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, kite, umbrella."}]}, {"id": 2569, "image": "COCO_val2014_000000052689.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, skis."}]}, {"id": 2575, "image": "COCO_val2014_000000516601.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, person, skis."}]}, {"id": 2581, "image": "COCO_val2014_000000562155.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, tie."}]}, {"id": 2587, "image": "COCO_val2014_000000142056.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2593, "image": "COCO_val2014_000000299986.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bench, cell phone."}]}, {"id": 2599, "image": "COCO_val2014_000000521643.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, bottle, dining table."}]}, {"id": 2605, "image": "COCO_val2014_000000574454.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, surfboard, kite."}]}, {"id": 2611, "image": "COCO_val2014_000000415727.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, bench, baseball bat."}]}, {"id": 2617, "image": "COCO_val2014_000000222118.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, suitcase, handbag, cell phone."}]}, {"id": 2623, "image": "COCO_val2014_000000514292.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: book, teddy bear, bear."}]}, {"id": 2629, "image": "COCO_val2014_000000117527.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dining table, person, bowl, cup."}]}, {"id": 2635, "image": "COCO_val2014_000000016451.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, handbag, surfboard, chair, umbrella."}]}, {"id": 2641, "image": "COCO_val2014_000000429706.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, backpack, person, handbag, suitcase."}]}, {"id": 2647, "image": "COCO_val2014_000000429580.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: dog, person, frisbee."}]}, {"id": 2653, "image": "COCO_val2014_000000283168.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, sandwich, fork, cup."}]}, {"id": 2659, "image": "COCO_val2014_000000405205.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, bus."}]}, {"id": 2665, "image": "COCO_val2014_000000123570.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, umbrella."}]}, {"id": 2671, "image": "COCO_val2014_000000365317.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2677, "image": "COCO_val2014_000000224155.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, toilet, cell phone."}]}, {"id": 2683, "image": "COCO_val2014_000000191964.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: keyboard, orange, cat, laptop."}]}, {"id": 2689, "image": "COCO_val2014_000000265472.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: banana, bowl, orange, cake."}]}, {"id": 2695, "image": "COCO_val2014_000000550514.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: snowboard, backpack, person, skis."}]}, {"id": 2701, "image": "COCO_val2014_000000163814.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, motorcycle, skateboard."}]}, {"id": 2707, "image": "COCO_val2014_000000505933.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, train, car."}]}, {"id": 2713, "image": "COCO_val2014_000000318550.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: traffic light, truck, train, car."}]}, {"id": 2719, "image": "COCO_val2014_000000367528.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, person, remote, dining table, vase, tv, potted plant, book, cup."}]}, {"id": 2725, "image": "COCO_val2014_000000252911.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, boat, stop sign, motorcycle."}]}, {"id": 2731, "image": "COCO_val2014_000000214421.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, potted plant, bottle, laptop, dining table, cup."}]}, {"id": 2737, "image": "COCO_val2014_000000050627.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2743, "image": "COCO_val2014_000000233426.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: vase, potted plant, dining table."}]}, {"id": 2749, "image": "COCO_val2014_000000372817.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, car, handbag, umbrella."}]}, {"id": 2755, "image": "COCO_val2014_000000536073.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: knife, bottle, cup."}]}, {"id": 2761, "image": "COCO_val2014_000000579231.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, baseball bat."}]}, {"id": 2767, "image": "COCO_val2014_000000092624.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: giraffe, cow, zebra."}]}, {"id": 2773, "image": "COCO_val2014_000000390184.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, dining table, chair, microwave, oven."}]}, {"id": 2779, "image": "COCO_val2014_000000079213.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, remote."}]}, {"id": 2785, "image": "COCO_val2014_000000465346.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, sandwich, fork, dining table, knife."}]}, {"id": 2791, "image": "COCO_val2014_000000514248.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, person, horse, fire hydrant."}]}, {"id": 2797, "image": "COCO_val2014_000000355776.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, chair, sink, toothbrush, cup."}]}, {"id": 2803, "image": "COCO_val2014_000000575355.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, bench, dining table."}]}, {"id": 2809, "image": "COCO_val2014_000000396068.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, person, remote."}]}, {"id": 2815, "image": "COCO_val2014_000000559547.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2821, "image": "COCO_val2014_000000230175.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, cup, tie."}]}, {"id": 2827, "image": "COCO_val2014_000000159969.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, skis."}]}, {"id": 2833, "image": "COCO_val2014_000000399702.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, sandwich, cup, dining table."}]}, {"id": 2839, "image": "COCO_val2014_000000555538.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2845, "image": "COCO_val2014_000000264155.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, bottle, fork, dining table, knife, pizza."}]}, {"id": 2851, "image": "COCO_val2014_000000125572.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, clock, person, traffic light, handbag, bicycle, backpack, motorcycle, bus."}]}, {"id": 2857, "image": "COCO_val2014_000000560744.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, spoon, bowl, dining table, knife, chair, wine glass, bottle, book, cup."}]}, {"id": 2863, "image": "COCO_val2014_000000045685.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2869, "image": "COCO_val2014_000000434179.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, spoon, cup, dining table."}]}, {"id": 2875, "image": "COCO_val2014_000000008749.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, cup, dining table."}]}, {"id": 2881, "image": "COCO_val2014_000000069863.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: car, vase, potted plant, dining table."}]}, {"id": 2887, "image": "COCO_val2014_000000041180.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: backpack, person, cow."}]}, {"id": 2893, "image": "COCO_val2014_000000429913.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2899, "image": "COCO_val2014_000000554002.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, dog, person, handbag."}]}, {"id": 2905, "image": "COCO_val2014_000000141278.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, mouse, laptop, dining table, chair, keyboard, cell phone."}]}, {"id": 2911, "image": "COCO_val2014_000000190788.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, dining table, knife, pizza, cup."}]}, {"id": 2917, "image": "COCO_val2014_000000534942.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bed, cat, tie."}]}, {"id": 2923, "image": "COCO_val2014_000000377951.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, dining table, person, cup."}]}, {"id": 2929, "image": "COCO_val2014_000000533201.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: truck, person, airplane."}]}, {"id": 2935, "image": "COCO_val2014_000000235203.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, couch, remote, chair, keyboard, cat."}]}, {"id": 2941, "image": "COCO_val2014_000000468997.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: baseball glove, person, sports ball, baseball bat."}]}, {"id": 2947, "image": "COCO_val2014_000000401398.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, potted plant, vase, teddy bear, bear."}]}, {"id": 2953, "image": "COCO_val2014_000000323752.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket."}]}, {"id": 2959, "image": "COCO_val2014_000000003845.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: spoon, broccoli, fork, dining table, carrot, cup."}]}, {"id": 2965, "image": "COCO_val2014_000000122962.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: pizza, person, cell phone, dining table."}]}, {"id": 2971, "image": "COCO_val2014_000000015738.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: person, sports ball, tennis racket, bottle, chair."}]}, {"id": 2977, "image": "COCO_val2014_000000497599.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, laptop, dining table."}]}, {"id": 2983, "image": "COCO_val2014_000000121959.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: chair, person, handbag, umbrella."}]}, {"id": 2989, "image": "COCO_val2014_000000361430.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: bowl, broccoli, spoon, knife, carrot, oven."}]}, {"id": 2995, "image": "COCO_val2014_000000277289.jpg", "conversations": [{"from": "human", "value": "Generate a short caption of the image."}, {"from": "gpt", "value": "Generate a short caption for this image, focusing on the visible elements: tv, bowl, couch, potted plant, dining table, vase."}]}]