[
    {
        "input": "An audience gives applause as a man yells and a group sings",
        "reasoning": "This clip contains three main elements: applause from an audience, a man yelling, and a group singing. The presence of applause suggests a live performance setting. Since there are already multiple sounds, the simplest edit would be to change one existing sound rather than adding a new one. Converting the man into a woman yelling keeps the original structure intact.",
        "speech": true,
        "n_elements": 3,
        "instruction": "Make the man a woman",
        "type": "replace",
        "output": "An audience gives applause as a woman yells and a group sings",
        "input_neg": "woman yelling",
        "output_neg": "man yelling"
    },
    {
        "input": "A man speaks over intermittent keyboard taps",
        "reasoning": "This clip features two primary elements: a man speaking and the sound of keyboard taps. Amplifying the keyboard taps is a straightforward way to modify the audio without altering its structure. Increasing their volume adds emphasis while preserving the spoken element.",
        "speech": true,
        "n_elements": 2,
        "instruction": "keyboard should be louder",
        "type": "other",
        "output": "A man speaks over loud intermittent keyboard taps",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "Motor noise is followed by a horn honking and a siren wailing",
        "reasoning": "The audio already includes three distinct sounds: a motor, a horn, and a siren. To avoid adding new sounds, removing one of the existing sounds is the simplest edit. Eliminating the siren keeps the overall scene of vehicles but simplifies the soundscape.",
        "speech": false,
        "n_elements": 3,
        "instruction": "Remove the siren",
        "type": "remove",
        "output": "Motor noise is followed by a horn honking",
        "input_neg": "",
        "output_neg": "siren wailing"
    },
    {
        "input": "Vehicles pass by on a roadway",
        "reasoning": "Currently, only vehicles are mentioned passing by on a roadway. Adding an ambulance with a siren introduces a new distinct sound that fits naturally with traffic. This keeps the road environment intact while increasing the complexity of the soundscape.",
        "speech": false,
        "n_elements": 1,
        "instruction": "add an ambulance with a siren",
        "type": "add",
        "output": "Vehicles and an ambulance with a siren pass by on a roadway",
        "input_neg": "ambulance, siren",
        "output_neg": ""
    },
    {
        "input": "A man speaks followed by another man speaking outside",
        "reasoning": "Two men are speaking in an outdoor setting, which could include background noises like wind or distant cars. Changing the location to indoors is a single, straightforward alteration that doesn’t involve new sounds. This maintains the presence of voices but shifts the implied acoustics.",
        "speech": true,
        "n_elements": 2,
        "instruction": "they should be inside",
        "type": "other",
        "output": "A man speaks followed by another man speaking inside",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "A bus engine idles while a woman speaks making an announcement",
        "reasoning": "There are two main sources here: the idling bus engine and a woman making an announcement. Emphasizing that the woman’s voice is coming through a poor-quality microphone adds a unique acoustic characteristic. This modifies how the existing voice sounds without adding or removing other elements.",
        "speech": true,
        "n_elements": 2,
        "instruction": "she should speak through a bad microphone",
        "type": "other",
        "output": "A bus engine idles while a woman speaks making an announcement through a bad microphone",
        "input_neg": "bad microphone",
        "output_neg": ""
    },
    {
        "input": "A man speaks as a car is passing by",
        "reasoning": "This clip contains two main sounds: a man speaking and a car passing by. Replacing the car with a motorcycle preserves the presence of a passing vehicle without adding new sounds. It is a straightforward swap that keeps the man’s speech intact while altering the type of vehicle heard.",
        "speech": true,
        "n_elements": 2,
        "instruction": "Replace the car with a motorcycle",
        "type": "replace",
        "output": "A man speaks as a motorcycle is passing by",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "Plastic is tapped on while someone speaks",
        "reasoning": "This clip consists of two main sounds: plastic tapping and a voice speaking. Converting plastic to metal changes the tonal quality of the tapping while preserving the overall structure. The voice remains, and no additional sounds are introduced or removed.",
        "speech": true,
        "n_elements": 2,
        "instruction": "make it metal",
        "type": "replace",
        "output": "Metal is tapped on while someone speaks",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "A vehicle moves while the wind blows and a man talks",
        "reasoning": "This caption consists of three main sounds: a vehicle moving, wind blowing, and a man talking. To simplify the sound landscape, the man talking can be removed.",
        "speech": true,
        "n_elements": 3,
        "instruction": "there shouldn't be a man",
        "type": "remove",
        "output": "A vehicle moves while the wind blows",
        "input_neg": "",
        "output_neg": "man talking"
    },
    {
        "input": "A bell chimes melodically",
        "reasoning": "This audio focuses on a single melodic bell sound. Based on the melodic sound, the bell could be changed to an instrument. A fitting instruction would be to change the bell to a piano.",
        "speech": false,
        "n_elements": 1,
        "instruction": "bell should be piano notes",
        "type": "replace",
        "output": "Piano notes play melodically",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "A goat bleats continuously",
        "reasoning": "This clip focuses on a single sound source: a goat bleating. Changing the goat to a cow is a straightforward replacement that preserves the notion of a farm animal making a continuous vocalization.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Make it a cow",
        "type": "replace",
        "output": "A cow moos continuously",
        "input_neg": "cow",
        "output_neg": "goat"
    },
    {
        "input": "At a distance, several motors run",
        "reasoning": "This clip describes multiple motors operating far away. Making them closer modifies their perceived distance but keeps the same motor sounds. It simply changes how near they are depicted to be.",
        "speech": false,
        "n_elements": 1,
        "instruction": "they should be closer",
        "type": "other",
        "output": "Several motors run",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "Rapid fire loud booming gunshots",
        "reasoning": "This audio contains only a series of loud, booming gunshots. The instruction is to add the sound of people screaming, which creates a more intense scene. This keeps the original gunshots intact while introducing an additional element.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Add people screaming",
        "type": "add",
        "output": "Rapid fire loud booming gunshots with people screaming",
        "input_neg": "people screaming",
        "output_neg": ""
    },
    {
        "input": "Waves of water rumble",
        "reasoning": "This audio contains one major element: waves of water rumbling. As the setting is on water, a fitting edit would be to add elements associated with water. One possible instruction is to add a motorboat.",
        "speech": false,
        "n_elements": 1,
        "instruction": "add a motorboat",
        "type": "add",
        "output": "Waves of water rumble as a motorboat passes by",
        "input_neg": "motorboat",
        "output_neg": ""
    },
    {
        "input": "An electronic device bleeps once",
        "reasoning": "This audio contains one major element: an electronic bleep. As a bleep is typically short, it can be replaced with another typically short sound. A fitting instruction would be to change the bleep to a clap.",
        "speech": false,
        "n_elements": 1,
        "instruction": "the bleep should be a clap",
        "type": "replace",
        "output": "A clap is heard once",
        "input_neg": "clap",
        "output_neg": "electronic bleep"
    },
    {
        "input": "A guitar is played.",
        "reasoning": "This audio contains only a guitar playing. As it concerns an instrument, the guitar can be changed to a piano.",
        "speech": false,
        "n_elements": 1,
        "instruction": "the guitar should be a piano",
        "type": "replace",
        "output": "A piano plays.",
        "input_neg": "piano",
        "output_neg": "guitar"
    },
    {
        "input": "A clarinet plays a melodic tune indoors.",
        "reasoning": "This audio contains only one major element: a clarinet playing. As a clarinet plays long notes, a fitting edit instruction would be to change it to a violin which also exhibits the same audio characteristics.",
        "speech": false,
        "n_elements": 1,
        "instruction": "replace the clarinet with a violin",
        "type": "replace",
        "output": "A violin plays a melodic tune indoors",
        "input_neg": "violin",
        "output_neg": "clarinet"
    },
    {
        "input": "The sound of a motorcycle engine idling can be heard in a workshop.",
        "reasoning": "There is only one major sound present: a motorcycle engine. We can change the audio characteristics of the sound by replacing the workshop with a busy street.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Change it to be in a busy street",
        "type": "replace",
        "output": "The sound of a motorcycle engine idling can be heard in a busy street",
        "input_neg": "street",
        "output_neg": "workshop"
    },
    {
        "input": "A parrot talks",
        "reasoning": "The major element in this audio is a parrot talking. The audio setting could be a jungle, so it would make sense to add some jungle sounds.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Add jungle sounds.",
        "type": "add",
        "output": "A parrot talks with jungle sounds in the background",
        "input_neg": "jungle",
        "output_neg": ""
    },
    {
        "input": "The sound of yodelling.",
        "reasoning": "As there is only the sound of yodelling present it would make sense to add some traditional music playing.",
        "speech": false,
        "n_elements": 1,
        "instruction": "there should also be traditional music",
        "type": "add",
        "output": "The sound of yodelling with traditional music playing.",
        "input_neg": "traditional music",
        "output_neg": ""
    },
    {
        "input": "Music plays as a car passes by in a parking lot.",
        "reasoning": "There are two major sounds present: music playing and a car passing by. A fitting instruction would remove the music to only focus on the car passing by.",
        "speech": false,
        "n_elements": 2,
        "instruction": "please remove the music",
        "type": "remove",
        "output": "A car passes by in a parking lot.",
        "input_neg": "",
        "output_neg": "music"
    },
    {
        "input": "The loud ringing of church bells fills the air.",
        "reasoning": "There is only one sound: church bells ringing. We could change the audio by replacing the church bells with a telephone ringing at the same pace.",
        "speech": false,
        "n_elements": 1,
        "instruction": "change it to be a telephone ringing",
        "type": "replace",
        "output": "The loud ringing of a telephone fills the air.",
        "input_neg": "telephone",
        "output_neg": "church bells"
    },
    {
        "input": "The lawn mower weeds the grass.",
        "reasoning": "There is only a lawn mower present. We could add some birds chirping as the audio suggests that the sound was recorded outside.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Please add some birds chirping",
        "type": "add",
        "output": "The lawn mower weeds the grass with birds chirping.",
        "input_neg": "birds chirping",
        "output_neg": ""
    },
    {
        "input": "A rooster crows loudly in a farmyard.",
        "reasoning": "There is only a rooster present with possible additional sounds from the farmyard. A fitting instruction would make the loud rooster more quiet.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Make it quieter",
        "type": "other",
        "output": "A rooster crows quietly in a farmyard.",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "The sound of a hair dryer can be heard in a bathroom.",
        "reasoning": "There is only a hair dryer present. A fitting edit instruction could make the hair dryer more quiet, mimicking the sound of recording outside of the bathroom.",
        "speech": false,
        "n_elements": 1,
        "instruction": "it should be muffled",
        "type": "other",
        "output": "The muffled sound of a hair dryer can be heard in a bathroom.",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "Rain falls gently outdoors.",
        "reasoning": "As there is only rain present in the audio, we could add some fireworks as the audio appears to be recorded outside.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Please add fireworks",
        "type": "add",
        "output": "Rain falls gently outdoors with fireworks going off",
        "input_neg": "fireworks",
        "output_neg": ""
    },
    {
        "input": "Water rushes down a waterfall while people talk in the background.",
        "reasoning": "There are two main sounds present: a waterfall and people talking. One fitting edit instruction would remove the people talking.",
        "speech": true,
        "n_elements": 2,
        "instruction": "remove the people",
        "type": "remove",
        "output": "Water rushes down a waterfall",
        "input_neg": "",
        "output_neg": "people talking"
    },
    {
        "input": "The wind blows and a motorboat accelerates on the water.",
        "reasoning": "There are two sounds present: wind blowing and a motorboat accelerating. One possible edit would make the wind blowing quieter.",
        "speech": false,
        "n_elements": 2,
        "instruction": "Make the wind quieter",
        "type": "other",
        "output": "The wind blows quietly and a motorboat accelerates on the water.",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "A person is clicking on a mouse in a small room.",
        "reasoning": "As there is only clicking present, we could change the clicking to a keyboard tapping.",
        "speech": false,
        "n_elements": 1,
        "instruction": "it should be a keyboard instead",
        "type": "replace",
        "output": "A person is typing on a keyboard in a small room",
        "input_neg": "keyboard",
        "output_neg": "clicking, mouse"
    },
    {
        "input": "A choir sings harmoniously as music fills the air in a square.",
        "reasoning": "As there is only a choir, we could specify that it should be a female choir.",
        "speech": false,
        "n_elements": 1,
        "instruction": "Make it a female choir please",
        "type": "replace",
        "output": "A female choir sings harmoniously as music fills the air in a square.",
        "input_neg": "",
        "output_neg": ""
    },
    {
        "input": "A woman whispers.",
        "reasoning": "There is only a woman whispering. We could make the woman scream instead.",
        "speech": true,
        "n_elements": 1,
        "instruction": "Make her scream",
        "type": "other",
        "output": "A woman screams",
        "input_neg": "scream",
        "output_neg": "whisper"
    },
    {
        "input": "A guitar is played.",
        "reasoning": "As there is only a guitar playing, a person could be accompanying it by singing.",
        "speech": false,
        "n_elements": 1,
        "instruction": "add someone singing",
        "type": "add",
        "output": "Someone sings while a guitar is played.",
        "input_neg": "singing",
        "output_neg": ""
    }
]