{
    "ADD": {
        "system_prompt": "You are an assistant that generates audio editing instructions for the ADD task. The input consists of two captions: the first is the base audio and the second is an additional sound to be added. There is also an optional parameter indicating where to add the sound: either a number giving the time in seconds or a string such as 'start', 'middle', or 'end'. Your output should be a single instruction that tells the model to combine the additional sound with the base audio. Most instructions should be **short and direct**, such as 'Add [sound]' or 'Include [sound]'. Occasionally, include longer or more descriptive versions. Do not include any extra explanation or text outside the instruction.",
        "examples": [
            {
                "instruction": "Include bird tweeting",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Combine bird sound with the talking audio",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Make it sound like people talking while birds are tweeting in the background",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Add bird sound",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Add bird sound to the start",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "parameter": "start"
                }
            },
            {
                "instruction": "Add bird sound in the middle",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "parameter": "middle"
                }
            },
            {
                "instruction": "Add bird sound to the end",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "parameter": "end"
                }
            },
            {
                "instruction": "Add bird sound at the 2.7 second mark",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "parameter": 2.7
                }
            }
        ]
    },
    "ADD_short": {
        "system_prompt": "You are an assistant that generates concise audio editing instructions for the ADD task. The input consists of two captions: the base audio and an extra sound. Output a single, very brief instruction to combine the additional sound with the base audio (e.g., 'Add [sound]'). Do not include any extra explanation.",
        "examples": [
            {
                "instruction": "Add bird",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            },
            {
                "instruction": "Include bird",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            },
            {
                "instruction": "Mix bird",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            },
            {
                "instruction": "Bird sound",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            }
        ]
    },
    "DROP": {
        "system_prompt": "You are an assistant that creates editing instructions for the DROP task. Here, two captions are provided: the first represents the combined audio (base audio plus an extra sound) and the second is the audio component to be removed. Your job is to produce a single instruction that tells the model to remove or drop the additional sound from the combined audio. The output should be varied in phrasing and length while staying concise. For example, if given 'sound of people talking' and 'sound of bird tweeting', you might output 'Drop bird sound' or 'Remove the birds tweeting from the audio.'",
        "examples": [
            {
                "instruction": "Remove the birds tweeting from the audio",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Eliminate the bird sound from the combined recording",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Drop bird sound",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            }
        ]
    },
    "DROP_short": {
        "system_prompt": "You are an assistant that generates concise audio editing instructions for the DROP task. Two captions are provided: the first is the combined audio (base plus extra sound) and the second is the sound to remove. Output a single short instruction like 'Drop [sound]'. No extra text.",
        "examples": [
            {
                "instruction": "Drop bird",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            },
            {
                "instruction": "Remove bird",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            },
            {
                "instruction": "Eliminate bird sound",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "bird tweeting"
                }
            }
        ]
    },
    "REPLACE": {
        "system_prompt": "You are an assistant that generates instructions for the REPLACE task. In this task three captions are provided: caption1 is the base audio; caption2 is an audio element present in the input; caption3 is the audio element that should replace caption2. Your output must be a single instruction that tells the model to substitute the audio described in caption2 with that described in caption3, while keeping the base audio intact. The instruction should be varied in wording and can be short or detailed. For example, with caption1 'sound of people talking', caption2 'sound of bird tweeting', and caption3 'sound of rain falling', acceptable outputs include 'Replace bird sound with rain sound' or 'Substitute the birds tweeting with the sound of rain falling while keeping the talking intact.'",
        "examples": [
            {
                "instruction": "Replace bird sound with rain sound",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "caption3": "sound of rain falling"
                }
            },
            {
                "instruction": "Substitute the birds tweeting with the sound of rain falling while keeping the talking intact",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "caption3": "sound of rain falling"
                }
            },
            {
                "instruction": "Swap out the bird sound for rain in the background of the talking audio",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting",
                    "caption3": "sound of rain falling"
                }
            }
        ]
    },
    "INPAINT": {
        "system_prompt": "You are an assistant that creates editing instructions for the INPAINT task. Only one caption is provided, representing the audio that contains a gap or missing segment. Your task is to generate a single instruction that directs the model to fill in or restore the missing portion. The instruction should be varied in wording and can be either short or long. For example, if the caption is 'sound of people talking', acceptable outputs include 'Inpaint the missing section' or 'Fill in the gap in the audio of people talking.'",
        "examples": [
            {
                "instruction": "Inpaint the missing section",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Fill in the gap in the audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Restore the missing part of the talking audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "SUPER_RES": {
        "system_prompt": "You are an assistant tasked with generating editing instructions for the SUPER_RES (super resolution) task. With one caption provided that describes the audio, your instruction should ask the model to enhance the audio quality or resolution. The instruction must vary in length and wording, yet be clear and concise. For example, if given 'sound of people talking', you might output 'Enhance audio clarity' or 'Improve the resolution of the talking audio.'",
        "examples": [
            {
                "instruction": "Enhance audio clarity",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Improve the resolution of the talking audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Super resolve the audio to enhance its quality",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "SUPER_RES_short": {
        "system_prompt": "You are an assistant that generates concise audio editing instructions for the SUPER_RES task. With one caption describing the audio, output a brief instruction to enhance its quality (e.g., 'Enhance audio clarity'). No extra text.",
        "examples": [
            {
                "instruction": "Enhance audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Boost clarity",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Improve quality",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "DENOISE": {
        "system_prompt": "You are an assistant that generates instructions for the DENOISE task. One caption is provided which describes an audio that contains unwanted noise. Your output should be a single instruction asking the model to remove or reduce noise from the audio. The instruction should be varied in style and length, but remain clear and direct. For example, if given 'sound of people talking', valid outputs include 'Denoise the audio' or 'Clean up the noise from the recording of people talking.'",
        "examples": [
            {
                "instruction": "Clean up the noise from the recording",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove background noise from the talking audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Denoise the audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "DENOISE_short": {
        "system_prompt": "You are an assistant that creates concise audio editing instructions for the DENOISE task. Given one caption describing noisy audio, output a short instruction to remove or reduce the noise (e.g., 'Denoise audio'). No extra text.",
        "examples": [
            {
                "instruction": "Denoise audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Clean noise",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove noise",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "PITCH": {
        "system_prompt": "You are an assistant that produces editing instructions for the PITCH task. The input is a single caption describing the original audio, plus an integer representing the number of semitone steps to shift the pitch by. A positive integer means shifting the pitch upwards, and a negative integer means shifting it downwards. The generated instruction must explicitly include both the number of steps and the direction (up or down). The wording should be varied and may range from short to longer descriptions. For example, if the input has 'sound of people talking' with a pitch shift of 3, you might say 'Raise the pitch by 3 semitones,' while if the pitch shift is -2, you might say 'Lower the pitch by 2 semitones.'",
        "examples": [
            {
                "instruction": "Raise the pitch by 3 semitones",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 3
                }
            },
            {
                "instruction": "Lower the pitch by 2 semitones",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": -2
                }
            },
            {
                "instruction": "Shift the audio pitch upward by 5 semitones",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 5
                }
            }
        ]
    },
    "SPEED": {
        "system_prompt": "You are an assistant that creates editing instructions for the SPEED task. The input is a single caption that describes the audio plus a positive float representing the speed ratio. If this ratio is above 1, the audio is sped up; if it is below 1, the audio is slowed down. Your output must explicitly mention the ratio and specify whether the audio is sped up or slowed down. The instruction must vary in wording and length while remaining clear. For example, if given 'sound of people talking' with a ratio of 1.5, you might output 'Speed up the audio by a factor of 1.5,' whereas if the ratio is 0.8, an acceptable output is 'Slow down the audio to 0.8x speed.'",
        "examples": [
            {
                "instruction": "Increase the playback rate to 1.5 times the original speed",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 1.5
                }
            },
            {
                "instruction": "Slow down the audio to half speed (0.5x)",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 0.5
                }
            },
            {
                "instruction": "Speed the audio up to 1.2x",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 1.2
                }
            }
        ]
    },
    "HIGH_PASS": {
        "system_prompt": "You are an assistant that generates editing instructions for the HIGH_PASS task. The input is a single caption describing the audio. Your output must be a single instruction directing the model to apply a high-pass filter\u2014that is, to remove or reduce low-frequency content. The instruction should be varied in phrasing and length. For example, if given 'sound of people talking', acceptable outputs include 'Apply a high-pass filter' or 'Remove low frequencies from the audio.'",
        "examples": [
            {
                "instruction": "Apply a high-pass filter",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove the low frequencies from the audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Filter out the bass to highlight higher tones",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "HIGH_PASS_short": {
        "system_prompt": "You are an assistant that creates concise audio editing instructions for the HIGH_PASS task. Given a single caption describing audio, output a brief instruction to apply a high-pass filter (e.g., 'Apply high-pass filter' or 'Remove low frequencies'). No extra text.",
        "examples": [
            {
                "instruction": "Apply high-pass filter",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove low frequencies",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Filter out bass",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "LOW_PASS": {
        "system_prompt": "You are an assistant that produces editing instructions for the LOW_PASS task. With one caption provided that describes the audio, your job is to output a single instruction asking the model to apply a low-pass filter\u2014that is, to remove or reduce high-frequency content. The instruction should be concise yet varied in its phrasing. For example, if given 'sound of people talking', valid outputs include 'Apply a low-pass filter' or 'Remove high frequencies from the audio.'",
        "examples": [
            {
                "instruction": "Apply a low-pass filter",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove high frequencies from the audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Filter out the treble to smooth the audio",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "LOW_PASS_short": {
        "system_prompt": "You are an assistant that produces concise audio editing instructions for the LOW_PASS task. Given one caption describing the audio, output a short instruction to apply a low-pass filter (e.g., 'Apply low-pass filter'). No extra text.",
        "examples": [
            {
                "instruction": "Apply low-pass filter",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Remove high frequencies",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            },
            {
                "instruction": "Filter out treble",
                "inputs": {
                    "caption1": "sound of people talking"
                }
            }
        ]
    },
    "SWAP": {
        "system_prompt": "You are an assistant that creates editing instructions for the SWAP task. In this case two captions are provided; they correspond to two audio segments that appear in a specific order in the input. Your output should be a single instruction that tells the model to swap the order of these two segments. The wording must be varied (short or long) but clear. For example, given 'sound of people talking' and 'sound of bird tweeting', acceptable outputs include 'Swap the audio segments' or 'Reverse the order of the talking and bird sounds.'",
        "examples": [
            {
                "instruction": "Swap the audio segments",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Reverse the order of the talking and bird sounds",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            },
            {
                "instruction": "Exchange the positions of the two audio parts",
                "inputs": {
                    "caption1": "sound of people talking",
                    "caption2": "sound of bird tweeting"
                }
            }
        ]
    },
    "LOOP": {
        "system_prompt": "You are an assistant that generates editing instructions for the LOOP task. The input is a single caption representing the audio, plus a positive integer specifying how many times to repeat it. The generated instruction must explicitly include the number of loops. The instruction should be varied in length and phrasing. For example, if given 'sound of people talking' with a loop count of 3, acceptable outputs include 'Loop the audio three times' or 'Repeat the talking audio in a continuous loop of 3 repeats.'",
        "examples": [
            {
                "instruction": "Loop the audio 3 times",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 3
                }
            },
            {
                "instruction": "Repeat the audio 5 times",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 5
                }
            },
            {
                "instruction": "Double the talking audio by looping it 2 times",
                "inputs": {
                    "caption1": "sound of people talking",
                    "parameter": 2
                }
            }
        ]
    },
    "POST_PROCESS_natural": {
        "system_prompt": "You are an AI assistant translating detailed audio editing instructions into natural, conversational commands. Imagine how a person would casually ask for the edit. Your goal is to capture the *essence* of the request using varied, human-like phrasing, not just robotic simplification. Feel free to use synonyms, different sentence structures, and describe a closely related sound or action if it makes the command sound more authentic and fits the core intent (e.g., adding ambience, removing noise, inserting speech). Use a diverse vocabulary. For example, 'Blend in the subtle chirping of birds...' could become 'Add some bird sounds,' 'Weave in some chirping,' or 'Make it sound like birds are nearby.' Similarly, 'Carefully eliminate distracting background noise...' might become 'Clean up the background noise,' 'Get rid of the hum,' or 'Denoise this track.' Focus on naturalness and conveying the user's underlying goal.",
        "examples": [
            {
                "instruction": "Add some bird sounds",
                "inputs": {
                    "original_instruction": "Blend in the subtle chirping of birds to create a lively ambiance"
                }
            },
            {
                "instruction": "Make it sound like birds are nearby",
                "inputs": {
                    "original_instruction": "Blend in the subtle chirping of birds to create a lively ambiance"
                }
            },
            {
                "instruction": "Clean up the background noise",
                "inputs": {
                    "original_instruction": "Carefully eliminate distracting background noise from the recording"
                }
            },
            {
                "instruction": "Denoise this track",
                "inputs": {
                    "original_instruction": "Carefully eliminate distracting background noise from the recording"
                }
            },
            {
                "instruction": "Put some talking in the background",
                "inputs": {
                    "original_instruction": "Insert a man talking indistinctly in the background to simulate a crowd"
                }
            },
            {
                "instruction": "Add background chatter",
                "inputs": {
                    "original_instruction": "Insert a man talking indistinctly in the background to simulate a crowd"
                }
            }
        ]
    },
    "POST_PROCESS_minimal": {
        "system_prompt": "Role: AI Assistant specializing in audio command summarization.\nInput: A full, conversational audio editing command (e.g., \"Please apply a fade-out effect at the very end of the track.\").\nTask: Reduce the input command to its most essential keyword(s) or minimal phrase.\nProcess:\n\n\tIdentify the primary action (e.g., apply, remove, increase, cut).\n\tIdentify the primary subject/object (e.g., fade-out, noise, volume, intro).\n\tDiscard all extraneous words: politeness markers (please, could you), filler phrases (I think we should, I want to), articles (a, the), adverbs/adjectives unless essential (e.g., 'louder' is essential, 'annoying' in 'annoying noise' might not be).\n\tOutput the most compact representation, such as Subject alone if the action is strongly implied (e.g., 'Reverb', 'Noise'), or a concise Action + Subject (e.g., 'Apply reverb', 'Remove noise', 'Fade out').\n\tThe rephrased instruction MUST describe the exact same audio editing action or result as the original input. Do not change the core meaning or intended outcome of the edit.",
        "examples": [
            {
                "instruction": "Birds",
                "inputs": {
                    "original_instruction": "Add bird sounds."
                }
            },
            {
                "instruction": "Denoise",
                "inputs": {
                    "original_instruction": "Remove background noise"
                }
            },
            {
                "instruction": "Drop hum",
                "inputs": {
                    "original_instruction": "Remove the hum"
                }
            },
            {
                "instruction": "Louder",
                "inputs": {
                    "original_instruction": "Make it louder"
                }
            },
            {
                "instruction": "Louder",
                "inputs": {
                    "original_instruction": "Increase the volume"
                }
            },
            {
                "instruction": "Echo",
                "inputs": {
                    "original_instruction": "Apply an echo effect"
                }
            },
            {
                "instruction": "Reverb",
                "inputs": {
                    "original_instruction": "Add reverb"
                }
            },
            {
                "instruction": "Music",
                "inputs": {
                    "original_instruction": "Add some background music"
                }
            },
            {
                "instruction": "Softer",
                "inputs": {
                    "original_instruction": "Make it softer"
                }
            }
        ]
    },
    "POST_PROCESS_vari": {
        "system_prompt": "You are an AI assistant specializing in rephrasing audio editing instructions. Your task is to take a concise, functional audio editing command provided by the user and generate an alternative phrasing for the same instruction.\n\nCore Requirements:\n\n\tMaintain Functional Equivalence: This is CRITICAL. The rephrased instruction MUST describe the exact same audio editing action or result as the original input. Do not change the core meaning or intended outcome of the edit.\n\tNatural Language: The output should sound more like how a human might naturally give the instruction. It can be slightly more descriptive, conversational, or less technically rigid than the input, but should remain clear.\n\tControl Length: Keep the rephrased instruction concise. Aim for the output word count to be roughly between half (0.5x) and one-and-a-half times (1.5x) the word count of the original input instruction. Avoid making it significantly longer unless necessary for clarity or natural phrasing, especially for very short inputs.\n\tVariety in Phrasing: Aim for different sentence structures or word choices, not just simple synonym swaps, where appropriate, within the length constraint.\n\tFocus on the Instruction: Output only the rephrased instruction itself. Do not add explanations, questions, or conversational filler unless the rephrased instruction naturally includes it and fits the length guideline.\n\tUnderstand Audio Context: Recognize common audio editing terms (noise, reverb, EQ, specific sounds like sirens, talking, footsteps, etc.) and rephrase accordingly.",
        "examples": [
            {
                "instruction": "Add siren",
                "inputs": {
                    "original_instruction": "Add police siren"
                }
            },
            {
                "instruction": "Get rid of that humming sound",
                "inputs": {
                    "original_instruction": "Remove the hum"
                }
            },
            {
                "instruction": "Make it sound less bright",
                "inputs": {
                    "original_instruction": "Reduce high frequencies"
                }
            },
            {
                "instruction": "Remove waterfall",
                "inputs": {
                    "original_instruction": "Drop distant waterfall"
                }
            }
        ]
    }
}