[
    {
        "id": "0",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-01-01-02-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "1",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-01-01-01-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "2",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-01-01-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "3",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-01-01-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "4",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-01-01-02-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "5",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-01-01-01-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "6",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-01-01-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "7",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-01-01-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "8",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-01-01-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "9",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-01-01-01-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "10",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-01-01-01-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "11",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-01-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "12",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-01-01-01-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "13",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-01-01-02-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "14",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-01-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "15",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-01-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "16",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-01-01-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "17",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-01-01-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "18",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-01-01-01-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "19",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-01-01-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "20",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-01-01-01-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "21",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-01-01-01-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "22",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-01-01-01-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "23",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-01-01-01-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "24",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-01-01-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "25",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-01-01-02-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "26",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-01-01-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "27",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-01-01-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "28",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-01-01-02-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "29",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-01-01-02-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "30",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-01-01-02-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "31",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-01-01-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "32",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-01-01-01-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "33",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-01-01-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "34",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-01-01-01-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "35",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-01-01-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "36",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-01-01-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "37",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-01-01-02-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "38",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-01-01-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "39",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-01-01-02-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "40",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-01-01-01-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "41",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-01-01-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "42",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-01-01-02-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "43",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-01-01-01-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "44",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-01-01-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "45",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-01-01-02-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "46",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-01-01-01-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "47",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-01-01-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "48",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-01-01-02-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "49",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-01-01-01-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "50",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-01-01-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "51",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-01-01-02-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "52",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-01-01-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "53",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-01-01-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "54",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-01-01-01-02-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "55",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-01-01-02-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "56",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-01-01-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "57",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-01-01-01-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "58",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-01-01-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "59",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-01-01-01-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "60",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-01-01-02-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "61",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-01-01-02-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "62",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-02-02-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "63",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-02-01-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "64",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-02-02-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "65",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-02-01-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "66",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-02-01-02-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "67",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-02-02-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "68",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-02-01-01-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "69",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-02-01-01-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "70",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-02-01-01-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "71",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-02-02-01-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "72",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-02-02-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "73",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-02-02-01-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "74",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-02-02-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "75",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-02-02-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "76",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-02-01-01-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "77",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-02-02-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "78",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-02-02-02-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "79",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-02-01-02-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "80",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-02-01-01-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "81",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-02-02-01-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "82",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-02-01-01-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "83",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-02-02-01-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "84",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-02-02-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "85",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-02-01-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "86",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-02-02-02-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "87",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-02-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "88",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-02-02-02-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "89",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-02-01-01-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "90",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-02-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "91",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-02-01-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "92",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-02-02-02-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "93",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-02-02-01-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "94",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-02-01-01-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "95",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-02-02-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "96",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-02-02-02-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "97",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-02-02-02-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "98",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-02-02-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "99",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-02-01-02-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "100",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-02-02-02-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "101",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-02-01-01-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "102",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-02-01-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "103",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-02-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "104",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-02-01-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "105",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-02-02-02-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "106",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-02-01-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "107",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-02-01-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "108",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-02-01-02-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "109",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-02-01-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "110",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-02-02-02-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "111",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-02-01-01-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "112",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-02-02-01-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "113",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-02-01-01-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "114",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-02-02-01-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "115",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-02-01-02-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "116",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-02-02-02-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "117",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-02-02-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "118",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-02-01-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "119",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-02-02-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "120",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-02-01-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "121",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-02-02-01-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "122",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-02-02-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "123",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-02-02-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "calm"
            }
        ]
    },
    {
        "id": "124",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-03-01-02-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "125",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-03-01-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "126",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-03-02-01-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "127",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-03-01-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "128",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-03-02-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "129",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-03-02-01-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "130",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-03-02-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "131",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-03-01-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "132",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-03-02-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "133",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-03-02-01-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "134",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-03-02-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "135",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-03-01-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "136",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-03-02-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "137",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-03-02-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "138",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-03-01-01-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "139",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-03-02-02-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "140",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-03-02-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "141",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-03-02-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "142",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-03-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "143",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-03-01-01-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "144",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-03-01-02-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "145",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-03-01-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "146",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-03-02-01-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "147",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-03-02-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "148",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-03-02-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "149",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-03-02-02-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "150",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-03-01-01-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "151",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-03-01-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "152",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-03-01-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "153",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-03-02-02-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "154",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-03-01-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "155",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-03-01-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "156",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-03-01-01-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "157",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-03-02-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "158",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-03-01-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "159",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-03-02-01-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "160",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-03-01-01-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "161",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-03-01-02-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "162",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-03-01-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "163",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-03-02-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "164",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-03-02-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "165",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-03-02-01-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "166",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-03-01-02-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "167",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-03-02-02-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "168",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-03-01-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "169",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-03-02-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "170",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-03-01-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "171",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-03-02-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "172",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-03-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "173",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-03-02-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "174",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-03-01-02-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "175",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-03-02-01-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "176",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-03-01-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "177",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-03-01-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "178",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-03-01-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "179",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-03-02-02-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "180",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-03-02-01-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "181",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-03-01-01-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "182",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-03-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "183",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-03-01-01-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "184",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-03-02-01-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "185",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-03-02-01-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "happy"
            }
        ]
    },
    {
        "id": "186",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-04-01-02-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "187",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-04-02-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "188",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-04-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "189",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-04-02-02-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "190",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-04-01-01-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "191",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-04-02-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "192",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-04-01-01-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "193",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-04-02-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "194",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-04-01-01-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "195",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-04-02-01-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "196",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-04-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "197",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-04-01-02-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "198",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-04-02-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "199",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-04-02-02-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "200",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-04-02-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "201",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-04-01-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "202",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-04-02-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "203",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-04-01-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "204",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-04-01-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "205",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-04-01-02-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "206",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-04-02-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "207",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-04-01-02-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "208",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-04-01-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "209",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-04-01-01-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "210",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-04-02-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "211",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-04-01-01-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "212",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-04-02-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "213",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-04-01-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "214",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-04-02-01-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "215",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-04-01-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "216",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-04-02-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "217",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-04-01-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "218",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-04-02-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "219",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-04-02-02-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "220",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-04-01-02-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "221",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-04-02-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "222",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-04-01-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "223",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-04-01-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "224",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-04-02-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "225",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-04-01-01-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "226",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-04-02-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "227",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-04-02-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "228",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-04-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "229",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-04-02-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "230",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-04-02-01-02-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "231",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-04-01-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "232",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-04-02-02-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "233",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-04-02-01-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "234",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-04-02-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "235",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-04-01-02-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "236",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-04-02-02-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "237",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-04-02-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "238",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-04-02-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "239",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-04-01-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "240",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-04-01-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "241",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-04-01-02-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "242",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-04-02-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "243",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-04-01-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "244",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-04-02-01-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "245",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-04-02-01-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "246",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-04-02-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "247",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-04-02-01-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "sad"
            }
        ]
    },
    {
        "id": "248",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-05-02-01-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "249",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-05-02-02-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "250",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-05-01-02-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "251",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-05-01-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "252",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-05-01-02-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "253",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-05-02-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "254",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-05-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "255",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-05-02-02-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "256",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-05-02-01-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "257",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-05-02-01-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "258",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-05-02-02-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "259",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-05-01-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "260",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-05-01-01-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "261",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-05-02-01-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "262",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-05-02-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "263",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-05-02-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "264",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-05-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "265",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-05-02-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "266",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-05-02-01-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "267",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-05-01-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "268",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-05-02-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "269",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-05-02-02-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "270",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-05-02-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "271",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-05-02-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "272",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-05-01-01-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "273",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-05-02-02-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "274",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-05-02-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "275",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-05-01-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "276",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-05-01-02-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "277",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-05-01-02-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "278",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-05-02-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "279",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-05-01-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "280",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-05-02-01-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "281",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-05-01-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "282",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-05-01-02-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "283",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-05-02-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "284",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-05-02-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "285",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-05-01-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "286",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-05-01-02-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "287",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-05-01-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "288",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-05-01-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "289",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-05-01-02-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "290",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-05-01-01-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "291",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-05-01-02-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "292",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-05-01-02-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "293",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-05-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "294",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-05-02-01-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "295",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-05-01-02-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "296",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-05-02-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "297",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-05-02-01-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "298",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-05-02-01-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "299",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-05-01-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "300",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-05-02-02-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "301",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-05-02-02-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "302",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-05-02-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "303",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-05-01-01-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "304",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-05-01-01-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "305",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-05-02-02-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "306",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-05-01-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "307",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-05-01-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "308",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-05-01-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "309",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-05-01-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "angry"
            }
        ]
    },
    {
        "id": "310",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-06-01-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "311",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-06-02-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "312",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-06-01-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "313",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-06-01-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "314",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-06-01-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "315",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-06-02-01-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "316",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-06-02-02-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "317",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-06-01-02-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "318",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-06-01-01-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "319",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-06-02-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "320",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-06-02-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "321",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-06-02-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "322",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-06-02-01-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "323",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-06-01-01-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "324",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-06-02-01-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "325",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-06-02-02-02-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "326",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-06-01-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "327",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-06-02-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "328",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-06-02-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "329",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-06-02-02-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "330",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-06-01-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "331",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-06-01-02-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "332",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-06-01-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "333",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-06-01-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "334",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-06-02-01-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "335",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-06-01-01-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "336",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-06-01-01-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "337",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-06-01-02-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "338",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-06-01-02-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "339",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-06-01-01-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "340",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-06-02-02-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "341",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-06-01-02-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "342",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-06-01-02-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "343",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-06-01-02-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "344",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-06-01-01-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "345",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-06-02-01-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "346",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-06-02-01-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "347",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-06-01-02-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "348",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-06-01-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "349",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-06-01-01-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "350",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-06-01-02-02-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "351",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-06-02-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "352",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-06-01-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "353",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-06-02-01-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "354",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-06-01-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "355",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-06-02-01-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "356",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-06-01-01-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "357",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-06-02-01-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "358",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-06-02-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "359",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-06-01-01-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "360",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-06-01-01-01-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "361",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-06-01-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "362",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-06-02-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "363",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-06-02-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "364",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-06-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "365",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-06-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "366",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-06-02-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "367",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-06-02-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "368",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-06-02-01-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "369",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-06-01-02-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "370",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-06-01-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "371",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-06-01-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "fearful"
            }
        ]
    },
    {
        "id": "372",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-07-02-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "373",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-07-01-02-02-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "374",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-07-01-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "375",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-07-01-01-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "376",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-07-02-02-01-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "377",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-07-01-02-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "378",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-07-02-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "379",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-07-02-01-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "380",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-07-02-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "381",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-07-02-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "382",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-07-01-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "383",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-07-01-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "384",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-07-02-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "385",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-07-02-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "386",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-07-01-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "387",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_07/Actor_07/01-01-07-01-02-01-07.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "388",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-07-01-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "389",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-07-02-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "390",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-07-02-01-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "391",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-07-02-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "392",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-07-02-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "393",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-07-01-01-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "394",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-07-02-01-02-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "395",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-07-01-01-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "396",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-07-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "397",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-02-01-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "398",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-01-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "399",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-07-01-01-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "400",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-07-01-01-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "401",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-07-01-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "402",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-01-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "403",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-07-02-02-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "404",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-07-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "405",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-07-01-02-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "406",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-07-02-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "407",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-07-02-01-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "408",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-07-01-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "409",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-07-01-02-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "410",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-07-01-02-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "411",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-02-01-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "412",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-07-02-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "413",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-07-02-02-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "414",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-07-02-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "415",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-02-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "416",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-07-01-02-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "417",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-07-02-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "418",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-07-01-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "419",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-07-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "420",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-07-01-01-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "421",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-07-01-02-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "422",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-07-02-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "423",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-07-01-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "424",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-07-02-02-02-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "425",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-07-02-01-01-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "426",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_11/Actor_11/01-01-07-02-01-02-11.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "427",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-07-01-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "428",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-07-01-01-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "429",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-07-01-01-01-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "430",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-07-02-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "431",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-07-01-01-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "432",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-07-02-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "433",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-07-02-02-01-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "disgust"
            }
        ]
    },
    {
        "id": "434",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-08-01-02-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "435",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-08-02-02-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "436",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-02-01-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "437",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-08-01-02-01-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "438",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-02-02-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "439",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-01-02-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "440",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-08-01-01-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "441",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-08-01-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "442",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-08-02-02-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "443",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-08-02-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "444",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-08-01-02-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "445",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-08-01-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "446",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-02-02-01-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "447",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-02-01-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "448",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_13/Actor_13/01-01-08-02-02-02-13.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "449",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-08-02-02-01-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "450",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_06/Actor_06/01-01-08-02-02-01-06.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "451",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-08-02-02-01-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "452",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-08-02-02-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "453",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-08-02-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "454",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-08-01-01-01-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "455",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-08-01-01-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "456",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-08-02-01-01-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "457",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-08-02-02-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "458",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-08-01-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "459",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-08-02-01-02-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "460",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-08-02-01-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "461",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-08-01-01-01-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "462",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-08-02-02-01-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "463",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-08-02-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "464",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-08-01-02-01-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "465",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_05/Actor_05/01-01-08-02-01-01-05.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "466",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-08-02-01-02-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "467",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-08-01-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "468",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_08/Actor_08/01-01-08-01-02-02-08.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "469",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-08-02-01-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "470",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-08-02-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "471",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_20/Actor_20/01-01-08-01-02-02-20.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "472",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-08-02-01-02-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "473",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_24/Actor_24/01-01-08-01-01-02-24.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "474",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-08-02-02-02-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "475",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-08-02-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "476",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-08-02-01-02-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "477",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-08-02-02-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "478",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-08-01-02-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "479",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_09/Actor_09/01-01-08-01-02-01-09.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "480",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-08-02-01-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "481",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_18/Actor_18/01-01-08-02-02-02-18.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "482",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-08-01-02-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "483",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_17/Actor_17/01-01-08-01-01-02-17.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "484",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-08-02-01-01-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "485",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_01/Actor_01/01-01-08-01-01-01-01.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "486",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_19/Actor_19/01-01-08-01-01-02-19.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "487",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_15/Actor_15/01-01-08-01-02-01-15.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "488",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-08-02-02-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "489",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_14/Actor_14/01-01-08-01-01-02-14.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "490",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_10/Actor_10/01-01-08-02-01-02-10.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "491",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_21/Actor_21/01-01-08-02-02-02-21.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "492",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_23/Actor_23/01-01-08-01-01-02-23.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "493",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_22/Actor_22/01-01-08-01-01-02-22.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "494",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_12/Actor_12/01-01-08-01-02-01-12.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "495",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_16/Actor_16/01-01-08-01-02-02-16.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "surprised"
            }
        ]
    },
    {
        "id": "496",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-01-01-02-01-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "497",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_02/Actor_02/01-01-01-01-02-02-02.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "498",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_03/Actor_03/01-01-01-01-01-02-03.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    },
    {
        "id": "499",
        "video": "RAVDSS_speech/videos/Video_Speech_Actor_04/Actor_04/01-01-01-01-01-01-04.mp4",
        "conversations": [
            {
                "from": "human",
                "value": "<video>\nYou are a multimodal affective computing expert grounded in theory of mind. Your task is to infer the speaker’s current first-order emotional state during speech by reasoning about their mental world from observable evidence. Complete the speech emotion recognition task as follows: (1) Decode Affective Signals (Perceptual Simulation): Systematically decode speech-related cues as outward evidence of an inner state—acoustic-prosodic features (pitch level/range/contour, loudness/energy, tempo/rhythm, emphasis, voice quality such as breathy/tense/rough), articulatory-visual cues (mouth shaping, facial expression, gaze, head and upper-body movement, posture), and transcript content if provided. Briefly note the present cues and their likely emotion implications. (2) Synthesize a Mental State Hypothesis (Cognitive Empathy): Integrate the decoded cues into a coherent here-and-now emotion hypothesis. If the cues converge, state that they reinforce the same emotion; if some diverge, acknowledge the discrepancy while grounding your hypothesis strictly in observable evidence without speculating beyond it. (3) Attribute from the Speaker’s Perspective (Perspective-Taking): Refine the hypothesis to attribute the single most plausible immediate emotion from the speaker’s perspective (not the listener’s reaction and not a stable trait). (4) Conclude and Map to Label: Provide your reasoning following the steps above, then choose exactly one label from: neutral, calm, happy, sad, angry, fearful, surprised, disgust, and output exactly {'emotion': 'label'}."
            },
            {
                "from": "gpt",
                "value": "neutral"
            }
        ]
    }
]