{
    "question_token_to_idx": {
        "<NULL>": 0,
        "<UNK>": 1,
        "is": 2,
        "there": 3,
        "a": 4,
        "voiceover": 5,
        "narrator": 6,
        "present": 7,
        "exists": 8,
        "does": 9,
        "it": 10,
        "have": 11,
        "do": 12,
        "you": 13,
        "hear": 14,
        "narration": 15,
        "can": 16,
        "voice-over": 17,
        "voice": 18,
        "actor": 19,
        "artist": 20,
        "commentary": 21,
        "dubbing": 22,
        "recording": 23,
        "performance": 24,
        "presentation": 25,
        "interpretation": 26,
        "available": 27,
        "included": 28,
        "featured": 29,
        "provided": 30,
        "the": 31,
        "trumpet": 32,
        "in": 33,
        "video": 34,
        "always": 35,
        "playing": 36,
        "'s": 37,
        "constantly": 38,
        "on": 39,
        "all": 40,
        "time": 41,
        "plays": 42,
        "continuously": 43,
        "right": 44,
        "correct": 45,
        "that": 46,
        "keeps": 47,
        "continually": 48,
        "play": 49,
        "anyone": 50,
        "entire": 51,
        "i": 52,
        "believe": 53,
        "am": 54,
        "played": 55,
        "somebody": 56,
        "never": 57,
        "stops": 58,
        "nonstop": 59,
        "throughout": 60,
        "continuous": 61,
        "how": 62,
        "many": 63,
        "instruments": 64,
        "are": 65,
        "sounding": 66,
        "be": 67,
        "heard": 68,
        "different": 69,
        "has": 70,
        "name": 71,
        "from": 72,
        "musical": 73,
        "audible": 74,
        "distinct": 75,
        "being": 76,
        "what": 77,
        "variety": 78,
        "of": 79,
        "count": 80,
        ",": 81,
        "contains": 82,
        "unique": 83,
        "congas": 84,
        "sound": 85,
        "sounds": 86,
        "number": 87,
        "by": 88,
        "people": 89,
        "made": 90,
        "make": 91,
        "out": 92,
        "sound-producing": 93,
        "audio-only": 94,
        "tuba": 95,
        "violin": 96,
        "pipa": 97,
        "types": 98,
        "kinds": 99,
        "identify": 100,
        "audio": 101,
        "player": 102,
        "musicians": 103,
        "used": 104,
        "did": 105,
        "use": 106,
        "were": 107,
        "musician": 108,
        "pieces": 109,
        "this": 110,
        "identified": 111,
        "together": 112,
        "players": 113,
        "acoustic_guitar": 114,
        "ukulele": 115,
        "clarinet": 116,
        "electric_bass": 117,
        "banjo": 118,
        "erhu": 119,
        "bagpipe": 120,
        "suona": 121,
        "piano": 122,
        "cello": 123,
        "bassoon": 124,
        "instrument": 125,
        "correspond": 126,
        "to": 127,
        "coming": 128,
        "music": 129,
        "match": 130,
        "appearing": 131,
        "makes": 132,
        "comes": 133,
        "'": 134,
        "making": 135,
        "come": 136,
        "and": 137,
        "produced": 138,
        "generated": 139,
        "created": 140,
        "originating": 141,
        "derived": 142,
        "caused": 143,
        "triggered": 144,
        "resulting": 145,
        "issuing": 146,
        "drum": 147,
        "accordion": 148,
        "saxophone": 149,
        "flute": 150,
        "guzheng": 151,
        "not": 152,
        "beginning": 153,
        "end": 154,
        "start": 155,
        "finish": 156,
        "n't": 157,
        "at": 158,
        "but": 159,
        "none": 160,
        "them": 161,
        "any": 162,
        "get": 163,
        "touched": 164,
        "noise": 165,
        "been": 166,
        "way": 167,
        "through": 168,
        "appear": 169,
        "performer": 170,
        "touch": 171,
        "completely": 172,
        "unplayed": 173,
        "entirely": 174,
        "silent": 175,
        "sounded": 176,
        "performed": 177,
        "instances": 178,
        "performers": 179,
        "even": 180,
        "left": 181,
        "which": 182,
        "kind": 183,
        "type": 184,
        "see": 185,
        "may": 186,
        "seen": 187,
        "side": 188,
        "called": 189,
        "located": 190,
        "next": 191,
        "equipment": 192,
        "situated": 193,
        "whole": 194,
        "feature": 195,
        "point": 196,
        "scene": 197,
        "with": 198,
        "include": 199,
        "viewed": 200,
        "film": 201,
        "footage": 202,
        "notice": 203,
        "during": 204,
        "times": 205,
        "shown": 206,
        "section": 207,
        "contain": 208,
        "among": 209,
        "displayed": 210,
        "ever": 211,
        "if": 212,
        "includes": 213,
        "including": 214,
        "show": 215,
        "presented": 216,
        "display": 217,
        "demonstrated": 218,
        "demonstrate": 219,
        "xylophone": 220,
        "where": 221,
        "held": 222,
        "takes": 223,
        "place": 224,
        "going": 225,
        "take": 226,
        "taking": 227,
        "conducted": 228,
        "event": 229,
        "happening": 230,
        "staged": 231,
        "executed": 232,
        "carried": 233,
        "done": 234,
        "put": 235,
        "third": 236,
        "enters": 237,
        "room": 238,
        "enter": 239,
        "appears": 240,
        "arrive": 241,
        "arrives": 242,
        "shows": 243,
        "up": 244,
        "thing": 245,
        "was": 246,
        "piece": 247,
        "entered": 248,
        "overall": 249,
        "visible": 250,
        "total": 251,
        "consist": 252,
        "recognize": 253,
        "spot": 254,
        "found": 255,
        "amount": 256,
        "first": 257,
        "for": 258,
        "associated": 259,
        "think": 260,
        "tone": 261,
        "existing": 262,
        "reverberating": 263,
        "resonating": 264,
        "emanating": 265,
        "ear": 266,
        "detected": 267,
        "perceived": 268,
        "produce": 269,
        "sound-creating": 270,
        "tool": 271,
        "sound-emitting": 272,
        "sound-making": 273,
        "sound-generating": 274,
        "noise-producing": 275,
        "device": 276,
        "appeared": 277,
        "apparatus": 278,
        "noise-emitting": 279,
        "occurred": 280,
        "noise-generating": 281,
        "louder": 282,
        "than": 283,
        "when": 284,
        "compared": 285,
        "overpowered": 286,
        "more": 287,
        "resonant": 288,
        "clamorous": 289,
        "dominate": 290,
        "booming": 291,
        "thunderous": 292,
        "overpower": 293,
        "strident": 294,
        "vociferous": 295,
        "drowned": 296,
        "sonorous": 297,
        "deafening": 298,
        "raucous": 299,
        "exist": 300,
        "currently": 301,
        "producing": 302,
        "last": 303,
        "responsible": 304,
        "generates": 305,
        "emits": 306,
        "produces": 307,
        "creates": 308,
        "gives": 309,
        "off": 310,
        "rhythmic": 311,
        "less": 312,
        "cadenced": 313,
        "metronomic": 314,
        "stronger": 315,
        "rhythm": 316,
        "weaker": 317,
        "cadence": 318,
        "beat": 319,
        "rhythmically": 320,
        "inclined": 321,
        "pleasing": 322,
        "engaging": 323,
        "longer": 324,
        "shorter": 325,
        "actually": 326,
        "period": 327,
        "duration": 328,
        "stretch": 329,
        "span": 330,
        "interval": 331,
        "run": 332,
        "length": 333,
        "space": 334,
        "measure": 335,
        "quantum": 336,
        "second": 337,
        "after": 338,
        "whose": 339,
        "capable": 340,
        "create": 341,
        "generate": 342,
        "emit": 343,
        "deliver": 344,
        "provide": 345,
        "manufacture": 346,
        "fabricate": 347,
        "construct": 348,
        "compose": 349,
        "leftest": 350,
        "sort": 351,
        "classified": 352,
        "as": 353,
        "categorized": 354,
        "considered": 355,
        "rightest": 356,
        "before": 357,
        "class": 358,
        "form": 359,
        "style": 360,
        "middle": 361,
        "sorts": 362,
        "quantity": 363,
        "varieties": 364,
        "could": 365,
        "much": 366,
        "course": 367,
        "sense": 368,
        "one": 369,
        "better": 370,
        "perform": 371,
        "guy": 372,
        "person": 373,
        "stage": 374,
        "greater": 375,
        "tempo": 376,
        "same": 377,
        "simultaneously": 378,
        "unison": 379,
        "background": 380,
        "concurrently": 381,
        "work": 382,
        "composition": 383,
        "other": 384,
        "tandem": 385,
        "conjunction": 386,
        "along": 387,
        "harmony": 388,
        "loudest": 389,
        "location": 390,
        "source": 391,
        "please": 392,
        "locate": 393,
        "determine": 394,
        "lowest": 395,
        "higher": 396,
        "loudness": 397,
        "level": 398,
        "volume": 399,
        "exceed": 400,
        "ones": 401,
        "boisterous": 402,
        "version": 403,
        "noisier": 404,
        "component": 405,
        "element": 406,
        "part": 407,
        "gadget": 408,
        "implement": 409,
        "mechanism": 410,
        "tooling": 411,
        "utensil": 412,
        "instrumentation": 413,
        "machinery": 414,
        "contraption": 415,
        "fifth": 416,
        "fourth": 417,
        "?": 418
    },
    "answer_token_to_idx": {
        "yes": 0,
        "no": 1,
        "two": 2,
        "one": 3,
        "indoor": 4,
        "violin": 5,
        "zero": 6,
        "three": 7,
        "piano": 8,
        "left": 9,
        "right": 10,
        "cello": 11,
        "acoustic_guitar": 12,
        "simultaneously": 13,
        "flute": 14,
        "accordion": 15,
        "saxophone": 16,
        "middle": 17,
        "four": 18,
        "ukulele": 19,
        "outdoor": 20,
        "clarinet": 21,
        "trumpet": 22,
        "bassoon": 23,
        "tuba": 24,
        "guzheng": 25,
        "erhu": 26,
        "drum": 27,
        "banjo": 28,
        "pipa": 29,
        "xylophone": 30,
        "bagpipe": 31,
        "electric_bass": 32,
        "suona": 33,
        "five": 34,
        "congas": 35,
        "six": 36,
        "more than ten": 37,
        "eight": 38,
        "seven": 39,
        "nine": 40,
        "ten": 41
    },
    "question_answer_token_to_idx": {
        "<NULL>": 0,
        "<UNK>": 1
    }
}