[
    {
        "question_id": "bc8526d4805e2554adb2e9c01736d3f3a3b19895",
        "doc_id": "1604.02038",
        "question": "What baselines did they compare with?"
    },
    {
        "question_id": "a0fd0c0fe042ad045b8d5095c81643ef3a352b81",
        "doc_id": "1604.02038",
        "question": "Which tasks are explored in this paper?"
    },
    {
        "question_id": "2d91554c3f320a4bcfeb00aa466309074a206712",
        "doc_id": "1903.01411",
        "question": "Which metrics do they use to evaluate results?"
    },
    {
        "question_id": "53362c2870cf76b7981c27b3520a71eb1e3e7965",
        "doc_id": "1903.01411",
        "question": "Does the performance increase with the number of used languages?"
    },
    {
        "question_id": "5138121b9e9bd56962e69bfe49d5df5301cb7745",
        "doc_id": "1903.01411",
        "question": "By how much do they outperform translating without contextual information?"
    },
    {
        "question_id": "181027f398a6b79b1ba44d8d41cc1aba0d6f5212",
        "doc_id": "1709.09749",
        "question": "what other representations do they compare with?"
    },
    {
        "question_id": "ab097db03652b8b38edddc074f23e2adf9278cba",
        "doc_id": "1709.09749",
        "question": "how many layers are in the neural network?"
    },
    {
        "question_id": "5d4190403eb800bb17eec71e979788e11cf74e67",
        "doc_id": "1709.09749",
        "question": "what empirical evaluations performed?"
    },
    {
        "question_id": "56d41e0fcc288c1e65806ae77097d685c83e22db",
        "doc_id": "1709.09749",
        "question": "which document understanding tasks did they evaluate on?"
    },
    {
        "question_id": "1237b6fcc64b43901415f3ded17cc210a54ab698",
        "doc_id": "1709.09749",
        "question": "what dataset was used?"
    },
    {
        "question_id": "742d5e182b57bfa5f589fde645717ed0ac3f49c2",
        "doc_id": "1610.09722",
        "question": "what metrics are used to evaluate the models?"
    },
    {
        "question_id": "726c5c1b6951287f4bae22978f9a91d22d9bef61",
        "doc_id": "1610.09722",
        "question": "what are the baselines?"
    },
    {
        "question_id": "dfdd309e56b71589b25412ba85b0a5d79a467ceb",
        "doc_id": "1610.09722",
        "question": "what is the size of the dataset?"
    },
    {
        "question_id": "7ae95716977d39d96e871e552c35ca0753115229",
        "doc_id": "1610.09722",
        "question": "what dataset did they use?"
    },
    {
        "question_id": "96459b02efa82993a0b413530ed0b517c6633eea",
        "doc_id": "1908.10090",
        "question": "what were the length constraints they set?"
    },
    {
        "question_id": "6c1614991647705265fb348d28ba60dd3b63b799",
        "doc_id": "1908.10090",
        "question": "what is the test set size?"
    },
    {
        "question_id": "693cdb9978749db04ba34d9c168e71534f00a226",
        "doc_id": "1905.07562",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "71fd0efea1b441d86d9a75255815ba3efe09779b",
        "doc_id": "1905.07562",
        "question": "How do the authors measure the extent to which LGI has learned the task?"
    },
    {
        "question_id": "fb9e333a4e5d5141fe8e97b24b8f7e5685afbf09",
        "doc_id": "1905.07562",
        "question": "Which 8 tasks has LGI learned?"
    },
    {
        "question_id": "cb029240d4dedde74fcafad6a46c1cfc2621b934",
        "doc_id": "1905.07562",
        "question": "In what way does an LSTM mimic the prefrontal cortex?"
    },
    {
        "question_id": "11a8531699952f5a2286a4311f0fe80ed1befa1e",
        "doc_id": "1905.07562",
        "question": "In what way does an LSTM mimic the intra parietal sulcus?"
    },
    {
        "question_id": "bcf222ad4bb537b01019ed354ea03cd6bf2c1f8e",
        "doc_id": "1905.07562",
        "question": "How do the authors define imagination, or imagined scenarios?"
    },
    {
        "question_id": "b70e4c49300dc3eab18e907ab903afd2a0c6075a",
        "doc_id": "1909.00437",
        "question": "What are the five downstream tasks?"
    },
    {
        "question_id": "088d42ecb1e15515f6a97a0da2fed81b61d61a23",
        "doc_id": "1909.00437",
        "question": "Is this more effective for low-resource than high-resource languages?"
    },
    {
        "question_id": "8599d6d14ac157169920c73b98a79737c7a68cf5",
        "doc_id": "1909.00437",
        "question": "Is mBERT fine-tuned for each language?"
    },
    {
        "question_id": "f1d61b44105e651925d02a51e6d7ea10ea28ebd8",
        "doc_id": "1909.00437",
        "question": "How did they select the 50 languages they test?"
    },
    {
        "question_id": "52f5249a9a2cb7210eeb8e52cb29d18912f6c3aa",
        "doc_id": "1911.06815",
        "question": "How many layers does the neural network have?"
    },
    {
        "question_id": "baad4b6f834d5944f61bd12f30908e3cf3739dcd",
        "doc_id": "1911.06815",
        "question": "Which BERT-based baselines do they compare to?"
    },
    {
        "question_id": "37b972a3afae04193411dc569f672d802c16ad71",
        "doc_id": "1911.06815",
        "question": "What are the propaganda types?"
    },
    {
        "question_id": "a01af34c7f630ba0e79e0a0120d2e1c92d022df5",
        "doc_id": "1911.06815",
        "question": "Do they look at various languages?"
    },
    {
        "question_id": "0c4e419fe57bf01d58a44f3e263777c22cdd90dc",
        "doc_id": "1911.06815",
        "question": "What datasets did they use in their experiment?"
    },
    {
        "question_id": "d66c31f24f582c499309a435ec3c688dc3a41313",
        "doc_id": "1907.00937",
        "question": "What were the baseline methods?"
    },
    {
        "question_id": "c47312f2ca834ee75fa9bfbf912ea04239064117",
        "doc_id": "1907.00937",
        "question": "What dataset is used for training?"
    },
    {
        "question_id": "f8fe4049bea86d0518d1881f32049e60526d0f34",
        "doc_id": "1709.05700",
        "question": "What existing techniques do the authors compare against?"
    },
    {
        "question_id": "5499440674f0e4a9d6912b9ac29fa1f7b7cd5253",
        "doc_id": "1811.01183",
        "question": "Do they compare to previous work?"
    },
    {
        "question_id": "de313b5061fc22e8ffef1706445728de298eae31",
        "doc_id": "1811.01183",
        "question": "What is the source of their data?"
    },
    {
        "question_id": "47b7bc232af7bf93338bd3926345e23e9e80c0c1",
        "doc_id": "1811.01183",
        "question": "What is their binary classifier?"
    },
    {
        "question_id": "0b5c599195973c563c4b1a0fe5d8fc77204d71a0",
        "doc_id": "1811.01183",
        "question": "How long is their dataset?"
    },
    {
        "question_id": "1397b1c51f722a4ee2b6c64dc9fc6afc8bd3e880",
        "doc_id": "1811.01183",
        "question": "What is a study descriptor?"
    },
    {
        "question_id": "d4b84f48460517bc0a6d4e0c38f6853c58081166",
        "doc_id": "1804.01155",
        "question": "How do they combine the socioeconomic maps with Twitter data? "
    },
    {
        "question_id": "90756bdcd812b7ecc1c5df2298aa7561fd2eb02c",
        "doc_id": "1804.01155",
        "question": "Does the fact that people are active during the day time define their SEC?"
    },
    {
        "question_id": "028d0d9b7a71133e51a14a32cd09dea1e2f39f05",
        "doc_id": "1804.01155",
        "question": "How did they define standard language?"
    },
    {
        "question_id": "cfc73e0c82cf1630b923681c450a541a964688b9",
        "doc_id": "1804.01155",
        "question": "How do they operationalize socioeconomic status from twitter user data?"
    },
    {
        "question_id": "7c398615141ca416a32c9f72dbb785d3a6986a0f",
        "doc_id": "1911.03090",
        "question": "In what tasks does fine-tuning all layers hurt performance?"
    },
    {
        "question_id": "441be93e2830cc0fc65afad6959db92754c9f5a8",
        "doc_id": "1911.03090",
        "question": "Do they test against the large version of RoBERTa?"
    },
    {
        "question_id": "5f25b57a1765682331e90a46c592a4cea9e3a336",
        "doc_id": "1912.04979",
        "question": "Are face tracking, identification, localization etc multimodal inputs in some ML model or system is programmed by hand?"
    },
    {
        "question_id": "2ba2ff6c21a16bd295b07af1ef635b3b4c5bd17e",
        "doc_id": "1912.04979",
        "question": "What are baselines used?"
    },
    {
        "question_id": "3e5162e6399c7d03ecc7007efd21d06c04cf2843",
        "doc_id": "1911.11025",
        "question": "Do the authors report only on English data?"
    },
    {
        "question_id": "bd255aadf099854541d06997f83a0e478f526120",
        "doc_id": "1911.11025",
        "question": "How is the impact of ParityBOT analyzed?"
    },
    {
        "question_id": "a9ff35f77615b3a4e7fd7b3a53d0b288a46f06ce",
        "doc_id": "1911.11025",
        "question": "What public online harassment datasets was the system validated on?"
    },
    {
        "question_id": "69a46a227269c3aac9bf9d7c3d698c787642f806",
        "doc_id": "1911.11025",
        "question": "Where do the supportive tweets about women come from? Are they automatically or manually generated?"
    },
    {
        "question_id": "ebe6b8ec141172f7fea66f0a896b3124276d4884",
        "doc_id": "1911.11025",
        "question": "How are the hateful tweets aimed at women detected/classified?"
    },
    {
        "question_id": "fa5f5f58f6277a1e433f80c9a92a5629d6d9a271",
        "doc_id": "2002.09616",
        "question": "What evaluation metrics did they use?"
    },
    {
        "question_id": "3b9da1af1550e01d2e6ba2b9edf55a289f5fa8e2",
        "doc_id": "2002.09616",
        "question": "By how much does their model outperform the baseline?"
    },
    {
        "question_id": "f88f45ef563ea9e40c5767ab2eaa77f4700f95f8",
        "doc_id": "2002.09616",
        "question": "Which models did they compare with?"
    },
    {
        "question_id": "99e99f2c25706085cd4de4d55afe0ac43213d7c8",
        "doc_id": "2002.09616",
        "question": "What is the source of their datasets?"
    },
    {
        "question_id": "7c561db6847fb0416bca8a6cb5eebf689a4b1438",
        "doc_id": "1910.10762",
        "question": "What sizes were their datasets?"
    },
    {
        "question_id": "13eb64957478ade79a1e81d32e36ee319209c19a",
        "doc_id": "1910.10762",
        "question": "How many layers does their model have?"
    },
    {
        "question_id": "3cfe464052f0a248b6e22c9351279403dfe34f3c",
        "doc_id": "1910.10762",
        "question": "What is their model's architecture?"
    },
    {
        "question_id": "119c404da6e42d4879eee10edeab4b2851162659",
        "doc_id": "1910.10762",
        "question": "What languages did they use?"
    },
    {
        "question_id": "c728fe6137f114c02e921f9be4a02a5bd83ae787",
        "doc_id": "1809.08935",
        "question": "what features of the essays are extracted?"
    },
    {
        "question_id": "50bda708293532f07a3193aaea0519d433fcc040",
        "doc_id": "1809.08935",
        "question": "what were the evaluation metrics?"
    },
    {
        "question_id": "46e660becd727c994a2a35c6587e15ea8bf8272d",
        "doc_id": "1809.08935",
        "question": "what model is used?"
    },
    {
        "question_id": "d1a4529ea32aaab5ca3b9d9ae5c16f146c23af6b",
        "doc_id": "1809.08935",
        "question": "what future work is described?"
    },
    {
        "question_id": "7fba61426737394304e307cdc7537225f6253150",
        "doc_id": "1809.08935",
        "question": "what was the baseline?"
    },
    {
        "question_id": "37ac705166fa87dc74fe86575bf04bea56cc4930",
        "doc_id": "2004.02363",
        "question": "what evaluation metrics were used?"
    },
    {
        "question_id": "90aba75508aa145475d7cc9a501bbe987c0e8413",
        "doc_id": "2004.02363",
        "question": "What datasets are used?"
    },
    {
        "question_id": "3a01dc85ac983002fd631f1c28fc1cbe16094c24",
        "doc_id": "1709.05453",
        "question": "How do you incorporate commonsense into an LSTM?"
    },
    {
        "question_id": "00ffe2c59a3ba18d6d2b353d6ab062a152c88526",
        "doc_id": "1709.05453",
        "question": "Which domain are the conversations in?"
    },
    {
        "question_id": "042800c3336ed5f4826203616a39747c61382ba6",
        "doc_id": "1709.05453",
        "question": "Which commonsense knowledge base are they using?"
    },
    {
        "question_id": "6157567c5614e1954b801431fec680f044e102c6",
        "doc_id": "1610.03807",
        "question": "Was the filtering based on fluency and domain relevance done automatically?"
    },
    {
        "question_id": "8ea4a75dacf6a39f9d385ba14b3dce715a47d689",
        "doc_id": "1610.03807",
        "question": "How was domain relevance estimated?"
    },
    {
        "question_id": "1e11e74481ead4b7635922bbe0de041dc2dde28d",
        "doc_id": "1610.03807",
        "question": "How many hand-crafted templates did they have to make?"
    },
    {
        "question_id": "597d3fc9b8c0c036f58cea5b757d0109d5211b2f",
        "doc_id": "1610.03807",
        "question": "How was the fluency measured?"
    },
    {
        "question_id": "230f127e83ac62dd65fccf6b1a4960cf0f7316c7",
        "doc_id": "2004.02401",
        "question": "How are experiments designed to measure impact on performance by different choices?"
    },
    {
        "question_id": "75c221920bee14a6153bd5f4c1179591b2f48d59",
        "doc_id": "2004.02401",
        "question": "What impact on performance is shown for different choices of optimizers and learning rate policies?"
    },
    {
        "question_id": "4379a3ece3fdb93b71db43f62833f5f724c49842",
        "doc_id": "1909.13184",
        "question": "Do the authors report results on only English datasets?"
    },
    {
        "question_id": "0abc2499195185c94837e0340d00cd3b83ee795e",
        "doc_id": "1909.13184",
        "question": "What are the characteristics of the dataset of Twitter users?"
    },
    {
        "question_id": "138ad61b43c85d5db166ea9bd3d3b19bb2e2bbfb",
        "doc_id": "1909.13184",
        "question": "How can an existing bot detection system be customized for health-related research?"
    },
    {
        "question_id": "7e906dc00e92088a25df3719104d1750e5a27485",
        "doc_id": "1909.13184",
        "question": "What type of health-related research takes place in social media?"
    },
    {
        "question_id": "641fe5dc93611411582e6a4a0ea2d5773eaf0310",
        "doc_id": "1802.03052",
        "question": "What does it mean for sentences to be \"lexically overlapping\"?"
    },
    {
        "question_id": "7d34cdd9cb1c988e218ce0fd59ba6a3b5de2024a",
        "doc_id": "1802.03052",
        "question": "How many tables are in the tablestore?"
    },
    {
        "question_id": "f71b95001dce46ee35cdbd8d177676de19ca2611",
        "doc_id": "1911.07620",
        "question": "What regularization methods are used?"
    },
    {
        "question_id": "5aa6556ffd7142933f820a015f1294d38e8cd96c",
        "doc_id": "1911.07620",
        "question": "What metrics are used?"
    },
    {
        "question_id": "10edfb9428b8a4652274c13962917662fdf84f8a",
        "doc_id": "1911.07620",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "a836ab8eb5a72af4b0a0c83bf42a2a14d1b38763",
        "doc_id": "1911.07620",
        "question": "What dataset do they use?"
    },
    {
        "question_id": "91e361e85c6d3884694f3c747d61bfcef171bab0",
        "doc_id": "1909.12079",
        "question": "How do they obtain the entity linking results in their model?"
    },
    {
        "question_id": "6295951fda0cfa2eb4259d544b00bc7dade7c01e",
        "doc_id": "1909.12079",
        "question": "Which model architecture do they use?"
    },
    {
        "question_id": "3f717e6eceab0a066af65ddf782c1ebc502c28c0",
        "doc_id": "1909.12079",
        "question": "Which datasets do they evaluate on?"
    },
    {
        "question_id": "0f1f81b6d4aa0da38b4cc8b060926e7df61bb646",
        "doc_id": "1909.05190",
        "question": "What is the machine learning method used to make the predictions?"
    },
    {
        "question_id": "ec62df859ad901bf0848f0a8b91eedc78dba5657",
        "doc_id": "1909.05190",
        "question": "How is the event prediction task evaluated?"
    },
    {
        "question_id": "ccec4f8deff651858f44553f8daa5a19e8ed8d3b",
        "doc_id": "1909.05190",
        "question": "What are the datasets used in the paper?"
    },
    {
        "question_id": "15aeda407ae3912419fd89211cdb98989d9cde58",
        "doc_id": "1908.08788",
        "question": "What pretrained language representations are used?"
    },
    {
        "question_id": "c8b2fb9e0d5fb9014a25b88d559d93b6dceffbc0",
        "doc_id": "1908.08788",
        "question": "How many instances are explored in the few-shot experiments?"
    },
    {
        "question_id": "c24f7c030010ad11e71ef4912fd79093503f3a8d",
        "doc_id": "1908.08788",
        "question": "What tasks are explored?"
    },
    {
        "question_id": "620b6c410a055295d137511d3c99207a47c03b5e",
        "doc_id": "1908.09590",
        "question": "How significant are the improvements over previous approaches?"
    },
    {
        "question_id": "e459760879f662b2205cbdc0f5396dbfe41323ae",
        "doc_id": "1908.09590",
        "question": "Which other tasks are evaluated?"
    },
    {
        "question_id": "1c3a20dceec2a86fb61e70fab97a9fb549b5c54c",
        "doc_id": "1908.09590",
        "question": "What are the performances associated to different attribute placing?"
    },
    {
        "question_id": "68f1df3fb0703ff694a055d23e7ec3f6fb449b8d",
        "doc_id": "1801.03615",
        "question": "what is the previous work they are comparing to?"
    },
    {
        "question_id": "da10e3cefbbd7ec73eabc6c93d338239ce84709e",
        "doc_id": "1612.00866",
        "question": "What new advances are included in this dataset?"
    },
    {
        "question_id": "00c443f8d32d6baf7c7cea8f4ca9fa749532ccfd",
        "doc_id": "1612.00866",
        "question": "What language is this dataset in?"
    },
    {
        "question_id": "5da9e2eef741bd7efccec8e441b8e52e906b2d2d",
        "doc_id": "1804.00982",
        "question": "do they compare their system with other systems?"
    },
    {
        "question_id": "77bc886478925c8e9fb369b1ba5d05c42b3cd79a",
        "doc_id": "1804.00982",
        "question": "what is the architecture of their model?"
    },
    {
        "question_id": "f15bc40960bd3f81bc791f43ab5c94c52378692d",
        "doc_id": "1804.00982",
        "question": "what dataset did they use for this tool?"
    },
    {
        "question_id": "d5d4504f419862275a532b8e53d0ece16e0ae8d1",
        "doc_id": "1711.11118",
        "question": "How many of the attribute-value pairs are found in video?"
    },
    {
        "question_id": "f1e70b63c45ab0fc35dc63de089c802543e30c8f",
        "doc_id": "1711.11118",
        "question": "How many of the attribute-value pairs are found in audio?"
    },
    {
        "question_id": "39d20b396f12f0432770c15b80dc0d740202f98d",
        "doc_id": "1711.11118",
        "question": "How many of the attribute-value pairs are found in images?"
    },
    {
        "question_id": "4e0df856b39055a9ba801cc9c8e56d5b069bda11",
        "doc_id": "1711.11118",
        "question": "How many of the attribute-value pairs are found in semi-structured text?"
    },
    {
        "question_id": "bbc6d0402cae16084261f8558cebb4aa6d5b1ea5",
        "doc_id": "1711.11118",
        "question": "How many of the attribute-value pairs are found in unstructured text?"
    },
    {
        "question_id": "a7e03d24549961b38e15b5386d9df267900ef4c8",
        "doc_id": "1711.11118",
        "question": "How many different semi-structured templates are represented in the data?"
    },
    {
        "question_id": "036c400424357457e42b22df477b7c3cdc2eefe9",
        "doc_id": "1711.11118",
        "question": "Are all datapoints from the same website?"
    },
    {
        "question_id": "63eda2af88c35a507fbbfda0ec1082f58091883a",
        "doc_id": "1711.11118",
        "question": "Do they consider semi-structured webpages?"
    },
    {
        "question_id": "50c441a9cc7345a0fa408d1ce2e13f194c1e82a8",
        "doc_id": "1911.11161",
        "question": "What is the state-of-the-art approach?"
    },
    {
        "question_id": "44bf3047ff7e5c6b727b2aaa0805dd66c907dcd6",
        "doc_id": "1911.12237",
        "question": "How many abstractive summarizations exist for each dialogue?"
    },
    {
        "question_id": "c6f2598b85dc74123fe879bf23aafc7213853f5b",
        "doc_id": "1911.12237",
        "question": "How is human evaluators' judgement measured, what was the criteria?"
    },
    {
        "question_id": "bdae851d4cf1d05506cf3e8359786031ac4f756f",
        "doc_id": "1911.12237",
        "question": "What models have been evaluated?"
    },
    {
        "question_id": "894bbb1e42540894deb31c04cba0e6cfb10ea912",
        "doc_id": "1911.12237",
        "question": "Do authors propose some better metric than ROUGE for measurement of abstractive dialogue summarization?"
    },
    {
        "question_id": "75b3e2d2caec56e5c8fbf6532070b98d70774b95",
        "doc_id": "1911.12237",
        "question": "How big is SAMSum Corpus?"
    },
    {
        "question_id": "014830892d93e3c01cb659ad31c90de4518d48f3",
        "doc_id": "2004.02143",
        "question": "How much did the model outperform"
    },
    {
        "question_id": "ae7c5cf9c2c121097eb00d389cfd7cc2a5a7d577",
        "doc_id": "2004.02143",
        "question": "What language is in the dataset?"
    },
    {
        "question_id": "af948ea91136c700957b438d927f58d9b051c97c",
        "doc_id": "2004.02143",
        "question": "How big is the HotPotQA dataset?"
    },
    {
        "question_id": "00e4c9aa87411dfc5455fc92f10e5c9266e7b95e",
        "doc_id": "1710.06923",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "54b0d2df6ee27aaacdaf7f9c76c897b27e534667",
        "doc_id": "1710.06923",
        "question": "Do they compare their proposed domain adaptation methods to some existing methods?"
    },
    {
        "question_id": "b9a3836cff16af7454c7a8b0e5ff90206d0db1f5",
        "doc_id": "1710.06923",
        "question": "Which of their proposed domain adaptation methods proves best overall?"
    },
    {
        "question_id": "99554d0c76fbaef90bce972700fa4c315f961c31",
        "doc_id": "1710.06923",
        "question": "Do they use evolutionary-based optimization algorithms as one of their domain adaptation approaches?"
    },
    {
        "question_id": "5dfd58f91e7740899c23ebfe04b7176edce9ead2",
        "doc_id": "1809.03391",
        "question": "what is the size of the idn tagged corpus?"
    },
    {
        "question_id": "c09bceea67273c10a0621da1a83b409f53342fd9",
        "doc_id": "1809.03391",
        "question": "what neural network models were explored?"
    },
    {
        "question_id": "732bd97ae34541f215c436e2a1b98db1649cba27",
        "doc_id": "1809.03391",
        "question": "what rule based models were evaluated?"
    },
    {
        "question_id": "183b385fb59ff1e3f658d4555a08b67c005a8734",
        "doc_id": "1809.03391",
        "question": "what datasets have been used for this task?"
    },
    {
        "question_id": "1a0794ebbc9ee61bbb7ef2422d576a10576d9d96",
        "doc_id": "1909.11232",
        "question": "What is the sign language recognition task investigated?"
    },
    {
        "question_id": "256dfa501a71d7784520a527f43aec0549b1afea",
        "doc_id": "1909.11232",
        "question": "What is the performance of the best model in the sign language recognition task?"
    },
    {
        "question_id": "f85520bbc594918968d7d9f33d11639055458344",
        "doc_id": "1909.11232",
        "question": "What are the deep learning architectures used?"
    },
    {
        "question_id": "3588988f2230f3329d7523fbb881b20bf177280d",
        "doc_id": "1811.00051",
        "question": "Which ontologies did they use?"
    },
    {
        "question_id": "ed44f7e698d6124cb86791841d02fc6f8b4d862a",
        "doc_id": "1908.11049",
        "question": "What is their definition of hate speech?"
    },
    {
        "question_id": "d9e7633004ed1bc1ee45be58409bcc1fa6db59b2",
        "doc_id": "1908.11049",
        "question": "What languages does the new dataset contain?"
    },
    {
        "question_id": "c58ef13abe5fa91a761362ca962d7290312c74e4",
        "doc_id": "1908.11049",
        "question": "What aspects are considered?"
    },
    {
        "question_id": "9ef0d2365bde0d18054511fbb53cec5fa2cda5ee",
        "doc_id": "1908.11049",
        "question": "How big is their dataset?"
    },
    {
        "question_id": "9d80ad8cf4d5941a32d33273dc5678195ad1e0d2",
        "doc_id": "1809.00129",
        "question": "Do they evaluate whether local or global context proves more important?"
    },
    {
        "question_id": "bd817a520a62ddd77e65e74e5a7e9006cdfb19b3",
        "doc_id": "1809.00129",
        "question": "How many layers of recurrent neural networks do they use for encoding the global context?"
    },
    {
        "question_id": "c635295c2b77aaab28faecca3b5767b0c4ab3728",
        "doc_id": "1809.00129",
        "question": "How did their model rank in three CMU WMT2018 tracks it didn't rank first?"
    },
    {
        "question_id": "fc62549a8f0922c09996a119b2b6a8b5e829e989",
        "doc_id": "1701.03578",
        "question": "Which metrics are used for quantitative analysis?"
    },
    {
        "question_id": "e2a507749a4a3201edd6413c77ad0d4c23e9c6ce",
        "doc_id": "1701.03578",
        "question": "Is their data open sourced?"
    },
    {
        "question_id": "a3a867f7b3557c168d05c517c468ff6c7337bff9",
        "doc_id": "1701.03578",
        "question": "What dataset did they use?"
    },
    {
        "question_id": "8bb2280483af8013a32e0d294e97d44444f08ab0",
        "doc_id": "1701.03578",
        "question": "What metric did they use for qualitative evaluation?"
    },
    {
        "question_id": "a68acd8364764d5601dc12e4b31d9102fb7d5f7e",
        "doc_id": "1701.03578",
        "question": "What metric did they use for quantitative evaluation?"
    },
    {
        "question_id": "6d55e377335815b7ad134d1a2977d231ad34a25b",
        "doc_id": "1701.03578",
        "question": "Which similarity metrics are used for quantitative analysis?"
    },
    {
        "question_id": "e86b9633dc691976dd00ed57d1675e1460f7167b",
        "doc_id": "1910.12477",
        "question": "What is the state-of-the-art model in this task?"
    },
    {
        "question_id": "b0edb9023f35a5a02eb8fb968e880e36233e66b3",
        "doc_id": "1910.12477",
        "question": "How does this result compare to other methods KB QA in CCKS2019?"
    },
    {
        "question_id": "50bcbb730aa74637503c227f022a10f57d43f1f7",
        "doc_id": "1703.05320",
        "question": "what is the baseline model"
    },
    {
        "question_id": "fac273ecb3e72f2dc94cdbc797582d7225a8e070",
        "doc_id": "1703.05320",
        "question": "What contributes to improving the accuracy on the legal question answering task?"
    },
    {
        "question_id": "361f330d3232681f1a13c6d59abb6c18246e7b35",
        "doc_id": "1909.00369",
        "question": "Do they use multitask learning?"
    },
    {
        "question_id": "f7d61648ae4bd46c603a271185c3adfac5fc5114",
        "doc_id": "1909.00369",
        "question": "Is Chinese a pro-drop language?"
    },
    {
        "question_id": "c9a323c152c5d9bc2d244e0ed10afbdb0f93062a",
        "doc_id": "1909.00369",
        "question": "Is English a pro-drop language?"
    },
    {
        "question_id": "9b868c7d17852f46a8fe725f24cb9548fdbd2b05",
        "doc_id": "1903.02930",
        "question": "what dataset was used for training?"
    },
    {
        "question_id": "243cf21c4e34c4b91fcc4905aa4dc15a72087f0c",
        "doc_id": "1903.02930",
        "question": "what is the size of the training data?"
    },
    {
        "question_id": "488e3c4fd1103c46e12815d1bf414a0356fb0d0e",
        "doc_id": "1903.02930",
        "question": "what features were derived from the videos?"
    },
    {
        "question_id": "5f7f4a1d4380c118a58ed506c057d3b7aa234c1e",
        "doc_id": "1906.04287",
        "question": "How much data do they use to train the embeddings?"
    },
    {
        "question_id": "a79a23573d74ec62cbed5d5457a51419a66f6296",
        "doc_id": "1906.04287",
        "question": "Do they evaluate their embeddings in any downstream task apart from word similarity and word analogy?"
    },
    {
        "question_id": "d427e9d181434078c78b7ee33a26b269f160f6d2",
        "doc_id": "1906.04287",
        "question": "What dialects of Chinese are explored?"
    },
    {
        "question_id": "e12166fa9d6f63c4e92252c95c6a7bc96977ebf4",
        "doc_id": "1901.10619",
        "question": "Is this an English language corpus?"
    },
    {
        "question_id": "d4cb704e93086a2246a8caa5c1035e8297b8f4c0",
        "doc_id": "1901.10619",
        "question": "The authors point out a relevant constraint on the previous corpora of workplace, do the authors mention any relevant constraints on this corpus?"
    },
    {
        "question_id": "a11b5eb928a6db9a0e3bb290ace468ff1685d253",
        "doc_id": "1901.10619",
        "question": "What type of annotation is performed?"
    },
    {
        "question_id": "275b2c22b6a733d2840324d61b5b101f2bbc5653",
        "doc_id": "1901.10619",
        "question": "How are the tweets selected?"
    },
    {
        "question_id": "2288f567d2f5cfbfc5097d8eddf9abd238ffbe25",
        "doc_id": "1710.11334",
        "question": "Do they manage to consistently outperform the best performing methods?"
    },
    {
        "question_id": "caebea05935cae1f5d88749a2fc748e62976eab7",
        "doc_id": "1710.11334",
        "question": "Do they try to use other models aside from Maximum Entropy?"
    },
    {
        "question_id": "e381f1811774806be109f9b05896a2a3c5e1ef43",
        "doc_id": "1710.11334",
        "question": "What methods do they compare to?"
    },
    {
        "question_id": "9eec16e560f9ccafd7ba6f1e0db742b330b42ba9",
        "doc_id": "1710.11334",
        "question": "Which dataset do they train and evaluate on?"
    },
    {
        "question_id": "d788076c0d19781ff3f6525bd9c05b0ef0ecd0f1",
        "doc_id": "1710.11334",
        "question": "Do they attempt to jointly learn connectives, arguments, senses and non-explicit identifiers end-to-end?"
    },
    {
        "question_id": "cbb3c1c1e6e1818b6480f929f1c299eaa5ffd07a",
        "doc_id": "1907.10676",
        "question": "What are the opportunities presented by the use of Semantic Web technologies in Machine Translation?"
    },
    {
        "question_id": "9f74f3991b8681619d95ab93a7c8733a843ddffe",
        "doc_id": "1907.10676",
        "question": "What are the challenges associated with the use of Semantic Web technologies in Machine Translation?"
    },
    {
        "question_id": "7c2c15ea3f1b1375b8aaef1103a001069d9915bb",
        "doc_id": "1907.10676",
        "question": "What are the other obstacles to automatic translations which are not mentioned in the abstract?"
    },
    {
        "question_id": "475e698a801be0ad9e4f74756d1fff4fe0728009",
        "doc_id": "1805.11598",
        "question": "what resources are combined to build the labeler?"
    },
    {
        "question_id": "8246d1eee1482555d075127ac84f2e1d0781a446",
        "doc_id": "1805.11598",
        "question": "what datasets were used?"
    },
    {
        "question_id": "1ec0be667a6594eb2e07c50258b120e693e040a8",
        "doc_id": "1805.11598",
        "question": "what is the monolingual baseline?"
    },
    {
        "question_id": "e3bafa432cd3e1225170ff04de2fdf1ede38c6ef",
        "doc_id": "1805.11598",
        "question": "what languages are explored in this paper?"
    },
    {
        "question_id": "075d6ab5dd132666e85d0b6ad238118271dfc147",
        "doc_id": "1909.00786",
        "question": "How big is benefit in experiments of this editing approach compared to generating entire SQL from scratch?"
    },
    {
        "question_id": "f2b1e87f61c65aaa99bcf9825de11ae237260270",
        "doc_id": "1909.00786",
        "question": "What are state-of-the-art baselines?"
    },
    {
        "question_id": "80d6b9123a10358f57f259b8996a792cac08cb88",
        "doc_id": "1809.06083",
        "question": "Did they build a dataset?"
    },
    {
        "question_id": "5181aefb8a7272b4c83a1f7cb61f864ead6a1f1f",
        "doc_id": "1809.06083",
        "question": "Do they compare to other methods?"
    },
    {
        "question_id": "f010f9aa4ba1b4360a78c00aa0747d7730a61805",
        "doc_id": "1809.06083",
        "question": "How large is the dataset?"
    },
    {
        "question_id": "16b816925567deb734049416c149747118e13963",
        "doc_id": "2001.11316",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "9b536f4428206ef7afabc4ff0a2ebcbabd68b985",
        "doc_id": "2001.11316",
        "question": "How are adversarial examples generated?"
    },
    {
        "question_id": "9d04fc997689f44e5c9a551b8571a60b621d35c2",
        "doc_id": "2001.11316",
        "question": "Is BAT smaller (in number of parameters) than post-trained BERT?"
    },
    {
        "question_id": "8a0e1a298716698a305153c524bf03d18969b1c6",
        "doc_id": "2001.11316",
        "question": "What are the modifications made to post-trained BERT?"
    },
    {
        "question_id": "538430077b1820011c609c8ae147389b960932c8",
        "doc_id": "2001.11316",
        "question": "What aspects are considered?"
    },
    {
        "question_id": "de344aeb089affebd15a8c370ae9ab5734e99203",
        "doc_id": "1803.06745",
        "question": "which social media platforms was the data collected from?"
    },
    {
        "question_id": "84327a0a9321bf266e22d155dfa94828784595ce",
        "doc_id": "1803.06745",
        "question": "how many data pairs were there for each dataset?"
    },
    {
        "question_id": "c2037887945abbdf959389dc839a86bc82594505",
        "doc_id": "1803.06745",
        "question": "how many systems were there?"
    },
    {
        "question_id": "e9a0a69eacd554141f56b60ab2d1912cc33f526a",
        "doc_id": "1803.06745",
        "question": "what was the baseline?"
    },
    {
        "question_id": "5b2839bef513e5d441f0bb8352807f673f4b2070",
        "doc_id": "1803.06745",
        "question": "what metrics did they use for evaluation?"
    },
    {
        "question_id": "2abf916bc03222d3b2a3d66851d87921ff35c0d2",
        "doc_id": "1803.06745",
        "question": "what datasets did they use?"
    },
    {
        "question_id": "0b5a7ccf09810ff5a86162d502697d16b3536249",
        "doc_id": "1911.03353",
        "question": "What simplification of the architecture is performed that resulted in same performance?"
    },
    {
        "question_id": "8f00859f74fc77832fa7d38c22f23f74ba13a07e",
        "doc_id": "1911.03353",
        "question": "How much better is performance of SEPT compared to previous state-of-the-art?"
    },
    {
        "question_id": "6167618e0c53964f3a706758bdf5e807bc5d7760",
        "doc_id": "1909.01860",
        "question": "What are remaining challenges in VQA?"
    },
    {
        "question_id": "78a0c25b83cdeaeaf0a4781f502105a514b2af0e",
        "doc_id": "1909.01860",
        "question": "How quickly is this hybrid model trained?  "
    },
    {
        "question_id": "08202b800a946b8283c2684e23b51c0ec1e8b2ac",
        "doc_id": "1909.01860",
        "question": "What are the new deep learning models discussed in the paper?  "
    },
    {
        "question_id": "00aea97f69290b496ed11eb45a201ad28d741460",
        "doc_id": "1909.01860",
        "question": "What was the architecture of the 2017 Challenge Winner model?"
    },
    {
        "question_id": "4e1293592e41646a6f5f0cb00c75ee8de14eb668",
        "doc_id": "1909.01860",
        "question": "What is an example of a common sense question?"
    },
    {
        "question_id": "a82a12a22a45d9507bc359635ffe9574f15e0810",
        "doc_id": "1702.02584",
        "question": "What linguistic model does the conventional method use?"
    },
    {
        "question_id": "355cf303ba61f84b580e2016fcb24e438abeafa7",
        "doc_id": "1702.02584",
        "question": "What is novel about the newly emerging CNN method, in comparison to well-established conventional method?"
    },
    {
        "question_id": "88757bc49ccab76e587fba7521f0981d6a1af2f7",
        "doc_id": "1702.02584",
        "question": "What lexical cues are used for humor recognition?"
    },
    {
        "question_id": "2f9a31f5a2b668acf3bce8958f5daa67ab8b2c83",
        "doc_id": "1702.02584",
        "question": "Do they evaluate only on English data?"
    },
    {
        "question_id": "4830459e3d1d204e431025ce7e596ef3f8d757d2",
        "doc_id": "1702.02584",
        "question": "How many speakers are included in the dataset?"
    },
    {
        "question_id": "74ebfba06f37cc95dfe59c3790ebe6165e6be19c",
        "doc_id": "1702.02584",
        "question": "How are the positive instances annotated? e.g. by annotators, or by laughter from the audience?"
    },
    {
        "question_id": "bab4ae97afd598a11d1fc7c05c6fdb98c30cafe0",
        "doc_id": "1711.03759",
        "question": "how many sentences did they annotate?"
    },
    {
        "question_id": "f5913e37039b9517a323ec700b712e898316161b",
        "doc_id": "1711.03759",
        "question": "what dataset was used in their experiment?"
    },
    {
        "question_id": "a064d01d45a33814947161ff208abb88d4353b26",
        "doc_id": "1711.03759",
        "question": "what are the existing annotation tools?"
    },
    {
        "question_id": "d976c22e9d068e4e31fb46e929023459f8290a63",
        "doc_id": "1903.10548",
        "question": "Are the two paragraphs encoded independently?"
    },
    {
        "question_id": "deed225dfa94120fafcc522d4bfd9ea57085ef8d",
        "doc_id": "1605.05195",
        "question": "Do the authors mention any possible confounds in this study?"
    },
    {
        "question_id": "3df6d18d7b25d1c814e9dcc8ba78b3cdfe15edcd",
        "doc_id": "1605.05195",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "9aabcba3d44ee7d0bbf6a2c019ab9e0f02fab244",
        "doc_id": "1605.05195",
        "question": "Are there any other standard linguistic features used, other than ngrams?"
    },
    {
        "question_id": "242c626e89bca648b65af135caaa7ceae74e9720",
        "doc_id": "1605.05195",
        "question": "What is the relationship between author and emotional valence?"
    },
    {
        "question_id": "bba677d1a1fe38a41f61274648b386bdb44f1851",
        "doc_id": "1605.05195",
        "question": "What is the relationship between time and emotional valence?"
    },
    {
        "question_id": "b6c2a391c4a94eaa768150f151040bb67872c0bf",
        "doc_id": "1605.05195",
        "question": "What is the relationship between location and emotional valence?"
    },
    {
        "question_id": "ff36168caf48161db7039e3bd4732cef31d4de99",
        "doc_id": "1909.11706",
        "question": "which had better results, the svm or the random forest model?"
    },
    {
        "question_id": "556782bb96f8fc07d14865f122362ebcc79134ec",
        "doc_id": "1909.11706",
        "question": "which network community detection dataset was used?"
    },
    {
        "question_id": "cb58605a7c230043bd0d6e8d5b068f8b533f45fe",
        "doc_id": "1909.11706",
        "question": "did they collect the human labeled data?"
    },
    {
        "question_id": "7969b8d80e12aa3ebb89b5622bc564f44e98329f",
        "doc_id": "1909.11706",
        "question": "how many classes are they classifying?"
    },
    {
        "question_id": "6d6a9b855ec70f170b854baab6d8f7e94d3b5614",
        "doc_id": "1905.07894",
        "question": "What is the proposed algorithm or model architecture?"
    },
    {
        "question_id": "870358f28a520cb4f01e7f5f780d599dfec510b4",
        "doc_id": "1905.07894",
        "question": "Do they attain state-of-the-art performance?"
    },
    {
        "question_id": "98aa86ee948096d6fe16c02c1e49920da00e32d4",
        "doc_id": "1905.07894",
        "question": "What fusion methods are applied?"
    },
    {
        "question_id": "c463136ba9a312a096034c872b5c74b9d58cef95",
        "doc_id": "1905.07894",
        "question": "What graph-based features are considered?"
    },
    {
        "question_id": "a4cf0cf372f62b2dbc7f31c600c6c66246263328",
        "doc_id": "1912.11980",
        "question": "Which baselines do they compare to?"
    },
    {
        "question_id": "f7b91b99279833f9f489635eb8f77c6d13136098",
        "doc_id": "1912.11980",
        "question": "Which sentence compression technique works best?"
    },
    {
        "question_id": "99e514acc0109b7efa4e3860ce1e8c455f5bb790",
        "doc_id": "1912.11980",
        "question": "Do they compare performance against state of the art systems?"
    },
    {
        "question_id": "3499d5feeb3a45411d8e893516adbdc14e72002a",
        "doc_id": "1901.01590",
        "question": "What is reordering in the context of the paper?"
    },
    {
        "question_id": "d0048ef1cba3f63b5d60c568d5d0ba62ac4d7e75",
        "doc_id": "1901.01590",
        "question": "How does the paper use language model for context aware search?"
    },
    {
        "question_id": "74a17eb3bf1d4f36e2db1459a342c529b9785f6e",
        "doc_id": "1805.09960",
        "question": "what were the evaluation metrics?"
    },
    {
        "question_id": "4b6745982aa64fbafe09f7c88c8d54d520b3f687",
        "doc_id": "1805.09960",
        "question": "what language pairs are explored?"
    },
    {
        "question_id": "6656a9472499331f4eda45182ea697a4d63e943c",
        "doc_id": "1805.09960",
        "question": "what datasets did they use?"
    },
    {
        "question_id": "430ad71a0fd715a038f3c0fe8d7510e9730fba23",
        "doc_id": "1805.09960",
        "question": "which attention based nmt method did they compare with?"
    },
    {
        "question_id": "b79ff0a50bf9f361c5e5fed68525283856662076",
        "doc_id": "1805.09960",
        "question": "by how much did their system improve?"
    },
    {
        "question_id": "55bafa0f7394163f4afd1d73340aac94c2d9f36c",
        "doc_id": "1603.00957",
        "question": "Are experiments conducted on multiple datasets?"
    },
    {
        "question_id": "cbb4eba59434d596749408be5b923efda7560890",
        "doc_id": "1603.00957",
        "question": "What baselines is the neural relation extractor compared to?"
    },
    {
        "question_id": "1d9d7c96c5e826ac06741eb40e89fca6b4b022bd",
        "doc_id": "1603.00957",
        "question": "What additional evidence they use?"
    },
    {
        "question_id": "d1d37dec9053d465c8b6f0470e06316bccf344b3",
        "doc_id": "1603.00957",
        "question": "How much improvement they get from the previous state-of-the-art?"
    },
    {
        "question_id": "90eeb1b27f84c83ffcc8a88bc914a947c01a0c8b",
        "doc_id": "1603.00957",
        "question": "What is the previous state-of-the-art?"
    },
    {
        "question_id": "4e63454275380787ebd0e38aa885977332ab33af",
        "doc_id": "1708.01065",
        "question": "what evaluation metrics were used?"
    },
    {
        "question_id": "dfaeb8faf04505a4178945c933ba217e472979d8",
        "doc_id": "1708.01065",
        "question": "what is the source of their dataset?"
    },
    {
        "question_id": "342ada55bd4d7408e1fcabf1810b92d84c1dbc41",
        "doc_id": "1708.01065",
        "question": "by how much did the performance improve?"
    },
    {
        "question_id": "86d1c990c1639490c239c3dbf5492ecc44ab6652",
        "doc_id": "1708.01065",
        "question": "how many experts were there?"
    },
    {
        "question_id": "b065c2846817f3969b39e355d5d017e326d6f42e",
        "doc_id": "1708.01065",
        "question": "what is the size of the data collected?"
    },
    {
        "question_id": "9536e4a2455008007067f23cc873768374c8f664",
        "doc_id": "1708.01065",
        "question": "did they use a crowdsourcing platform?"
    },
    {
        "question_id": "cfa44bb587b0c05906d8325491ca9e0f024269e8",
        "doc_id": "1708.01065",
        "question": "how was annotation conducted?"
    },
    {
        "question_id": "b3dc9a35e8c3ed7abcc4ca0bf308dea75be9c016",
        "doc_id": "1708.01065",
        "question": "what does their dataset contain?"
    },
    {
        "question_id": "d4456e9029fcdcb6e0149dd8f57b77d16ead1bc4",
        "doc_id": "1703.10152",
        "question": "What metric is considered?"
    },
    {
        "question_id": "d0b967bfca2039c7fb05b931c8b9955f99a468dc",
        "doc_id": "1703.10152",
        "question": "What hand-crafted features are used?"
    },
    {
        "question_id": "31e6062ba45d8956791e1b86bad7efcb6d1b191a",
        "doc_id": "1703.10152",
        "question": "What word embeddings are used?"
    },
    {
        "question_id": "38b29b0dcb87868680f9934af71ef245ebb122e4",
        "doc_id": "1703.10152",
        "question": "Do they annotate their own dataset?"
    },
    {
        "question_id": "6e134d51a795c385d72f38f36bca4259522bcf51",
        "doc_id": "1703.10152",
        "question": "How are the sentence embeddings generated?"
    },
    {
        "question_id": "0778cbbd093f8b779f7cf26302b2a8e081ccfb40",
        "doc_id": "1703.10152",
        "question": "What is argumentative zoning?"
    },
    {
        "question_id": "0871827cfeceed4ee78ce7407aaf6e85dd1f9c25",
        "doc_id": "1901.09381",
        "question": "Do they evaluate their model on datasets other than RACE?"
    },
    {
        "question_id": "240058371e91c6b9509c0398cbe900855b46c328",
        "doc_id": "1901.09381",
        "question": "What is their model's performance on RACE?"
    },
    {
        "question_id": "1d7b99646a1bc05beec633d7a3beb083ad1e8734",
        "doc_id": "1803.02155",
        "question": "How is the training time compared to the original position encoding? "
    },
    {
        "question_id": "4d887ce7dc43528098e7a3d9cd13c6c36f158c53",
        "doc_id": "1803.02155",
        "question": "Does the new relative position encoder require more parameters?"
    },
    {
        "question_id": "d48b5e4a7cf1f96c5b939ba9b46350887c5e5268",
        "doc_id": "1803.02155",
        "question": "Can the new position representation be generalized to other tasks?"
    },
    {
        "question_id": "f94cea545f745994800c1fb4654d64d1384f2c26",
        "doc_id": "2003.08769",
        "question": "Is this done in the form of unsupervised (clustering) or supervised learning?"
    },
    {
        "question_id": "f3b851c9063192c86a3cc33b2328c02efa41b668",
        "doc_id": "2003.08769",
        "question": "Does this study perform experiments to prove their claim that indeed personalized profiles will have inclination towards particular cuisines?"
    },
    {
        "question_id": "0a5fd0e5f4ab12be57be20416a5ea7c3db5fb662",
        "doc_id": "1912.10162",
        "question": "What are the issues identified for out-of-vocabulary words?"
    },
    {
        "question_id": "5d03a82a70f7b1ab9829891403ec31607828cbd5",
        "doc_id": "1912.10162",
        "question": "Is the morphology detection task evaluated?"
    },
    {
        "question_id": "6cad6f074b0486210ffa4982c8d1632f5aa91d91",
        "doc_id": "1912.10162",
        "question": "How does the model proposed extend ENAMEX?"
    },
    {
        "question_id": "d38b3e0896b105d171e69ce34c689e4a7e934522",
        "doc_id": "1912.10162",
        "question": "Which morphological features are extracted?"
    },
    {
        "question_id": "e4f2d59030b17867449cf5456118ab722296bebd",
        "doc_id": "1808.09180",
        "question": "Who made the stated claim (that \"this is because character-level models learn morphology\")?"
    },
    {
        "question_id": "e664b58ea034a638e7142f8a393a88aadd1e215e",
        "doc_id": "1808.09180",
        "question": "Which languages do they use?"
    },
    {
        "question_id": "c4b621f573bbb411bdaa84a7562c9c4795a7eb3a",
        "doc_id": "1808.09180",
        "question": "Do the character-level models perform better than models with access to morphological analyses only?"
    },
    {
        "question_id": "3ccc4ccebc3b0de5546b1208e8094a839fd4a4ab",
        "doc_id": "1808.09180",
        "question": "What is case syncretism?"
    },
    {
        "question_id": "f741d32b92630328df30f674af16fbbefcad3f93",
        "doc_id": "1805.09821",
        "question": "Which baselines were they used for evaluation?"
    },
    {
        "question_id": "fe7f7bcf37ca964b4dc9e9c7ebf35286e1ee042b",
        "doc_id": "1805.09821",
        "question": "What is the difference in size compared to the previous model?"
    },
    {
        "question_id": "b2dc0c813da92cf13d86528bd32c12286ec9b9cd",
        "doc_id": "1908.10461",
        "question": "How many lexical features are considered?"
    },
    {
        "question_id": "c4c06f36454fbfdc5d218fb84ce74eaf7f78fc98",
        "doc_id": "1908.10461",
        "question": "What is the performance for the three languages tested?"
    },
    {
        "question_id": "347dc2fd6427b39cf2358d43864750044437dff8",
        "doc_id": "1908.10461",
        "question": "How many Universal Dependency features are considered?"
    },
    {
        "question_id": "6911e8724dfdb178fa81bf58019947b71ef8fbe7",
        "doc_id": "1908.10461",
        "question": "Do they evaluate any non-zero-shot parsers on the three languages?"
    },
    {
        "question_id": "b012df09fa2a3d6b581032d68991768cf4bc9d7b",
        "doc_id": "1908.10461",
        "question": "How big is the Parallel Meaning Bank?"
    },
    {
        "question_id": "62edffd051d056cf60e17deafcc55a8c9af398cb",
        "doc_id": "1908.10461",
        "question": "What is the source of the crosslingual word embeddings?"
    },
    {
        "question_id": "6ec267f66a1c5f996519aed8aa0befb5e5aec205",
        "doc_id": "2002.04374",
        "question": "Is dataset balanced in terms of available data per language?"
    },
    {
        "question_id": "f9ae1b31c1a60aacb9ef869e1cc6b0e70c6e5d8e",
        "doc_id": "2002.04374",
        "question": "What datasets are used?"
    },
    {
        "question_id": "5e29f16d7302f24ab93b7707d115f4265a0d14b0",
        "doc_id": "1808.10290",
        "question": "How much additional data do they manage to generate from translations?"
    },
    {
        "question_id": "26844cec57df6ff0f02245ea862af316b89edffe",
        "doc_id": "1808.10290",
        "question": "Do they train discourse relation models with augmented data?"
    },
    {
        "question_id": "d1d59bca40b8b308c0a35fed1b4b7826c85bc9f8",
        "doc_id": "1808.10290",
        "question": "How many languages do they at most attempt to use to generate discourse relation labelled data?"
    },
    {
        "question_id": "89373db8ced1fe420eae0093b2736f06b565616e",
        "doc_id": "1901.02780",
        "question": "Did the authors collect new data for evaluation?"
    },
    {
        "question_id": "c8cf20afd75eb583aef70fcb508c4f7e37f234e1",
        "doc_id": "1809.02208",
        "question": "Do the authors examine the real-world distribution of female workers in the country/countries where the gender neutral languages are spoken?"
    },
    {
        "question_id": "3567241b3fafef281d213f49f241071f1c60a303",
        "doc_id": "1809.02208",
        "question": "Which of the 12 languages showed the strongest tendency towards male defaults?"
    },
    {
        "question_id": "d5d48b812576470edbf978fc18c00bd24930a7b7",
        "doc_id": "1809.02208",
        "question": "How many different sentence constructions are translated in gender neutral languages?"
    },
    {
        "question_id": "2e70d25f14357ad74c085a9454a2ce33bb988a6f",
        "doc_id": "1912.06905",
        "question": "What are their results on this task?"
    },
    {
        "question_id": "de84972c5d1bbf664d0f8b702fce5f161449ec23",
        "doc_id": "1912.06905",
        "question": "How is the text segmented?"
    },
    {
        "question_id": "74396ead9f88a9efc7626240ce128582ab69ef2b",
        "doc_id": "1806.03369",
        "question": "by how much did their approach outperform previous work?"
    },
    {
        "question_id": "8a7a9d205014c42cb0e24a0f3f38de2176fe74c0",
        "doc_id": "1806.03369",
        "question": "what was the previous best results model?"
    },
    {
        "question_id": "eaed0b721cc3137b964f5265c7ecf76f565053e9",
        "doc_id": "1806.03369",
        "question": "what are the baseline models?"
    },
    {
        "question_id": "ba7fea78b0b888a714cb7d89944b69c5038a1ef1",
        "doc_id": "1806.03369",
        "question": "what domains are explored?"
    },
    {
        "question_id": "38af3f25c36c3725a31304ab96e2c200c55792b4",
        "doc_id": "1806.03369",
        "question": "what training data was used?"
    },
    {
        "question_id": "a9cae57f494deb0245b40217d699e9a22db0ea6e",
        "doc_id": "1611.02378",
        "question": "How many TV series are considered?"
    },
    {
        "question_id": "0a736e0e3305a50d771dfc059c7d94b8bd27032e",
        "doc_id": "1611.02378",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "283d358606341c399e369f2ba7952cd955326f73",
        "doc_id": "1611.02378",
        "question": "Is manual annotation performed?"
    },
    {
        "question_id": "818c85ee26f10622c42ae7bcd0dfbdf84df3a5e0",
        "doc_id": "1611.02378",
        "question": "What are the eight predefined categories?"
    },
    {
        "question_id": "044cb5ef850c0a2073682bb31d919d504667f907",
        "doc_id": "1911.05652",
        "question": "What IS versification?"
    },
    {
        "question_id": "c845110efee2f633d47f5682573bc6091e8f5023",
        "doc_id": "1911.05652",
        "question": "How confident is the conclusion about Shakespeare vs Fletcher?"
    },
    {
        "question_id": "2301424672cb79297cf7ad95f23b58515e4acce8",
        "doc_id": "1911.05652",
        "question": "Is Henry VIII reflective of Shakespeare in general?"
    },
    {
        "question_id": "6c05376cd0f011e00d1ada0254f6db808f33c3b7",
        "doc_id": "1911.05652",
        "question": "Is vocabulary or versification more important for the analysis?"
    },
    {
        "question_id": "9925e7d8757e8fd7411bcb5250bc08158a244fb3",
        "doc_id": "1911.05652",
        "question": "What are the modifications by Thomas Merriam?"
    },
    {
        "question_id": "fa468c31dd0f9095d7cec010f2262eeed565a7d2",
        "doc_id": "1911.05652",
        "question": "What are stop words in Shakespeare?"
    },
    {
        "question_id": "74cc0300e22f60232812019011a09df92bbec803",
        "doc_id": "1709.05295",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "865811dcf63a1dd3f22c62ec39ffbca4b182de31",
        "doc_id": "1709.05295",
        "question": "What are the most discriminating patterns which are analyzed?"
    },
    {
        "question_id": "9e378361b6462034aaf752adf04595ef56370b86",
        "doc_id": "1709.05295",
        "question": "What bootstrapping methodology was used to find new patterns?"
    },
    {
        "question_id": "667dce60255d8ab959869eaf8671312df8c0004b",
        "doc_id": "1709.05295",
        "question": "What patterns were extracted which were correlated with emotional arguments?"
    },
    {
        "question_id": "d5e716c1386b6485e63075e980f80d44564d0aa2",
        "doc_id": "1709.05295",
        "question": "What patterns were extracted which were correlated with factual arguments?"
    },
    {
        "question_id": "1fd31fdfff93d65f36e93f6919f6976f5f172197",
        "doc_id": "1709.05295",
        "question": "How were the factual and feeling forum posts annotated?"
    },
    {
        "question_id": "38289bd9592db4d3670b65a0fef1fe8a309fee61",
        "doc_id": "1603.08868",
        "question": "what classifiers did they train?"
    },
    {
        "question_id": "cb7a00233502c4b7801d34bc95d6d22d79776ae8",
        "doc_id": "1603.08868",
        "question": "what dataset did they use?"
    },
    {
        "question_id": "35d2eae3a7c9bed54196334a09344591f9cbb5c8",
        "doc_id": "1603.08868",
        "question": "what combination of features helped improve the classification?"
    },
    {
        "question_id": "a70656fc61bf526dd21db7d2ec697b29a5a9c24e",
        "doc_id": "1603.08868",
        "question": "what linguistics features did they apply?"
    },
    {
        "question_id": "f381b0ef693243d67657f6c34bbce015f6b1fd07",
        "doc_id": "1603.08868",
        "question": "what is the state of the art in English?"
    },
    {
        "question_id": "c7f43c95db3d0c870407cd0e7becdd802463683b",
        "doc_id": "1910.09362",
        "question": "Do they use skip-gram word2vec?"
    },
    {
        "question_id": "4e2b12cfc530a4682b06f8f5243bc9f64bd41135",
        "doc_id": "1910.09362",
        "question": "How is quality of the word vectors measured?"
    },
    {
        "question_id": "8126c6b8a0cab3e22661d3d71d96aa57360da65c",
        "doc_id": "1905.10039",
        "question": "what evaluation metrics were used?"
    },
    {
        "question_id": "2f01d3e5120d1fef4b01028536cb5fe0abad1968",
        "doc_id": "1905.10039",
        "question": "what state of the art models did they compare with?"
    },
    {
        "question_id": "3b4077776f4e828f0d1687d0ce8018c9bce4fdc6",
        "doc_id": "1904.02306",
        "question": "what previous work do they also look at?"
    },
    {
        "question_id": "d1a88fe6655c742421da93cf88b5c541c09866d6",
        "doc_id": "1904.02306",
        "question": "what languages did they experiment with?"
    },
    {
        "question_id": "c138a45301713c1a9f6edafeef338ba2f99220ce",
        "doc_id": "1912.08084",
        "question": "What is the size of the dataset?"
    },
    {
        "question_id": "56d788af4694c1cd1eebee0b83c585836d1f5f99",
        "doc_id": "1912.08084",
        "question": "What models are trained?"
    },
    {
        "question_id": "34b434825f0ca3225dc8914f9da865d2b4674f08",
        "doc_id": "1912.08084",
        "question": "Does the baseline use any contextual information?"
    },
    {
        "question_id": "61a2599acfbd3d75de58e97ecdba2d9cf0978324",
        "doc_id": "1912.08084",
        "question": "What is the strong rivaling system?"
    },
    {
        "question_id": "cf58d25bfa2561a359fdd7b6b20aef0b41dc634e",
        "doc_id": "1912.08084",
        "question": "Where are the debates from?"
    },
    {
        "question_id": "d206f2cbcc3d2a6bd0ccaa3b57fece396159f609",
        "doc_id": "1805.00195",
        "question": "are the protocols manually annotated?"
    },
    {
        "question_id": "633e2210c740b4558b1eea3f041b3ae8e0813293",
        "doc_id": "1805.00195",
        "question": "what ML approaches did they experiment with?"
    },
    {
        "question_id": "1702985a3528e876bb19b8e223399729d778b4e4",
        "doc_id": "2003.12450",
        "question": "How many annotators were used for sentiment labeling?"
    },
    {
        "question_id": "f44a9ed166a655df1d54683c91935ab5e566a04f",
        "doc_id": "2003.12450",
        "question": "How is data collected?"
    },
    {
        "question_id": "0ba1514fb193c52a15c31ffdcd5c3ddbb2bb2c40",
        "doc_id": "2003.12450",
        "question": "How much better is performance of Nigerian Pidgin English sentiment classification of models that use additional Nigerian English data compared to original English-only models?"
    },
    {
        "question_id": "d14118b18ee94dafe170439291e20cb19ab7a43c",
        "doc_id": "2003.12450",
        "question": "What full English language based sentiment analysis models are tried?"
    },
    {
        "question_id": "f5603271a04452cbdbb07697859bef2a2030d75c",
        "doc_id": "2003.11687",
        "question": "How many domain experts were involved into creation of dataset?"
    },
    {
        "question_id": "6575ffec1844e6fde5a668bce2afb16b67b65c1f",
        "doc_id": "2003.11687",
        "question": "What metrics are used for evaluation?"
    },
    {
        "question_id": "77c3416578b52994227bae7f2529600f02183e12",
        "doc_id": "2003.11687",
        "question": "What is the performance of fine tuned model on this dataset?"
    },
    {
        "question_id": "2abcff4fdedf9b17f76875cc338ba4ab8d1eccd3",
        "doc_id": "2003.11687",
        "question": "Are constructed datasets open sourced?"
    },
    {
        "question_id": "6df57a21ca875e63fb39adece6a9ace5bb2b2cfa",
        "doc_id": "2003.11687",
        "question": "How does labeling scheme look like?"
    },
    {
        "question_id": "b39b278aa1cf2f87ad4159725dff77b387f2df84",
        "doc_id": "2003.11687",
        "question": "What pretrained language model is used?"
    },
    {
        "question_id": "814e945668e2b6f31b088918758b120fb00ada7d",
        "doc_id": "2003.11687",
        "question": "How big is constructed dataset?"
    },
    {
        "question_id": "f2bcfdbebb418e7da165c19b8c7167719432ee48",
        "doc_id": "1603.07252",
        "question": "What domain of text are they working with?"
    },
    {
        "question_id": "0fe49431db5ffaa24372919daf24d8f84117bfda",
        "doc_id": "1603.07252",
        "question": "What dataset do they use?"
    },
    {
        "question_id": "0f9c1586f1b4b531fa4fd113e767d06af90b1ae8",
        "doc_id": "1603.07252",
        "question": "Do they compare to abstractive summarization methods?"
    },
    {
        "question_id": "6b1a6517b343fdb79f246955091ff25e440b9511",
        "doc_id": "1902.04094",
        "question": "Which metrics are used for evaluating the quality?"
    },
    {
        "question_id": "d5c393df758dec6ea6827ae5b887eb6c303a4f4d",
        "doc_id": "1612.05202",
        "question": "Do they compare against manually-created lexicons?"
    },
    {
        "question_id": "11a3af3f056e0fb5559fe5cbff1640e022732735",
        "doc_id": "1612.05202",
        "question": "Do they compare to non-lexicon methods?"
    },
    {
        "question_id": "07a214748a69b31400585aef7aba6af3e3d9cce2",
        "doc_id": "1612.05202",
        "question": "What language pairs are considered?"
    },
    {
        "question_id": "ff3e93b9b5f08775ebd1a7408d7f0ed2f6942dde",
        "doc_id": "1908.05925",
        "question": "How is the quality of the translation evaluated?"
    },
    {
        "question_id": "59a3d4cdd1c3797962bf8d72c226c847e06e1d44",
        "doc_id": "1908.05925",
        "question": "What are the post-processing approaches applied to the output?"
    },
    {
        "question_id": "49474a3047fa3f35e1bcd63991e6f15e012ac10b",
        "doc_id": "1908.05925",
        "question": "Is the MUSE alignment independently evaluated?"
    },
    {
        "question_id": "63279ecb2ba4e51c1225e63b81cb021abc10d0d1",
        "doc_id": "1908.05925",
        "question": "How does byte-pair encoding work?"
    },
    {
        "question_id": "0aca0a208a1e28857fab44e397dc7880e010dbca",
        "doc_id": "2003.12139",
        "question": "Which was the most helpful strategy?"
    },
    {
        "question_id": "471683ba6251b631f38a24d42b6dba6f52dee429",
        "doc_id": "2003.12139",
        "question": "How large is their tweets dataset?"
    },
    {
        "question_id": "4d824b49728649432371ecb08f66ba44e50569e0",
        "doc_id": "1612.04118",
        "question": "by how much did the system improve?"
    },
    {
        "question_id": "02a5acb484bda77ef32a13f5d93d336472cf8cd4",
        "doc_id": "1612.04118",
        "question": "what existing databases were used?"
    },
    {
        "question_id": "863d8d32a1605402e11f0bf63968a14bcfd15337",
        "doc_id": "1612.04118",
        "question": "what existing parser is used?"
    },
    {
        "question_id": "48ff9645a506aa2c17810d2654d1f0f0d9e609ee",
        "doc_id": "1910.01108",
        "question": "What downstream tasks are analyzed?"
    },
    {
        "question_id": "84ee6180d3267115ad27852027d147fb86a33135",
        "doc_id": "1910.01108",
        "question": "How much time takes the training of DistilBERT?"
    },
    {
        "question_id": "6d17dc00f7e5331128b6b585e78cac0b9082e13d",
        "doc_id": "1911.12722",
        "question": "Was the entire annotation process done manually?"
    },
    {
        "question_id": "de0154affd86c608c457bf83d888bbd1f879df93",
        "doc_id": "1911.12722",
        "question": "What were the results of their experiment?"
    },
    {
        "question_id": "9887ca3d25e2109f41d1da80eeea05c465053fbc",
        "doc_id": "1911.12722",
        "question": "How big is the dataset?"
    },
    {
        "question_id": "87b65b538d79e1218fa19aaac71e32e9b49208df",
        "doc_id": "1911.12722",
        "question": "What are all the domains the corpus came from?"
    },
    {
        "question_id": "65e30c842e4c140a6cb8b2f9498fcc6223ed49c0",
        "doc_id": "1803.04579",
        "question": "what pruning did they perform?"
    },
    {
        "question_id": "22375aac4cbafd252436b756bdf492a05f97eed8",
        "doc_id": "1708.07252",
        "question": "What languages are used for the experiments?"
    },
    {
        "question_id": "d2f91303cec132750a416192f67c8ac1d3cf6fc0",
        "doc_id": "1708.07252",
        "question": "What is the caching mechanism?"
    },
    {
        "question_id": "9f065e787a0d40bb4550be1e0d64796925459005",
        "doc_id": "1708.07252",
        "question": "What language model architectures are examined?"
    },
    {
        "question_id": "e6f5444b7c08d79d4349e35d5298a63bb30e7004",
        "doc_id": "1708.07252",
        "question": "What directions are suggested to improve language models?"
    },
    {
        "question_id": "6371c6863fe9a14bf67560e754ce531d70de10ab",
        "doc_id": "2002.04326",
        "question": "How big is this dataset?"
    },
    {
        "question_id": "28a8a1542b45f67674a2f1d54fff7a1e45bfad66",
        "doc_id": "2002.04326",
        "question": "How are biases identified in the dataset?"
    },
    {
        "question_id": "a4b77a20e067789691e0ab246bc5b11913d77ae1",
        "doc_id": "1703.04009",
        "question": "What is their definition of hate speech?"
    },
    {
        "question_id": "ba39317e918b4386765f88e8c8ae99f9a098c935",
        "doc_id": "1703.04009",
        "question": "What type of model do they train?"
    },
    {
        "question_id": "22c125c461f565f5437dac74bf19c2ef317bad86",
        "doc_id": "1703.04009",
        "question": "How many users does their dataset have?"
    },
    {
        "question_id": "4a91432abe3f54fcbdd00bb85dc0df95b16edf42",
        "doc_id": "1703.04009",
        "question": "How long is their dataset?"
    },
    {
        "question_id": "c176eb1ccaa0e50fb7512153f0716e60bf74aa53",
        "doc_id": "1910.01340",
        "question": "Are results reported only on English data?"
    },
    {
        "question_id": "e0b54906184a4ad87d127bed22194e62de38222b",
        "doc_id": "1910.01340",
        "question": "What type of model were the features used in?"
    },
    {
        "question_id": "1f8044487af39244d723582b8a68f94750eed2cc",
        "doc_id": "1910.01340",
        "question": "What unsupervised approach was used to deduce the thematic information?"
    },
    {
        "question_id": "595fe416a100bc7247444f25b11baca6e08d9291",
        "doc_id": "1910.01340",
        "question": "What profile features are used?"
    },
    {
        "question_id": "1f011fa772ce802e74eda89f706cdb1aa2833686",
        "doc_id": "1910.01340",
        "question": "What textual features are used?"
    },
    {
        "question_id": "578add9d3dadf86cd0876d42b03bf0114f83d0e7",
        "doc_id": "1907.04072",
        "question": "How did they obtain the tweets?"
    },
    {
        "question_id": "4d5b74499804ea5bc5520beb88d0f9816f67205a",
        "doc_id": "1907.04072",
        "question": "What baseline do they compare to?"
    },
    {
        "question_id": "baec99756b80eec7c0234a08bc2855e6770bcaeb",
        "doc_id": "1907.04072",
        "question": "What language is explored in this paper?"
    },
    {
        "question_id": "46d051b8924ad0ef8cfba9c7b5b84707ee72f26a",
        "doc_id": "1907.04072",
        "question": "What blackmarket services do they look at?"
    },
    {
        "question_id": "c79f168503a60d1b08bb2c9aac124199d210b06d",
        "doc_id": "1904.02954",
        "question": "Which downstream tasks are used for evaluation in this paper?"
    },
    {
        "question_id": "9dd8ce48a2a59a63ae6366ab8b2b8828e5ae7f35",
        "doc_id": "1904.02954",
        "question": "Which datasets are used for evaluation?"
    },
    {
        "question_id": "6e3e9818551fc2f8450bbf09b0fe82ac2506bc7a",
        "doc_id": "1906.06349",
        "question": "How do they prove that RNNs with arbitrary precision are as powerful as a pushdown automata?"
    },
    {
        "question_id": "0b5a505c1fca92258b9e83f53bb8cfeb81cb655a",
        "doc_id": "1906.06349",
        "question": "What are edge weights?"
    },
    {
        "question_id": "8908d1b865137bc309dde10a93735ec76037e5f9",
        "doc_id": "1705.02023",
        "question": "what were the evaluation metrics?"
    },
    {
        "question_id": "d207f78beb6cd754268881bf575c8f98000667ea",
        "doc_id": "1705.02023",
        "question": "how many sentiment labels do they explore?"
    },
    {
        "question_id": "2686e8d51caff9a19684e0c9984bcb5a1937d08d",
        "doc_id": "1606.02892",
        "question": "What morphological features are considered?"
    },
    {
        "question_id": "df623717255ea2c9e0f846859d8a9ef51dc1102b",
        "doc_id": "1606.02892",
        "question": "What type of attention do they use in the decoder?"
    },
    {
        "question_id": "5d0a3f8ca3882f87773cf8c2ef1b4f72b9cc241e",
        "doc_id": "1808.10006",
        "question": "How is a per-word reward tuned with the perceptron algorithm?"
    },
    {
        "question_id": "dce27c49b9bf1919ca545e04663507d83bb42dbe",
        "doc_id": "1808.10006",
        "question": "What methods are used to correct the brevity problem?"
    },
    {
        "question_id": "991ea04072b3412928be5e6e903cfa54eeac3951",
        "doc_id": "1808.10006",
        "question": "Why does wider beam search hurt NMT?"
    },
    {
        "question_id": "00f507053c47e55d7e72bebdbd8a75b3ca88cf85",
        "doc_id": "1806.07042",
        "question": "Which aspects of response generation do they evaluate on?"
    },
    {
        "question_id": "e14e3e0944ec3290d1985e9a3da82a7df17575cd",
        "doc_id": "1806.07042",
        "question": "Which dataset do they evaluate on?"
    },
    {
        "question_id": "f637bba86cfb94ca8ac4b058faf839c257d5eaa0",
        "doc_id": "1806.07042",
        "question": "What model architecture do they use for the decoder?"
    },
    {
        "question_id": "0b5bf00d2788c534c4c6c007b72290c48be21e16",
        "doc_id": "1806.07042",
        "question": "Do they ensure the edited response is grammatical?"
    },
    {
        "question_id": "86c867b393db0ec4ad09abb48cc1353cac47ea4c",
        "doc_id": "1806.07042",
        "question": "What do they use as the pre-defined index of prototype responses?"
    },
    {
        "question_id": "9c2cacf77041e02d38f92a4c490df1e04552f96f",
        "doc_id": "1905.00472",
        "question": "Did they pre-train on existing sentiment corpora?"
    },
    {
        "question_id": "35cdaa0fff007add4a795850b139df80af7d1ffc",
        "doc_id": "1905.00472",
        "question": "What were the most salient features extracted by the models?"
    },
    {
        "question_id": "3de3a083b8ba3086792d38ae9667e095070f7f37",
        "doc_id": "1905.00472",
        "question": "How many languages are in the dataset?"
    },
    {
        "question_id": "04914917d01c9cd8718cd551dc253eb3827915d8",
        "doc_id": "1905.00472",
        "question": "Did the system perform well on low-resource languages?"
    },
    {
        "question_id": "37b0ee4a9d0df3ae3493e3b9114c3f385746da5c",
        "doc_id": "1905.08392",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "bba70f3cf4ca1e0bb8c4821e3339c655cdf515d6",
        "doc_id": "1905.08392",
        "question": "When the authors say their method largely outperforms the baseline, does this mean that the baseline performed better in some cases? If so, which ones?"
    },
    {
        "question_id": "c5f9894397b1a0bf6479f5fd9ee7ef3e38cfd607",
        "doc_id": "1905.08392",
        "question": "What baseline method was used?"
    },
    {
        "question_id": "9f8c0e02a7a8e9ee69f4c1757817cde85c7944bd",
        "doc_id": "1905.08392",
        "question": "What was the motivation for using a dependency tree based recursive architecture?"
    },
    {
        "question_id": "6cbbedb34da50286f44a0f3f6312346e876e2be5",
        "doc_id": "1905.08392",
        "question": "How was a causal diagram used to carefully remove this bias?"
    },
    {
        "question_id": "173060673cb15910cc310058bbb9750614abda52",
        "doc_id": "1905.08392",
        "question": "How does publicity bias the dataset?"
    },
    {
        "question_id": "98c8ed9019e43839ffb53a714bc37fbb1c28fe2c",
        "doc_id": "1905.08392",
        "question": "How do the speakers' reputations bias the dataset?"
    },
    {
        "question_id": "2ca3ca39d59f448e30be6798514709be7e3c62d8",
        "doc_id": "1603.01547",
        "question": "Which datasets did they use to train the model?"
    },
    {
        "question_id": "df7fb8e6e44c9c5af3f19dde762c75cbf2f8452f",
        "doc_id": "1603.01547",
        "question": "What is the performance of their model?"
    },
    {
        "question_id": "20e2b517fddb0350f5099c39b16c2ca66186d09b",
        "doc_id": "1603.01547",
        "question": "What baseline do they compare against?"
    },
    {
        "question_id": "70512cc9dcd45157e40c8d1f85e82d21ade7645b",
        "doc_id": "1603.01547",
        "question": "What datasets is the model evaluated on?"
    },
    {
        "question_id": "9133a85730c4090fe8b8d08eb3d9146efe7d7037",
        "doc_id": "1803.05160",
        "question": "Do the authors offer any potential reasons why cross-validation variants tend to overestimate the performance, while the sequential methods tend to underestimate it?"
    },
    {
        "question_id": "42279c3a202a93cfb4aef49212ccaf401a3f8761",
        "doc_id": "1803.05160",
        "question": "Which three variants of sequential validation are examined?"
    },
    {
        "question_id": "9ca85242ebeeafa88a0246986aa760014f6094f2",
        "doc_id": "1803.05160",
        "question": "Which three variants of cross-validation are examined?"
    },
    {
        "question_id": "8641156c4d67e143ebbabbd79860349242a11451",
        "doc_id": "1803.05160",
        "question": "Which European languages are targeted?"
    },
    {
        "question_id": "2a120f358f50c377b5b63fb32633223fa4ee2149",
        "doc_id": "1803.05160",
        "question": "In what way are sentiment classes ordered?"
    },
    {
        "question_id": "b57ad10468e1ba2a7a34396688dbb10a575d89f5",
        "doc_id": "1706.02427",
        "question": "What loss function is used?"
    },
    {
        "question_id": "9d6d17120c42a834b2b5d96f2120d646218ed4bb",
        "doc_id": "1706.02427",
        "question": "Do they use the unstructured text on the webpage that was the source of the table?"
    },
    {
        "question_id": "965e0ce975a0b8612a30cfc31bbfd4b8a57aa138",
        "doc_id": "1706.02427",
        "question": "Does their method rely on the column headings of the table?"
    },
    {
        "question_id": "8dfdd1ed805bb23c774fbb032ef1d97c6802e07c",
        "doc_id": "1706.02427",
        "question": "Are all the tables in the dataset from the same website?"
    },
    {
        "question_id": "c21675d8a90bda624d27e5535d1c10f08fcbc16b",
        "doc_id": "1706.02427",
        "question": "How are the tables extracted from the HTML?"
    },
    {
        "question_id": "14fdc8087f2a62baea9d50c4aa3a3f8310b38d17",
        "doc_id": "1909.12208",
        "question": "What supports the claim that enhancement in training is advisable as long as enhancement in test is at least as strong as in training?"
    },
    {
        "question_id": "3d2b5359259cd3518f361d760bacc49d84c40d82",
        "doc_id": "1909.12208",
        "question": "How does this single-system compares to system combination ones?"
    },
    {
        "question_id": "26a321e242e58ea5f2ceaf37f26566dd0d0a0da1",
        "doc_id": "1909.12208",
        "question": "What was previous single-system state of the art result on the CHiME-5 data?"
    },
    {
        "question_id": "6920fd470e6a99c859971828e20276a1b9912280",
        "doc_id": "1909.12208",
        "question": "How much is error rate reduced by cleaning up training data?"
    },
    {
        "question_id": "5370a0062aae7fa4e700ae47aa143be5c5fc6b22",
        "doc_id": "1811.04791",
        "question": "With how many languages do they experiment in the multilingual setup?"
    },
    {
        "question_id": "9a52a33d0ae5491c07f125454aea9a41b9babb82",
        "doc_id": "1811.04791",
        "question": "How do they extract target language bottleneck features?"
    },
    {
        "question_id": "8c46a26f9b0b41c656b5b55142d491600663defa",
        "doc_id": "1811.04791",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "e5f8d2fc1332e982a54ee4b4c1f7f55e900d0b86",
        "doc_id": "1811.04791",
        "question": "Which intrisic measures do they use do evaluate obtained representations?"
    },
    {
        "question_id": "a4a9971799c8860b50f219c93f050ebf6a627b3d",
        "doc_id": "1905.08067",
        "question": "What languages feature in the dataset?"
    },
    {
        "question_id": "778c6a27182349dc5275282c3e9577bda2555c3d",
        "doc_id": "1905.08067",
        "question": "What textual, psychological and behavioural patterns are observed in radical users?"
    },
    {
        "question_id": "42dcf1bb19b8470993c05e55413eed487b0f2559",
        "doc_id": "1905.08067",
        "question": "Where is the propaganda material sourced from?"
    },
    {
        "question_id": "2ecd12069388fd58ad5f8f4ae7ac1bb4f56497b9",
        "doc_id": "1905.08067",
        "question": "Which behavioural features are used?"
    },
    {
        "question_id": "824629b36a75753b1500d9dcaee0fc3c758297b1",
        "doc_id": "1905.08067",
        "question": "Which psychological features are used?"
    },
    {
        "question_id": "31894361833b3e329a1fb9ebf85a78841cff229f",
        "doc_id": "1905.08067",
        "question": "Which textual features are used?"
    },
    {
        "question_id": "2973fe3f5b4bf70ada02ac4a9087dd156cc3016e",
        "doc_id": "1608.01972",
        "question": "By how much does their similarity measure outperform BM25?"
    },
    {
        "question_id": "42269ed04e986ec5dc4164bf57ef306aec4a1ae1",
        "doc_id": "1608.01972",
        "question": "How do they represent documents when using their proposed similarity measure?"
    },
    {
        "question_id": "31a3ec8d550054465e55a26b0136f4d50d72d354",
        "doc_id": "1608.01972",
        "question": "How do they propose to combine BM25 and word embedding similarity?"
    },
    {
        "question_id": "a7e1b13cc42bfe78d37b9c943de6288e5f00f01b",
        "doc_id": "1608.01972",
        "question": "Do they use pretrained word embeddings to calculate Word Mover's distance?"
    },
    {
        "question_id": "e500948fa01c74e5cb3e6774f66aaa9ad4b3e435",
        "doc_id": "1812.05813",
        "question": "What was the inter-annotator agreement between the expert annotators?"
    },
    {
        "question_id": "b8b79a6123716cb9fabf751b31dff424235a2ee2",
        "doc_id": "1812.05813",
        "question": "How were missing hypotheses discovered?"
    },
    {
        "question_id": "7bd24920163a4801b34d0a50aed957ba8efed0ab",
        "doc_id": "2002.04815",
        "question": "How long is their sentiment analysis dataset?"
    },
    {
        "question_id": "df01e98095ba8765d9ab0d40c9e8ef34b64d3700",
        "doc_id": "2002.04815",
        "question": "What NLI dataset was used?"
    },
    {
        "question_id": "a7a433de17d0ee4dd7442d7df7de17e508baf169",
        "doc_id": "2002.04815",
        "question": "What aspects are considered?"
    },
    {
        "question_id": "abfa3daaa984dfe51289054f4fb062ce93f31d19",
        "doc_id": "2002.04815",
        "question": "What layer gave the better results?"
    },
    {
        "question_id": "71e1f06daf6310609d00850340e64a846fbe2dfb",
        "doc_id": "1908.09355",
        "question": "How many GPUs do they train their models on?"
    },
    {
        "question_id": "ebb4db9c24aa36db9954dd65ea079a798df80558",
        "doc_id": "1908.09355",
        "question": "What of the two strategies works best?"
    },
    {
        "question_id": "7a212a34e9dbb0ba52c40471842b2e0e3e14f276",
        "doc_id": "1908.09355",
        "question": "What downstream tasks are tested?"
    },
    {
        "question_id": "55bde89fc5822572f794614df3130d23537f7cf2",
        "doc_id": "2002.04745",
        "question": "How much is training speeded up?"
    },
    {
        "question_id": "523bc4e3482e1c9a8e0cb92cfe51eea92c20e8fd",
        "doc_id": "2002.04745",
        "question": "What experiments do they perform?"
    },
    {
        "question_id": "6073be8b88f0378cd0c4ffcad87e1327bc98b991",
        "doc_id": "2002.04745",
        "question": "What is mean field theory?"
    },
    {
        "question_id": "46aa61557c8d20b1223a30366a0704d7af68bbbe",
        "doc_id": "1910.07924",
        "question": "How is the sentence alignment quality evaluated?"
    },
    {
        "question_id": "b3b9d7c8722e8ec41cbbae40e68458485a5ba25c",
        "doc_id": "1910.07924",
        "question": "How is the speech alignment quality evaluated?"
    },
    {
        "question_id": "9186b2c5b7000ab7f15a46a47da73ea45544bace",
        "doc_id": "2003.03131",
        "question": "How is the model evaluated against the original recursive training algorithm?"
    },
    {
        "question_id": "d30b2fb5b29faf05cf5e04d0c587a7310a908d8c",
        "doc_id": "2003.03131",
        "question": "What is the improvement in performance compared to the linguistic gold standard?"
    },
    {
        "question_id": "526dc757a686a1fe41e77f7e3848e3507940bfc4",
        "doc_id": "2003.03131",
        "question": "What is the improvement in performance brought by lexicon pruning on a simple EM algorithm?"
    },
    {
        "question_id": "041529e15b70b21986adb781fd9b94b595e451ed",
        "doc_id": "1602.04341",
        "question": "what was the margin their system outperformed previous ones?"
    },
    {
        "question_id": "da2350395867b5fd4dbf968b5a1cd6921ab6dd37",
        "doc_id": "1602.04341",
        "question": "what prior approaches did they compare to?"
    },
    {
        "question_id": "f92c344e9b1a986754277fd0f08a47dc3e5f9feb",
        "doc_id": "1803.08419",
        "question": "What are the limitations of the currently used quantitative metrics? e.g. why are they not 'good'?"
    },
    {
        "question_id": "b10388e343868ca8e5c7c601ebb903f52e756e61",
        "doc_id": "1803.08419",
        "question": "What metrics are typically used to compare models?"
    },
    {
        "question_id": "e8cdeb3a081d51cc143c7090a54c82d393f1a2ca",
        "doc_id": "1803.08419",
        "question": "Is there a benchmark to compare the different approaches?"
    },
    {
        "question_id": "833d3ae7613500f2867ed8b33d233d71781014e7",
        "doc_id": "1803.08419",
        "question": "What GAN and RL approaches are used?"
    },
    {
        "question_id": "a1a0365bf6968cbdfd1072cf3923c26250bc955c",
        "doc_id": "1803.08419",
        "question": "What type of neural models are used?"
    },
    {
        "question_id": "64f7337970e8d1989b2e1f7106d86f73c4a3d0af",
        "doc_id": "1803.08419",
        "question": "What type of statistical models were used initially?"
    },
    {
        "question_id": "8fdb4f521d3ba4179f8ccc4c28ba399aab6c3550",
        "doc_id": "1803.08419",
        "question": "What was the proposed use of conversational agents in pioneering work?"
    },
    {
        "question_id": "a0d45b71feb74774cfdc0d5c6e23cd41bc6bc1f2",
        "doc_id": "1803.08419",
        "question": "What work pioneered the field of conversational agents?"
    },
    {
        "question_id": "7f8fc3c7d59aba80a3e7c839db6892a1fc329210",
        "doc_id": "1710.11027",
        "question": "Do they evaluate only on English datasets?"
    },
    {
        "question_id": "2d92ae6b36567e7edb6afdd72f97b06ac144fbdf",
        "doc_id": "1710.11027",
        "question": "What is the Ritter dataset?"
    },
    {
        "question_id": "a5df7361ae37b9512fb57cb93efbece9ded8cab1",
        "doc_id": "1710.11027",
        "question": "Does this model perform better than the state of the art?"
    },
    {
        "question_id": "915e4d0b3cb03789a20380ead961d473cb95bfc3",
        "doc_id": "1710.11027",
        "question": "What features are extracted from text?"
    },
    {
        "question_id": "c01a8b42fd27b0a3bec717ededd98b6d085a0f5c",
        "doc_id": "1710.11027",
        "question": "What features are extracted from images?"
    },
    {
        "question_id": "5d790459b05c5a3e6f1e698824444e55fc11890c",
        "doc_id": "1911.01770",
        "question": "What are two baseline methods?"
    },
    {
        "question_id": "1ef6471cc3e1eb10d2e92656c77020ca1612f08e",
        "doc_id": "1911.01770",
        "question": "How does model compare to the baselines?"
    },
    {
        "question_id": "cef3a26d8b46cd057bcc2abd3d648dc15336a2bf",
        "doc_id": "1910.03943",
        "question": "what is the cold-start problem?"
    },
    {
        "question_id": "636ac549cf4917c5922cd09a655abf278924c930",
        "doc_id": "1910.03943",
        "question": "how was the experiment evaluated?"
    },
    {
        "question_id": "c61c0b25f9de4a7ca2013d2e4aba8a5047e14ce4",
        "doc_id": "1910.03943",
        "question": "what other applications did they experiment in?"
    },
    {
        "question_id": "1d047286ac63e5dca1ab811172b89d7d125679e5",
        "doc_id": "1910.03943",
        "question": "what dataset was used for training?"
    },
    {
        "question_id": "e3c2b6fcf77a7b1c76add2e6e1420d07c29996ea",
        "doc_id": "1606.07947",
        "question": "Which knowledge destilation methods do they introduce?"
    },
    {
        "question_id": "ee2c2fb01d67f4c58855bf23186cbd45cecbfa56",
        "doc_id": "1606.07947",
        "question": "What type of weight pruning do they use?"
    },
    {
        "question_id": "f77d7cddef3e021d70e16b9e16cecfd4b8ee80d3",
        "doc_id": "1606.07947",
        "question": "Which dataset do they train on?"
    },
    {
        "question_id": "a0197894ee94b01766fa2051f50f84e16b5c9370",
        "doc_id": "1606.07947",
        "question": "Do they reason why greedy decoding works better then beam search?"
    },
    {
        "question_id": "0bde3ecfdd7c4a9af23f53da2cda6cd7a8398220",
        "doc_id": "1804.07445",
        "question": "what language was the data in?"
    },
    {
        "question_id": "f7ee48dd32c666ef83a4ae4aa06bcde85dd8ec4b",
        "doc_id": "1804.07445",
        "question": "what was the baseline?"
    },
    {
        "question_id": "051034cc94f2c02d3041575c53f969b3311c9ea1",
        "doc_id": "1804.07445",
        "question": "which automatic metrics were used in evaluation?"
    },
    {
        "question_id": "511e46b5aa8e1ee9e7dc890f47fa15ef94d4a0af",
        "doc_id": "1804.07445",
        "question": "how do humans judge the simplified sentences?"
    },
    {
        "question_id": "6b4006a90aeaaff8914052d72d28851a9c0c0146",
        "doc_id": "1804.07445",
        "question": "what datasets were used?"
    },
    {
        "question_id": "2dba0b83fc22995f83e7ac66cc8f68bcdcc70ee9",
        "doc_id": "1909.04251",
        "question": "Do humans assess the quality of the generated responses?"
    },
    {
        "question_id": "a8cc891bb8dccf0d32c1c9cd1699d5ead0eed711",
        "doc_id": "1909.04251",
        "question": "What models are used to generate responses?"
    },
    {
        "question_id": "8330242b56b63708a23c6a92db4d4bcf927a4576",
        "doc_id": "1909.04251",
        "question": "What types of hate speech are considered?"
    },
    {
        "question_id": "c19e9fd2f1c969e023fb99b74e78eb1f3db8e162",
        "doc_id": "2003.13016",
        "question": "Was the automatic annotation evaluated?"
    },
    {
        "question_id": "d6b0c71721ed24ef1d9bd31ed3a266b0c7fc9b57",
        "doc_id": "1704.06960",
        "question": "What dataset is used?"
    },
    {
        "question_id": "22225ba18a6efe74b1315cc08405011d5431498e",
        "doc_id": "1705.00571",
        "question": "Do they use external financial knowledge in their approach?"
    },
    {
        "question_id": "bd3562d2b3c162e9d27404d56b77e15f707d8b0f",
        "doc_id": "1705.00571",
        "question": "Which evaluation metrics do they use?"
    },
    {
        "question_id": "9c529bd3f7565b2178a79aae01c98c90f9d372ad",
        "doc_id": "1705.00571",
        "question": "Which finance specific word embedding model do they use?"
    },
    {
        "question_id": "344238de7208902f7b3a46819cc6d83cc37448a0",
        "doc_id": "1908.06024",
        "question": "Did the survey provide insight into features commonly found to be predictive of abusive content on online platforms?"
    },
    {
        "question_id": "56bbca3fe24c2e9384cc57f55f35f7f5ad5c5716",
        "doc_id": "1908.06024",
        "question": "Is deep learning the state-of-the-art method in automated abuse detection"
    },
    {
        "question_id": "4c40fa01f626def0b69d1cb7bf9181b574ff6382",
        "doc_id": "1908.06024",
        "question": "What datasets were used in this work?"
    },
    {
        "question_id": "71b29ab3ddcdd11dcc63b0bb55e75914c07a2217",
        "doc_id": "1908.06024",
        "question": "How is abuse defined for the purposes of this research?"
    },
    {
        "question_id": "7cf44877dae8873139aede381fb9908dd0c546c4",
        "doc_id": "1703.04357",
        "question": "What is the architecture of the model?"
    },
    {
        "question_id": "86de8de906e30bb2224a2f70f6e5cf5e5ad4be72",
        "doc_id": "1703.04357",
        "question": "How many translation pairs are used for training?"
    },
    {
        "question_id": "fea9b4d136156f23a88e5c7841874a467f2ba86d",
        "doc_id": "1908.06259",
        "question": "How do they determine that a decoder handles an easier task than the encoder?"
    },
    {
        "question_id": "4e59808a7f73ac499b9838d3c0ce814196a02473",
        "doc_id": "1908.06259",
        "question": "How do they measure conditional information strength?"
    },
    {
        "question_id": "7ef7a5867060f91eac8ad857c186e51b767c734b",
        "doc_id": "1908.06259",
        "question": "How do they generate input noise for the encoder and decoder?"
    },
    {
        "question_id": "31cba86bc45970337ba035ecf36d8954a9a5206a",
        "doc_id": "2003.09244",
        "question": "What private companies are members of consortium?"
    },
    {
        "question_id": "3a25f82512d56d9e1ffba72f977f515ae3ba3cca",
        "doc_id": "2003.09244",
        "question": "Does programme plans gathering and open sourcing some large dataset for Icelandic language?"
    },
    {
        "question_id": "b59f3a58939f7ac007d3263a459c56ebefc4b49a",
        "doc_id": "2003.09244",
        "question": "What concrete software is planned to be developed by the end of the programme?"
    },
    {
        "question_id": "b4b7333805cb6fdde44907747887a971422dc298",
        "doc_id": "2003.09244",
        "question": "What other national language technology programs are described in the paper?"
    },
    {
        "question_id": "871f7661f5a3da366b0b5feaa36f54fd3dedae8e",
        "doc_id": "2003.09244",
        "question": "When did language technology start in Iceland?"
    },
    {
        "question_id": "3746aaa1a81d9c725bc7a4a67086634c11998d39",
        "doc_id": "1711.11017",
        "question": "Do the authors provide any benchmark tasks in this new environment?"
    },
    {
        "question_id": "80d425258d027e3ca3750375d170debb9d92fbc6",
        "doc_id": "1903.00384",
        "question": "Can their method be transferred to other Q&A platforms (in other languages)?"
    },
    {
        "question_id": "2ae66798333b905172e2c0954e9808662ab7f221",
        "doc_id": "1903.00384",
        "question": "What measures of quality do they use for a Q&A platform?"
    },
    {
        "question_id": "6e040e80f2da69d50386a90a38ed6d2fa4f77bbd",
        "doc_id": "1911.04474",
        "question": "Which NER dataset do they use?"
    },
    {
        "question_id": "aebd1f0d728d0de5f76238844da044a44109f76f",
        "doc_id": "1911.04474",
        "question": "How do they incorporate direction and relative distance in attention?"
    },
    {
        "question_id": "cb4086ad022197da79f28dc609d0de90108c4543",
        "doc_id": "1911.04474",
        "question": "Do they outperform current NER state-of-the-art models?"
    },
    {
        "question_id": "b8a3ab219be6c1e6893fe80e1fbf14f0c0c3c97c",
        "doc_id": "1909.09070",
        "question": "What datasets are used in this paper?"
    },
    {
        "question_id": "780c7993d446cd63907bb38992a60bbac9cb42b1",
        "doc_id": "1909.09070",
        "question": "What language are the captions in?"
    },
    {
        "question_id": "3da4606a884593f7702d098277b9a6ce207c080b",
        "doc_id": "1909.09070",
        "question": "What ad-hoc approaches are explored?"
    },
    {
        "question_id": "91336f12ab94a844b66b607f8621eb8bbd209f32",
        "doc_id": "1909.09070",
        "question": "What supervised baselines did they compare with?"
    },
    {
        "question_id": "c5221bb28e58a4f13cf2eccce0e1b1bec7dd3c13",
        "doc_id": "1909.09070",
        "question": "Is the data specific to a domain?"
    },
    {
        "question_id": "42a4ab4607a9eec42c427a817b7e898230d26444",
        "doc_id": "1909.09070",
        "question": "Where do their figure and captions come from?"
    },
    {
        "question_id": "622efbecd9350a0f4487bdff2b8b362ef2541f3c",
        "doc_id": "1708.05521",
        "question": "did the top teams experiment with lexicons?"
    },
    {
        "question_id": "f54e19f7ecece1bb0ef3171403ae322ad572ff00",
        "doc_id": "1708.05521",
        "question": "did they experiment with lexicons?"
    },
    {
        "question_id": "4137a82d7752be7a6c142ceb48ce784fd475fb06",
        "doc_id": "1708.05521",
        "question": "what was the baseline?"
    },
    {
        "question_id": "6c50871294562e4886ede804574e6acfa8d1a5f9",
        "doc_id": "1708.05521",
        "question": "what was their result?"
    },
    {
        "question_id": "0ac6fbd81e2dd95b800283dc7e59ce969d45fc02",
        "doc_id": "1708.05521",
        "question": "what dataset was used?"
    },
    {
        "question_id": "8da8c4651979a4b1d1d3008c1f77bc7e9397183b",
        "doc_id": "1802.02614",
        "question": "how does end of utterance and token tags affect the performance"
    },
    {
        "question_id": "8cf52ba480d372fc15024b3db704952f10fdca27",
        "doc_id": "1802.02614",
        "question": "what are the baselines?"
    },
    {
        "question_id": "d8ae36ae1b4d3af5b59ebd24efe94796101c1c12",
        "doc_id": "1802.02614",
        "question": "what kind of conversations are in the douban conversation corpus?"
    },
    {
        "question_id": "2bd702174e915d97884d1571539fb1b5b0b7123a",
        "doc_id": "1802.02614",
        "question": "what pretrained word embeddings are used?"
    },
    {
        "question_id": "6da1320fa25b2b6768358d3233a5ecf99cc73db5",
        "doc_id": "1810.12897",
        "question": "What set of topics are looked at?"
    },
    {
        "question_id": "351f7b254e80348221e0654478663a5e53d3fe65",
        "doc_id": "1810.12897",
        "question": "What were the baselines?"
    },
    {
        "question_id": "d323f0d65b57b30ae85fb9f24298927a3d1216e9",
        "doc_id": "1810.12897",
        "question": "Which widely used dataset did the authors use?"
    },
    {
        "question_id": "36cb7ebdd39e0b8a89ff946d3a3aef8a76a6bb43",
        "doc_id": "1908.09919",
        "question": "Are LSA-reduced n-gram features considered hand-crafted features?"
    },
    {
        "question_id": "28e50459da60ceda49fe1578c12f3f805b288bd0",
        "doc_id": "1908.09919",
        "question": "What is the performance of the model on English, Spanish and Arabic?"
    },
    {
        "question_id": "e1f61500eb733f2b95692b6a9a53f8aaa6f1e1f6",
        "doc_id": "1908.09919",
        "question": "How is this model different from a LSTM?"
    },
    {
        "question_id": "380e71848d4b0d1e983d504b1249119612f00bcb",
        "doc_id": "1706.00188",
        "question": "What deep learning methods do they look at?"
    },
    {
        "question_id": "21c89ee0281f093b209533453196306b9699b552",
        "doc_id": "1706.00188",
        "question": "What is their baseline?"
    },
    {
        "question_id": "5096aaea2d0f4bea4c12e14f4f7735e1aea1bfa6",
        "doc_id": "1706.00188",
        "question": "Which architectures do they experiment with?"
    },
    {
        "question_id": "452e2d7d7d9e1bb4914903479cd7caff9f6fae42",
        "doc_id": "1706.00188",
        "question": "Are pretrained embeddings used?"
    },
    {
        "question_id": "1e582319df1739dcd07ba0ba39e8f70187fba049",
        "doc_id": "1709.04005",
        "question": "what is the average number of speakers in the dataset?"
    },
    {
        "question_id": "aaf2445e78348dba66d7208b7430d25364e11e46",
        "doc_id": "1709.04005",
        "question": "by how much is accuracy improved?"
    },
    {
        "question_id": "d98148f65d893101fa9e18aaf549058712485436",
        "doc_id": "1709.04005",
        "question": "what are the previous state of the art systems?"
    },
    {
        "question_id": "3f8a42eb0e904ce84c3fded2103f674e9cbc893d",
        "doc_id": "1911.00202",
        "question": "What is the training objective in the method introduced in this paper?"
    },
    {
        "question_id": "521a3e7300567f6e8e4c531f223dbc9fc306c393",
        "doc_id": "1911.00202",
        "question": "Does regularization of the fine-tuning process hurt performance in the target domain?"
    },
    {
        "question_id": "b6dae03d56dff0db8ad2a1bff9c7dd3f87551cd1",
        "doc_id": "1811.00127",
        "question": "Do they release their code?"
    },
    {
        "question_id": "f93bad406e004014618dd64f6c604b1a9ee6a371",
        "doc_id": "1811.00127",
        "question": "What media sources do they use?"
    },
    {
        "question_id": "4eb42c5d56d695030dd47ea7f6d65164924c4017",
        "doc_id": "1910.09387",
        "question": "What domain do the audio samples fall under?"
    },
    {
        "question_id": "eff9192e05d23e9a67d10be0c89a7ab2b873995b",
        "doc_id": "1910.09387",
        "question": "How did they evaluate the quality of annotations?"
    },
    {
        "question_id": "87523fb927354ddc8ad1357a81f766b7ea95f53c",
        "doc_id": "1910.09387",
        "question": "How many annotators did they have?"
    },
    {
        "question_id": "9e9aa8af4b49e2e1e8cd9995293a7982ea1aba0e",
        "doc_id": "1910.09387",
        "question": "What is their baseline method?"
    },
    {
        "question_id": "9baca9bdb8e7d5a750f8cbe3282beb371347c164",
        "doc_id": "1901.05389",
        "question": "How do they preprocess Tweets?"
    },
    {
        "question_id": "2cb20bae085b67e357ab1e18ebafeac4bbde5b4a",
        "doc_id": "1901.05389",
        "question": "What kind of inference model do they build to estimate socioeconomic status?"
    },
    {
        "question_id": "892ee7c2765b3764312c3c2b6f4538322efbed4e",
        "doc_id": "1901.05389",
        "question": "How much data do they gather in total?"
    },
    {
        "question_id": "c68946ae2e548ec8517c7902585c032b3f3876e6",
        "doc_id": "1901.05389",
        "question": "Do they analyze features which help indicate socioeconomic status?"
    },
    {
        "question_id": "7557f2c3424ae70e2a79c51f9752adc99a9bdd39",
        "doc_id": "1901.05389",
        "question": "What inference models are used?"
    },
    {
        "question_id": "b03249984c26baffb67e7736458b320148675900",
        "doc_id": "1901.05389",
        "question": "What baseline model is used?"
    },
    {
        "question_id": "9595fdf7b51251679cd39bc4f6befc81f09c853c",
        "doc_id": "1901.05389",
        "question": "How is the remotely sensed data annotated?"
    },
    {
        "question_id": "08c0d4db14773cbed8a63e69381a2265e85f8765",
        "doc_id": "1901.05389",
        "question": "Where are the professional profiles crawled from?"
    },
    {
        "question_id": "389ff1927ba9fc8bac50959fc09f30c2143cc14e",
        "doc_id": "1804.09692",
        "question": "What downstream tasks are explored?"
    },
    {
        "question_id": "b968bd264995cd03d7aaad1baba1838c585ec909",
        "doc_id": "1804.09692",
        "question": "What factors contribute to the stability of the word embeddings?"
    },
    {
        "question_id": "afcd1806b931a97c0679f873a71b825e668f2b75",
        "doc_id": "1804.09692",
        "question": "How is instability defined?"
    },
    {
        "question_id": "01c8c3836467a4399cc37e86244b5bdc5dda2401",
        "doc_id": "1804.09692",
        "question": "What embedding algorithms are explored?"
    },
    {
        "question_id": "65c9aee2051ff7c47112b2aee0d928d9b6a8c2fe",
        "doc_id": "2003.04978",
        "question": "Which datasets do they use?"
    },
    {
        "question_id": "f8264609a44f059b74168995ffee150182a0c14f",
        "doc_id": "2003.04978",
        "question": "What models are explored in this paper?"
    },
    {
        "question_id": "d9354c0bb32ec037ff2aacfed58d57887a713163",
        "doc_id": "1707.07212",
        "question": "What languages are used as input?"
    },
    {
        "question_id": "c035a011b737b0a10deeafc3abe6a282b389d48b",
        "doc_id": "1707.07212",
        "question": "What are the components of the classifier?"
    },
    {
        "question_id": "d3fb0d84d763cb38f400b7de3daaa59ed2a1b0ab",
        "doc_id": "1707.07212",
        "question": "Which uncertain outcomes are forecast using the wisdom of crowds?"
    },
    {
        "question_id": "63cdac43a643fc1e06da44910458e89b2c7cd921",
        "doc_id": "1904.03670",
        "question": "How was the dataset collected?"
    },
    {
        "question_id": "74acaa205a5998af4ad7edbed66837a6f2b5c58b",
        "doc_id": "1712.00733",
        "question": "What are the baselines for this paper?"
    },
    {
        "question_id": "cfcf94b81589e7da215b4f743a3f8de92a6dda7a",
        "doc_id": "1712.00733",
        "question": "What VQA datasets are used for evaluating this task? "
    },
    {
        "question_id": "d147117ef24217c43252d917d45dff6e66ff807c",
        "doc_id": "1712.00733",
        "question": "How do they model external knowledge? "
    },
    {
        "question_id": "1a2b69dfa81dfeadd67b133229476086f2cc74a8",
        "doc_id": "1712.00733",
        "question": "What type of external knowledge has been used for this paper? "
    },
    {
        "question_id": "cdb211be0340bb18ba5a9ee988e9df0e2ba8b793",
        "doc_id": "1609.02075",
        "question": "Does the paper discuss limitations of considering only data from Twitter?"
    },
    {
        "question_id": "4cb2e80da73ae36de372190b4c1c490b72977ef8",
        "doc_id": "1609.02075",
        "question": "Did they represent tie strength only as number of social ties in a network?"
    },
    {
        "question_id": "a064337bafca8cf01e222950ea97ebc184c47bc0",
        "doc_id": "1609.02075",
        "question": "What sociolinguistic variables (phonetic spellings) did they analyze? "
    },
    {
        "question_id": "993d5bef2bf1c0cd537342ef76d4b952f0588b83",
        "doc_id": "1609.02075",
        "question": "What older dialect markers did they explore?"
    },
    {
        "question_id": "5ed02ae6c534cd49d405489990f0e4ba0330ff1b",
        "doc_id": "2004.04124",
        "question": "Does LadaBERT ever outperform its knowledge distillation teacher in terms of accuracy on some problems?"
    },
    {
        "question_id": "f6346828c2f44529dc307abf04dd246bfeb4a9b2",
        "doc_id": "2004.04124",
        "question": "Do they evaluate which compression method yields the most gains?"
    },
    {
        "question_id": "935873b97872820b7b6100d6a785fba286b94900",
        "doc_id": "2004.04124",
        "question": "On which datasets does LadaBERT achieve state-of-the-art?"
    },
    {
        "question_id": "952fe4fbf4e0bcfcf44fab2dbd3ed85dd961eff3",
        "doc_id": "1912.07940",
        "question": "Do the tweets fall under a specific domain?"
    },
    {
        "question_id": "1dc5bf9dca7de2ba21db10e9056d3906267ef5d5",
        "doc_id": "1912.07940",
        "question": "How many tweets are in the dataset?"
    },
    {
        "question_id": "8faec509406d33444bd620afc829adc9eae97644",
        "doc_id": "1912.07940",
        "question": "What categories do they look at?"
    },
    {
        "question_id": "5ae005917efc17a505ba1ba5e996c4266d6c74b6",
        "doc_id": "1806.06571",
        "question": "Did they use the same dataset as Skip-gram to train?"
    },
    {
        "question_id": "72c04eb3fc323c720f7f8da75c70f09a35abf3e6",
        "doc_id": "1806.06571",
        "question": "How much were the gains they obtained?"
    },
    {
        "question_id": "a554cd1ba2a8d1348a898e0cb4b4c16cc8998257",
        "doc_id": "1911.00461",
        "question": "Do the authors evaluate only on English datasets?"
    },
    {
        "question_id": "3cc9a820c4a2cd2ff61da920c41ed09f3c0135be",
        "doc_id": "1911.00461",
        "question": "What metrics of gender bias amplification are used to demonstrate the effectiveness of this approach?"
    },
    {
        "question_id": "95ef89dc29ff291bdbe48cb956329a6a06d36db8",
        "doc_id": "1911.00461",
        "question": "How is representation learning decoupled from memory management in this architecture?"
    },
    {
        "question_id": "da21bcaa8e3a9eadc8a5194fd57ae797e93c3049",
        "doc_id": "2004.01970",
        "question": "what text classification datasets do they evaluate on?"
    },
    {
        "question_id": "363a24ecb8ab45215134935e7e8165fff72ff90f",
        "doc_id": "2004.01970",
        "question": "which models is their approach compared to?"
    },
    {
        "question_id": "3d1ad8a4aaa2653d0095bafba74738bd20795acf",
        "doc_id": "1909.07158",
        "question": "what dataset were used?"
    },
    {
        "question_id": "ec54ae2f4811196fcaafa45e76130239e69995f9",
        "doc_id": "1909.07158",
        "question": "what was the baseline?"
    },
    {
        "question_id": "5102dc911913e9ca0311253e44fd31c73eed0a57",
        "doc_id": "1909.07158",
        "question": "what text embedding methods were used in their approach?"
    },
    {
        "question_id": "f0404673085517eea708c5e91f32fb0f7728fa08",
        "doc_id": "1607.03895",
        "question": "What data is used in this work?"
    },
    {
        "question_id": "810e6d09813486a64e87ef6c1fb9b1e205871632",
        "doc_id": "1912.03010",
        "question": "How do they define their tokens (words, word-piece)?"
    },
    {
        "question_id": "ab8b0e6912a7ca22cf39afdac5531371cda66514",
        "doc_id": "1912.03010",
        "question": "By how much do they outperform existing state-of-the-art model on end-to-end Speech recognition?"
    },
    {
        "question_id": "58259f2e22363aab20c448e5dd7b6f432556b32d",
        "doc_id": "2003.04707",
        "question": "How do they interpret the model?"
    },
    {
        "question_id": "b9e0b1940805a5056f71c66d176cc87829e314d4",
        "doc_id": "2003.04707",
        "question": "Do they compare their approach to data-driven only methods?"
    },
    {
        "question_id": "b54525a0057aa82b73773fa4dacfd115d8f86f1c",
        "doc_id": "2003.04707",
        "question": "What are the two applications of neuro-symbolism?"
    },
    {
        "question_id": "2d924e888a92dc0b14cdb5584e73e87254c3d1ee",
        "doc_id": "1707.09816",
        "question": "Do they reduce language variation of text by enhancing frequencies?"
    },
    {
        "question_id": "3ed8ac1ba4df6609fa7de5077d83e820641edc5e",
        "doc_id": "1707.09816",
        "question": "Which domains do they explore?"
    },
    {
        "question_id": "e1ab241059ef1700738f885f051d724a7fcf283a",
        "doc_id": "1707.09816",
        "question": "Which thesauri did they use?"
    },
    {
        "question_id": "5752c8d333afc1e6c666b18d1477c8f669b7a602",
        "doc_id": "1707.02892",
        "question": "Do they compare against state-of-the-art?"
    },
    {
        "question_id": "fcdafaea5b1c9edee305b81f6865efc8b8dc50d3",
        "doc_id": "1707.02892",
        "question": "What are the benchmark datasets?"
    },
    {
        "question_id": "91d4fd5796c13005fe306bcd895caaed7fa77030",
        "doc_id": "1707.02892",
        "question": "What tasks are the models trained on?"
    },
    {
        "question_id": "27d7a30e42921e77cfffafac5cb0d16ce5a7df99",
        "doc_id": "1707.02892",
        "question": "What recurrent neural networks are explored?"
    },
    {
        "question_id": "58a3cfbbf209174fcffe44ce99840c758b448364",
        "doc_id": "1707.05589",
        "question": "what are the recent models they compare with?"
    },
    {
        "question_id": "6c6e06f7bfb6d30003fd3801fdaf34649ef1b8f4",
        "doc_id": "1707.05589",
        "question": "what were their results on the hutter prize dataset?"
    },
    {
        "question_id": "b6e97d1b1565732b1b3f1d74e6d2800dd21be37a",
        "doc_id": "1707.05589",
        "question": "what was their newly established state of the art results?"
    },
    {
        "question_id": "4f8b078b9f60be30520fd32a3d8601ab3babb5c0",
        "doc_id": "1707.05589",
        "question": "what regularisation methods did they look at?"
    },
    {
        "question_id": "54517cded8267ea6c9a3f3cf9c37a8d24b3f7c2c",
        "doc_id": "1707.05589",
        "question": "what architectures were reevaluated?"
    },
    {
        "question_id": "211c242c028b35bb9cbd5e303bb6c750f859fd34",
        "doc_id": "1803.08614",
        "question": "Do any of their reviews contain translations for both Catalan and Basque?"
    },
    {
        "question_id": "9b05d5f723a8a452522907778a084b52e27fd924",
        "doc_id": "1803.08614",
        "question": "What is the size of their published dataset?"
    },
    {
        "question_id": "21175d8853fd906266f884bced85c598c35b1cbc",
        "doc_id": "1803.08614",
        "question": "How many annotators do they have for their dataset?"
    },
    {
        "question_id": "79e61134a6e29141cd19252571ffc92a0b4bc97f",
        "doc_id": "1907.05338",
        "question": "did they test with other pretrained models besides bert?"
    },
    {
        "question_id": "18fbfb1f88c5487f739aceffd23210a7d4057145",
        "doc_id": "1907.05338",
        "question": "what models did they compare with?"
    },
    {
        "question_id": "5d3e87937ecebf0695bece08eccefb2f88ad4a0f",
        "doc_id": "1907.05338",
        "question": "what datasets were used for testing?"
    },
    {
        "question_id": "475ef4ad32a8589dae9d97048166d732ae5d7beb",
        "doc_id": "1906.01502",
        "question": "Which languages with different script do they look at?"
    },
    {
        "question_id": "3fd8eab282569b1c18b82f20d579b335ae70e79f",
        "doc_id": "1906.01502",
        "question": "What languages do they experiment with?"
    },
    {
        "question_id": "8e9561541f2e928eb239860c2455a254b5aceaeb",
        "doc_id": "1906.01502",
        "question": "What language pairs are affected?"
    },
    {
        "question_id": "50c1bf8b928069f3ffc7f0cb00aa056a163ef336",
        "doc_id": "1906.01502",
        "question": "What evaluation metrics are used?"
    },
    {
        "question_id": "2ddfb40a9e73f382a2eb641c8e22bbb80cef017b",
        "doc_id": "1906.01502",
        "question": "What datasets did they use?"
    },
    {
        "question_id": "06d5de706348dbe8c29bfacb68ce65a2c55d0391",
        "doc_id": "1604.05559",
        "question": "What is the computational complexity of old method"
    },
    {
        "question_id": "6014c2219d29bae17279625716e7c2a1f8a2bd05",
        "doc_id": "1604.05559",
        "question": "Could you tell me more about the old method?"
    },
    {
        "question_id": "fd8a8eb69f07c584a76633f8802c2746f7236d64",
        "doc_id": "1911.03642",
        "question": "Do the authors report only on English"
    },
    {
        "question_id": "452e978bd597411b65be757bf47dc6a78f3c67c9",
        "doc_id": "1911.03642",
        "question": "How does counterfactual data augmentation affect gender bias in predictions and performance?"
    },
    {
        "question_id": "159025c44c0115ab4cdc253885384f72e592e83a",
        "doc_id": "1911.03642",
        "question": "How does hard debiasing affect gender bias in prediction and performance?"
    },
    {
        "question_id": "6590055fb033cb32826f2afecb3d7f607dd97d57",
        "doc_id": "1911.03642",
        "question": "How does name anonymization affect gender bias in predictions and performance?"
    },
    {
        "question_id": "3435e365adf7866e45670c865dc33bb7d2a6a0c6",
        "doc_id": "1911.03642",
        "question": "How are the sentences in WikiGenderBias curated?"
    },
    {
        "question_id": "1b1849ad0bdd79c6645572849fe7873ec7bd7e6d",
        "doc_id": "1608.07836",
        "question": "Are reddit and twitter datasets, which are fairly prevalent, not effective in addressing these problems?"
    },
    {
        "question_id": "ff2b58c90784eda6dddd8a92028e6432442c1093",
        "doc_id": "1809.03680",
        "question": "By how much do they outperform baselines?"
    },
    {
        "question_id": "5e4eac0b0a73d465d74568c21819acaec557b700",
        "doc_id": "1809.03680",
        "question": "Which baselines do they use?"
    },
    {
        "question_id": "bc6ad5964f444cf414b661a4b942dafb7640c564",
        "doc_id": "1809.03680",
        "question": "Which datasets do they evaluate on?"
    },
    {
        "question_id": "4c7ec282697f4f6646eb1c19f46bbaf8670b0de6",
        "doc_id": "1908.05908",
        "question": "What is the weak supervision signal used in Baidu Baike corpus?"
    },
    {
        "question_id": "07104dd36a0e7fdd2c211ad710de9a605495b697",
        "doc_id": "1908.05908",
        "question": "How is BERT optimized for this task?"
    },
    {
        "question_id": "3e88fcc94d0f451e87b65658751834f6103b2030",
        "doc_id": "1908.05908",
        "question": "What is a soft label?"
    },
    {
        "question_id": "3fafde90eebc1c00ba6c3fb4c5b984009393ce7f",
        "doc_id": "1709.04491",
        "question": "what was their accuracy result?"
    },
    {
        "question_id": "e6bc11bd6cfd4b2138c29602b9b56fc5378a4293",
        "doc_id": "1709.04491",
        "question": "what domain do the opinions fall under?"
    },
    {
        "question_id": "90829d5fde4f5f0ffc184c5e8fc64c8ac5ece521",
        "doc_id": "1709.04491",
        "question": "what was the baseline?"
    },
    {
        "question_id": "4748a50c96acb1aa03f7efd1b43376c193b2450a",
        "doc_id": "1709.04491",
        "question": "what dataset was used?"
    },
    {
        "question_id": "ac87dd34d28c3edd9419fa0145f3d38c87d696aa",
        "doc_id": "1807.08089",
        "question": "What is the dataset that is used to train the embeddings?"
    },
    {
        "question_id": "e66a88eecf8d5d093caec1f487603534f88dd7e7",
        "doc_id": "1807.08089",
        "question": "What speaker characteristics are used?"
    },
    {
        "question_id": "fef5b65263c81299acc350a101dabaf5a8cb9c6e",
        "doc_id": "1807.08089",
        "question": "What language is used for the experiments?"
    },
    {
        "question_id": "f40e23adc8245562c8677f0f86fa5175179b5422",
        "doc_id": "1807.08089",
        "question": "Is the embedding model tested in any downstream task?"
    },
    {
        "question_id": "82b93ecd2397e417e1e80f93b7cf49c7bd9aeec3",
        "doc_id": "1708.03699",
        "question": "How much gain in performance was obtained with user embeddings?"
    },
    {
        "question_id": "bb7c80ab28c2aebfdd0bd90b22a55dbdf3a8ed5b",
        "doc_id": "1612.02695",
        "question": "What type of attention is used in the recognition system?"
    },
    {
        "question_id": "6c4e1a1ccc0c5c48115864a6928385c248f4d8ad",
        "doc_id": "1612.02695",
        "question": "What are the solutions proposed for the seq2seq shortcomings?"
    },
    {
        "question_id": "552b1c813f25bf39ace6cd5eefa56f4e4dd70c84",
        "doc_id": "1911.03854",
        "question": "What classification tasks do they experiment on?"
    },
    {
        "question_id": "1100e442e00c9914538a32aca7af994ce42e1b66",
        "doc_id": "1911.03854",
        "question": "What categories of fake news are in the dataset?"
    },
    {
        "question_id": "7d8cd7d6c86349ef0bd4fdbd84c8dc49c7678f46",
        "doc_id": "1709.06365",
        "question": "Which real world datasets do they experiment on?"
    },
    {
        "question_id": "0fee37ebe0a010cf8bd665fa566306d8e7d12631",
        "doc_id": "1709.06365",
        "question": "Which other models that incorporate meta information do they compare against?"
    },
    {
        "question_id": "f8bba20d1781ce2b14fad28d6eff024e5a6c2c02",
        "doc_id": "1709.06365",
        "question": "How do they measure topic quality?"
    },
    {
        "question_id": "252599e53f52b3375b26d4e8e8b66322a42d2563",
        "doc_id": "1709.06365",
        "question": "Which data augmentation techniques do they use?"
    },
    {
        "question_id": "dde29d9ea5859aa5a4bcd613dca80aec501ef03a",
        "doc_id": "1610.03955",
        "question": "Does their model use MFCC?"
    },
    {
        "question_id": "9b1382b44dc69f7ee20acf952f7ceb1c3ef83965",
        "doc_id": "1610.03955",
        "question": "What is the problem of session segmentation?"
    },
    {
        "question_id": "3c414f7fbf577dfd3363be6bbc9eba8bdd01f45f",
        "doc_id": "1610.03955",
        "question": "What dataset do they use?"
    },
    {
        "question_id": "81e101b2c803257492d67a00e8a1d9a07cbab136",
        "doc_id": "1909.05372",
        "question": "How does Overton handle contradictory or incomplete supervision data?"
    },
    {
        "question_id": "b942d94e4187e4fdc706cfdf92e3a869fc294911",
        "doc_id": "1909.05372",
        "question": "What are high level declarative abstractions Overton provides?"
    },
    {
        "question_id": "8ffae517bc0efa453b7e316d41bd9f1b6679b158",
        "doc_id": "1909.05372",
        "question": "How are applications presented in Overton?"
    },
    {
        "question_id": "0fd2854dd8d8191f00c8d12483b5a81a04de859f",
        "doc_id": "1909.05372",
        "question": "Does Overton support customizing deep learning models without writing any code?"
    },
    {
        "question_id": "000549a217ea24432c0656598279dbb85378c113",
        "doc_id": "1804.05253",
        "question": "Do they evaluate only on English datasets?"
    },
    {
        "question_id": "63d2e97657419a0185127534f4ff9d0039cb1a63",
        "doc_id": "1804.05253",
        "question": "What type of frequency analysis was used?"
    },
    {
        "question_id": "43f43b135109ebd1d2d1f9af979c64ce550b5f0f",
        "doc_id": "1804.05253",
        "question": "What type of classifiers were used?"
    },
    {
        "question_id": "e797634fa77e490783b349034f9e095ee570b7a9",
        "doc_id": "1804.05253",
        "question": "Who annotated the Twitter and Reddit data for irony?"
    },
    {
        "question_id": "5181527e6a61a9a192db5f8064e56ec263c42661",
        "doc_id": "1909.11980",
        "question": "What language(s) does the system answer questions in?"
    },
    {
        "question_id": "334aa5540c207768931a0fe78aa4981a895ba37c",
        "doc_id": "1909.11980",
        "question": "What metrics are used for evaluation?"
    },
    {
        "question_id": "b8bbdc3987bb456739544426c6037c78ede01b77",
        "doc_id": "1909.11980",
        "question": "Is the proposed system compared to existing systems?"
    },
    {
        "question_id": "8c89f1d1b3c2a45c0254c4c8d6e700ab9a4b4ffb",
        "doc_id": "1703.10090",
        "question": "What sources of less sensitive data are available?"
    },
    {
        "question_id": "f5bc07df5c61dcb589a848bd36f4ce9c22abd46a",
        "doc_id": "1703.10090",
        "question": "Other than privacy, what are the other major ethical challenges in clinical data?"
    },
    {
        "question_id": "ed7ce13cd95f7664a5e4fc530dcf72dc3808dced",
        "doc_id": "1712.02555",
        "question": "Do they ensure that the architecture is differentiable everywhere after adding the Hungarian layer?"
    },
    {
        "question_id": "26eceba0e6e4c0b6dfa94e5708dd74b63f701731",
        "doc_id": "1712.02555",
        "question": "Which dataset(s) do they train on?"
    },
    {
        "question_id": "ff69b363ca604f80b2aa7afdc6a32d2ffd2d1f85",
        "doc_id": "1712.02555",
        "question": "By how much does their model outperform state-of-the-art baselines?"
    },
    {
        "question_id": "ac482ab8a5c113db7c1e5f106a5070db66e7ba37",
        "doc_id": "1808.09716",
        "question": "What set of semantic tags did they use?"
    },
    {
        "question_id": "24897f57e3b0550be1212c0d9ebfcf83bad4164e",
        "doc_id": "1808.09716",
        "question": "How much improvement did they see on the NLI task?"
    },
    {
        "question_id": "d20fd6330cb9d03734e2632166d6c8f780359a94",
        "doc_id": "1808.10059",
        "question": "How large is the improvement margin?"
    },
    {
        "question_id": "a913aa14d4e05cc9d658bf6697fe5b2652589b1b",
        "doc_id": "1907.01339",
        "question": "Which labeling scheme do they use?"
    },
    {
        "question_id": "b065a3f598560fdeba447f0a100dd6c963586268",
        "doc_id": "1907.01339",
        "question": "What parts of their multitask model are shared?"
    },
    {
        "question_id": "9d963d385bd495a7e193f8a498d64c1612e6c20c",
        "doc_id": "1907.01339",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "a1ac4463031bbc42c80893b57c0055b860f12e10",
        "doc_id": "1908.06893",
        "question": "What is their baseline?"
    },
    {
        "question_id": "3216dfc233be68206bd342407e2ba7da3843b31d",
        "doc_id": "1908.06893",
        "question": "Is human evaluation of the malicious content performed?"
    },
    {
        "question_id": "4f57ac24f3f4689a2f885715cd84b7d867fe3f12",
        "doc_id": "1908.06893",
        "question": "Do they compare to previous work?"
    },
    {
        "question_id": "b0e894536857cb249bd75188c3ca5a04e49ff0b6",
        "doc_id": "1906.01615",
        "question": "How do attention, recurrent and convolutional networks differ on the language classes they accept?"
    },
    {
        "question_id": "94c22f72665dfac3e6e72e40f2ffbc8c99bf849c",
        "doc_id": "1906.01615",
        "question": "What type of languages do they test LSTMs on?"
    },
    {
        "question_id": "2a9c7243744b42f1e9fed9ff2ab17c6f156b1ba4",
        "doc_id": "1804.05306",
        "question": "What was the baseline?"
    },
    {
        "question_id": "f8f64da7172e72e684f0e024a19411b43629ff55",
        "doc_id": "1804.05306",
        "question": "How many songs were collected?"
    },
    {
        "question_id": "fd556a038c36abc88a800d9d4f2cfa0aef6f5aba",
        "doc_id": "1912.00239",
        "question": "What is the percentage of human judgment agreement on the set?"
    },
    {
        "question_id": "9119fbfba84d298014d1b74e0e3d30330320002c",
        "doc_id": "1912.00239",
        "question": "Are the orders of case assignment biases motivated by frequency considerations?"
    },
    {
        "question_id": "058b6e3fdbb607fa7dbfc688628b3e13e130c35a",
        "doc_id": "1912.00239",
        "question": "Does the paper list other heuristic biases in the LSTMs?"
    },
    {
        "question_id": "5b95665d44666a1dc9e568d2471e5edf8614859f",
        "doc_id": "1912.00239",
        "question": "What are the performances of LSTMs and humans on the task?"
    },
    {
        "question_id": "d3d6a4a721b8bc9776f62759b8d9be1a19c6b0d2",
        "doc_id": "1710.01789",
        "question": "How long did the training take?"
    },
    {
        "question_id": "cc8f495cac0af12054c746a5b796e989ff0e5d5f",
        "doc_id": "1710.01789",
        "question": "Is the proposed model smaller or bigger than the conventional NMT system?"
    },
    {
        "question_id": "64c45fdb536ae294cf06716ac20d08b5fdb7944d",
        "doc_id": "1710.01789",
        "question": "Do they compare to state-of-the-art models?"
    },
    {
        "question_id": "182eb91090017a7c8ea38a88b219b641842664e4",
        "doc_id": "1901.09501",
        "question": "How do they obtain structured data?"
    },
    {
        "question_id": "0ef114d24a7a32821967e912dff23c016c4eab41",
        "doc_id": "1901.09501",
        "question": "Which prior approaches for style transfer do they test with?"
    },
    {
        "question_id": "67672648e7ebcbef18921006e2c8787966f8cdf2",
        "doc_id": "1901.09501",
        "question": "Which competing objectives for their unsupervised method do they use?"
    },
    {
        "question_id": "c32fc488f0527f330273263fa8956788bd071efc",
        "doc_id": "1901.09501",
        "question": "Which content coverage constraints do they design?"
    },
    {
        "question_id": "b9686a168366aafbab1737df426e031ad74a6284",
        "doc_id": "2004.00809",
        "question": "Do the authors offer a hypothesis for why Twitter data makes better predictions about the inventory of languages used in each country?"
    },
    {
        "question_id": "740cc392c0c8bfadfe6b3a60c0be635c03e17f2a",
        "doc_id": "2004.00809",
        "question": "What social media platforms are represented?"
    },
    {
        "question_id": "845bdcd900c0f96b2ae091d086fb1ab8bb1063f0",
        "doc_id": "2004.00809",
        "question": "Which websites were used in the web crawl?"
    },
    {
        "question_id": "8d1b6c88f06ee195d75af32ede85dbd6477c8497",
        "doc_id": "2004.00809",
        "question": "What countries and languages are represented in the datasets?"
    },
    {
        "question_id": "a9d5f83f4b32c52105f2ae1c570f1c590ac52487",
        "doc_id": "1909.08211",
        "question": "How do they split the dataset when training and evaluating their models?"
    },
    {
        "question_id": "288f0c003cad82b3db5e7231c189c0108ae7423e",
        "doc_id": "1909.08211",
        "question": "Do they demonstrate the relationship between veracity and stance over time in the Twitter dataset?"
    },
    {
        "question_id": "562a995dfc8d95777aa2a3c6353ee5cd4a9aeb08",
        "doc_id": "1909.08211",
        "question": "How much improvement does their model yield over previous methods?"
    },
    {
        "question_id": "0b10cfa61595b21bf3ff13b4df0fe1c17bbbf4e9",
        "doc_id": "1908.07721",
        "question": "How do they perform the joint training?"
    },
    {
        "question_id": "67104a5111bf8ea626532581f20b33b851b5abc1",
        "doc_id": "1908.07721",
        "question": "How many parameters does their model have?"
    },
    {
        "question_id": "1d40d177c5e410cef1142ec9a5fab9204db22ae1",
        "doc_id": "1908.07721",
        "question": "What is the previous model that achieved state-of-the-art?"
    },
    {
        "question_id": "3d662fb442d5fc332194770aac835f401c2148d9",
        "doc_id": "1911.09247",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "2280ed1e2b3e99921e2bca21231af43b58ca04f0",
        "doc_id": "1911.09247",
        "question": "What is the baseline method?"
    },
    {
        "question_id": "961a97149127e1123c94fbf7e2021eb1aa580ecb",
        "doc_id": "1911.09247",
        "question": "What aspects are used to judge question quality?"
    },
    {
        "question_id": "1e4f45c956dfb40fadb8e10d4c1bfafa8968be4d",
        "doc_id": "1911.09247",
        "question": "What did the human annotations consist of?"
    },
    {
        "question_id": "627ce8a1db08a732d5a8f7e1f8a72e3de89847e6",
        "doc_id": "1911.09247",
        "question": "What characterizes the 303 domains? e.g. is this different subject tags?"
    },
    {
        "question_id": "dae2f135e50d77867c3f57fc3cb0427b2443e126",
        "doc_id": "1909.10481",
        "question": "What languages do they use during pretraining?"
    },
    {
        "question_id": "38055717edf833566d912f14137b92a1d9c4f65a",
        "doc_id": "1909.10481",
        "question": "What is the architecture of the decoder?"
    },
    {
        "question_id": "b6aa5665c981e3b582db4760759217e2979d5626",
        "doc_id": "1909.10481",
        "question": "What is the architecture of the encoder?"
    },
    {
        "question_id": "c0355afc7871bf2e12260592873ffdb5c0c4c919",
        "doc_id": "1909.10481",
        "question": "What is their baseline?"
    },
    {
        "question_id": "9be9354eeb2bb1827eeb1e23a20cfdca59fb349a",
        "doc_id": "2002.03056",
        "question": "How does this system recommend features for the new application?"
    },
    {
        "question_id": "5d5c25d68988fa5effe546507c66997785070573",
        "doc_id": "2002.03056",
        "question": "What is the similarity of manually selected features across related applications in different domains?"
    },
    {
        "question_id": "ca595151735444b5b30a003ee7f3a7eb36917208",
        "doc_id": "2002.03056",
        "question": "What type of features are extracted with this language?"
    },
    {
        "question_id": "a2edd0454026811223b8f31512bdae91159677be",
        "doc_id": "2002.03056",
        "question": "What are meta elements of language for specifying NLP features?"
    },
    {
        "question_id": "00db191facf903cef18fb1727d1cab638c277e0a",
        "doc_id": "1906.05506",
        "question": "What sized character n-grams do they use?"
    },
    {
        "question_id": "1edfe390828f02a2db9a88454421c7f3d4cdd611",
        "doc_id": "1906.05506",
        "question": "Do they experiment with fine-tuning their embeddings?"
    },
    {
        "question_id": "3dad6b792044018bb968ac0d0fd4628653f9e4b7",
        "doc_id": "1906.05506",
        "question": "Which word embeddings do they compare against?"
    },
    {
        "question_id": "a28c73a6a8c46a43a1eec2b42b542dd7fde1e30e",
        "doc_id": "1906.05506",
        "question": "Which dataset do they evaluate on for headline generation?"
    },
    {
        "question_id": "5f1ffaa738fedd5b6668ec8b58a027ddea6867ce",
        "doc_id": "1906.05506",
        "question": "What results do their embeddings obtain on machine translation?"
    },
    {
        "question_id": "8e26c471ca0ee1b9779da04c0b81918fd310d0f3",
        "doc_id": "1906.05506",
        "question": "How do they combine ordinary word embeddings and ones constructed from character n-grams?"
    },
    {
        "question_id": "4cf05da602669a4c09c91ff5a1baae6e30adefdf",
        "doc_id": "1612.07486",
        "question": "Do they explore how their word representations vary across languages?"
    },
    {
        "question_id": "7380e62edcb11f728f6d617ee332dc8b5752b185",
        "doc_id": "1612.07486",
        "question": "Which neural language model architecture do they use?"
    },
    {
        "question_id": "f37b01e0c366507308fca44c20d3f69621b94a6e",
        "doc_id": "1612.07486",
        "question": "How do they show genetic relationships between languages?"
    },
    {
        "question_id": "ec70c7c560e08cff2820bad93f5216bc0a469f5a",
        "doc_id": "1908.11664",
        "question": "What settings did they experiment with?"
    },
    {
        "question_id": "940a16e9db8be5b5f4e67d9c7622b3df99ac10a5",
        "doc_id": "1908.11664",
        "question": "what domains are explored in this paper?"
    },
    {
        "question_id": "0b1cc6c0de286eb724b1fd18dbc93e67ab89a236",
        "doc_id": "1908.11664",
        "question": "what multi-domain dataset is repurposed?"
    },
    {
        "question_id": "1c2d4dc1e842b962c6407d6436f3dc73dd44ce55",
        "doc_id": "1908.11664",
        "question": "what four learning strategies are investigated?"
    },
    {
        "question_id": "1c8958ec50976a9b1088c51e8f73a767fb3973fa",
        "doc_id": "1801.04433",
        "question": "what rnn classifiers were used?"
    },
    {
        "question_id": "363d0cb0cd5c9a0b0364d61d95f2eff7347d5a36",
        "doc_id": "1801.04433",
        "question": "what results did their system obtain?"
    },
    {
        "question_id": "cf0b7d8a2449d04078f69ec9717a547adfb67d17",
        "doc_id": "1801.04433",
        "question": "what are the existing approaches?"
    },
    {
        "question_id": "25f699c7a33e77bd552782fb3886b9df9d02abb2",
        "doc_id": "2003.10564",
        "question": "What sources did they get the data from?"
    },
    {
        "question_id": "99f898eb91538cb82bc9a00892d54ae2a740961e",
        "doc_id": "1710.10380",
        "question": "Which downstream tasks are considered?"
    },
    {
        "question_id": "cf68906b7d96ca0c13952a6597d1f23e5184c304",
        "doc_id": "1710.10380",
        "question": "How long are the two unlabelled corpora?"
    },
    {
        "question_id": "c25014b7e57bb2949138d64d49f356d69838bc25",
        "doc_id": "1908.10001",
        "question": "What is the baseline?"
    },
    {
        "question_id": "25a8d432bf94af1662837877bc6c284e2fc3fbe2",
        "doc_id": "1908.10001",
        "question": "How is their NER model trained?"
    },
    {
        "question_id": "be632f0246c2e5f049d12e796812f496e083c33e",
        "doc_id": "1908.10001",
        "question": "Do they use pretrained word embeddings such as BERT?"
    },
    {
        "question_id": "415b42ef6ff92553d04bd44ed0cbf6b3d6c83e51",
        "doc_id": "1908.10001",
        "question": "How well does the system perform?"
    },
    {
        "question_id": "9da181ac8f2600eb19364c1b1e3cdeb569811a11",
        "doc_id": "1908.10001",
        "question": "Where does their information come from?"
    },
    {
        "question_id": "67f1b8a9f72e62cd74ec42e9631ef763a9b098c7",
        "doc_id": "1908.10001",
        "question": "What intents do they have?"
    },
    {
        "question_id": "371433bd3fb5042bacec4dfad3cfff66147c14f0",
        "doc_id": "1909.04387",
        "question": "How do data-driven models usually respond to abuse?"
    },
    {
        "question_id": "f64449a21c452bc5395a0f0a49fb49825e6385f4",
        "doc_id": "1909.04387",
        "question": "How much data did they gather from crowdsourcing?"
    },
    {
        "question_id": "3aeb25e334c8129b376f11c7077bcb2dd54f7e0e",
        "doc_id": "1909.04387",
        "question": "How many different strategies were evaluated?"
    },
    {
        "question_id": "49cd18448101da146c3187a44412628f8c722d7b",
        "doc_id": "1705.01306",
        "question": "Which Twitter sentiment treebank is used?"
    },
    {
        "question_id": "e9260f6419c35cbd74143f658dbde887ef263886",
        "doc_id": "1705.01306",
        "question": "Where did the system place in the other sub-tasks?"
    },
    {
        "question_id": "2834a340116026d5995e537d474a47d6a74c3745",
        "doc_id": "1705.01306",
        "question": "What were the five labels to be predicted in sub-task C?"
    },
    {
        "question_id": "a9eb8039431e2cb885cfcf96eb58c0675b36b3bd",
        "doc_id": "1908.05758",
        "question": "Is the dataset completely automatically generated?"
    },
    {
        "question_id": "998fa38634000f2d7b52d16518b9e18e898ce933",
        "doc_id": "1908.05758",
        "question": "Does the SESAME dataset include discontiguous entities?"
    },
    {
        "question_id": "a82686c054b96f214521e468b17f0435e6cdf7cf",
        "doc_id": "1908.05758",
        "question": "How big is the resulting SESAME dataset?"
    },
    {
        "question_id": "32f2aa2df0152050cbcd27dd2f408b2fa5894031",
        "doc_id": "1711.01567",
        "question": "Are there experiments with real data?"
    },
    {
        "question_id": "3eb107f35f4f5f5f527a93ffb487aa2e3fe51efd",
        "doc_id": "1805.07882",
        "question": "which pretrained embeddings were experimented with?"
    },
    {
        "question_id": "47d54a6dd50cab8dab64bfa1f9a1947a8190080c",
        "doc_id": "1805.07882",
        "question": "what datasets were used?"
    },
    {
        "question_id": "67cb001f8ca122ea859724804b41529fea5faeef",
        "doc_id": "1805.07882",
        "question": "what are the state of the art methods they compare with?"
    },
    {
        "question_id": "8de0e1fdcca81b49615a6839076f8d42226bf1fe",
        "doc_id": "1908.07888",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "909ecf675f874421eecc926a9f7486475aa1423c",
        "doc_id": "1908.07888",
        "question": "How do they use extracted intent to rescore?"
    },
    {
        "question_id": "29477c8e28a703cacb716a272055b49e2439a695",
        "doc_id": "1908.07888",
        "question": "Do they evaluate by how much does ASR improve compared to state-of-the-art just by using their FST?"
    },
    {
        "question_id": "af45ff2c4209f14235482329d0729864fb2bd4b0",
        "doc_id": "1911.12893",
        "question": "Which classifiers did they experiment with?"
    },
    {
        "question_id": "d2451d32c5a11a0eb8356a5e9d94a9231b59f198",
        "doc_id": "1911.12893",
        "question": "Is the distribution of the edits uniform across all languages?"
    },
    {
        "question_id": "90dde59e1857a0d2b1ee4615ab017fee0741f29f",
        "doc_id": "1911.12893",
        "question": "How did they identify what language the text was?"
    },
    {
        "question_id": "811b67460e65232b8f363dc3f329ffecdfcc4ab2",
        "doc_id": "1911.12893",
        "question": "Which repositories did they collect from?"
    },
    {
        "question_id": "68aa460ad357b4228b16b31b2cbec986215813bf",
        "doc_id": "1911.12893",
        "question": "Which three features do they use?"
    },
    {
        "question_id": "4542b162a5be00206fd14570898a7925cb267599",
        "doc_id": "1911.12893",
        "question": "Which languages are covered in the corpus?"
    },
    {
        "question_id": "da077b385d619305033785af5b204696d6145bd8",
        "doc_id": "1911.02747",
        "question": "Does the query-bag matching model use a neural network?"
    },
    {
        "question_id": "6d8a51e2790043497ed2637a1abc36bdffb39b71",
        "doc_id": "1911.02747",
        "question": "What datasets are used for experiments?"
    },
    {
        "question_id": "de4cc9e7fa5d700f5046d60789770f47911b3dd7",
        "doc_id": "1911.02747",
        "question": "Which natural language(s) is/are studied?"
    },
    {
        "question_id": "8ad5ebca2f69023b60ccfa3aac0ed426234437ac",
        "doc_id": "1911.02747",
        "question": "Is model compared to some baseline?"
    },
    {
        "question_id": "4afd4cfcb30433714b135b977baff346323af1e3",
        "doc_id": "1911.02747",
        "question": "What datasets are used in experiments?"
    },
    {
        "question_id": "184382af8f58031c6e357dbee32c90ec95288cb3",
        "doc_id": "2003.00864",
        "question": "What are state of the art results on OSA and PD corpora used for testing?"
    },
    {
        "question_id": "97abc2e7b39869f660986b91fc68be4ba196805c",
        "doc_id": "2003.00864",
        "question": "How much better do x-vectors perform than knowledge-based features in same-language corpora?"
    },
    {
        "question_id": "9ec0527bda2c302f4e82949cc0ae7f7769b7bfb8",
        "doc_id": "2003.00864",
        "question": "What is meant by domain mismatch occurring?"
    },
    {
        "question_id": "330fe3815f74037a9be93a4c16610c736a2a27b3",
        "doc_id": "2003.00864",
        "question": "How big are the OSA and PD corpora used for testing?"
    },
    {
        "question_id": "11dd2913d1517a1d47b367acb29fe9d79a9c95d1",
        "doc_id": "1908.05731",
        "question": "How many parameters does their noisy channel model have?"
    },
    {
        "question_id": "8701ec7345ccc2c35eca4e132a8e16d58585cd63",
        "doc_id": "1908.05731",
        "question": "Which language pairs do they evaluate on?"
    },
    {
        "question_id": "45e6532ac06a59cb6a90624513242b06d7391501",
        "doc_id": "1912.11637",
        "question": "What do they mean by explicit selection of most relevant segments?"
    },
    {
        "question_id": "a98ae529b47362f917a398015c8525af3646abf0",
        "doc_id": "1912.11637",
        "question": "What datasets did they use for evaluation?"
    },
    {
        "question_id": "bc7081aaa207de2362e0bea7bc8108d338aee36f",
        "doc_id": "1911.12559",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "c72e05dd41ed5a85335ffeca5a03e71514e60e84",
        "doc_id": "1911.12559",
        "question": "Where do the news texts come from?"
    },
    {
        "question_id": "07edc082eb86aecef3db5cad2534459c1310d6e8",
        "doc_id": "1911.12559",
        "question": "What baseline is used for this task?"
    },
    {
        "question_id": "eaacee4246f003d29a108fe857b5dd317287ecf1",
        "doc_id": "1911.12559",
        "question": "What type of neural keyphrase generation models are trained?"
    },
    {
        "question_id": "3ea82a5ca495ffbd1e30e8655aef1be4ba423efe",
        "doc_id": "1911.12559",
        "question": "How do the editors' annotations differ from those in existing datasets?"
    },
    {
        "question_id": "79258cea30cd6c0662df4bb712bf667589498a1f",
        "doc_id": "1707.07568",
        "question": "What method did the highest scoring team use?"
    },
    {
        "question_id": "8e5ce0d2635e7bdec4ba1b8d695cd06790c8cdaa",
        "doc_id": "1707.07568",
        "question": "What descriptive statistics are provided about the data?"
    },
    {
        "question_id": "4e568134c896c4616bc7ab4924686d8d59b57ea1",
        "doc_id": "1707.07568",
        "question": "What was the level of inter-annotator agreement?"
    },
    {
        "question_id": "55612e92791296baf18013d2c8dd0474f35af770",
        "doc_id": "1707.07568",
        "question": "What questions were asked in the annotation process?"
    },
    {
        "question_id": "2f23bd86a9e27dcd88007c9058ddfce78a1a377b",
        "doc_id": "1707.07568",
        "question": "Why is NER for tweets more challenging as the number of entities increases?"
    },
    {
        "question_id": "e0b8a2649e384bbdb17472f8da2c3df4134b1e57",
        "doc_id": "1707.07568",
        "question": "What data preparation steps were used to construct the dataset?"
    },
    {
        "question_id": "b210c3e48c15cdc8c47cf6f4b6eb1c29a1933654",
        "doc_id": "1604.02201",
        "question": "What high-resource language pair is the parent model trained on?"
    },
    {
        "question_id": "00341a46a67d31d36e6dc54d5297626319584891",
        "doc_id": "1604.02201",
        "question": "Did they use any regularization method to constrain the training?"
    },
    {
        "question_id": "d0dc6729b689561370b6700b892c9de8871bb44d",
        "doc_id": "1604.02201",
        "question": "How did they constrain training using the parameters?"
    },
    {
        "question_id": "46146ff3ef3430924e6b673a28df96ccb869dee4",
        "doc_id": "2002.12699",
        "question": "by how much did their model outperform the other models?"
    },
    {
        "question_id": "97708d93bccc832ea671dc31a76dad6a121fcd60",
        "doc_id": "1707.06875",
        "question": "Which metrics were considered?"
    },
    {
        "question_id": "f11856814a57b86667179e1e275e4f99ff1bcad8",
        "doc_id": "1707.06875",
        "question": "What NLG tasks were considered?"
    },
    {
        "question_id": "c3ce95658eea1e62193570955f105839de3d7e2d",
        "doc_id": "1911.03324",
        "question": "How does their BERT-based model work?"
    },
    {
        "question_id": "389cc454ac97609e9d0f2b2fe70bf43218dd8ba7",
        "doc_id": "1911.03324",
        "question": "How do they use Wikipedia to automatically collect a query-focused summarization dataset?"
    },
    {
        "question_id": "8c872236e4475d5d0969fb90d2df94589c7ab1c4",
        "doc_id": "1610.00479",
        "question": "Do they have an elementary unit of text?"
    },
    {
        "question_id": "f6ba0a5cfd5b35219efe5e52b0a5b86ae85c5abd",
        "doc_id": "1610.00479",
        "question": "By how much do they outperform existing text denoising models?"
    },
    {
        "question_id": "b21f61c0f95fefdb1bdb90d51cbba4655cd59896",
        "doc_id": "1610.00479",
        "question": "In their nonsymbolic representation can they represent two same string differently depending on the context?"
    },
    {
        "question_id": "0dbb5309d2be97f6eda29d7ae220aa16cafbabb7",
        "doc_id": "1610.00479",
        "question": "On which datasets do they evaluate their models?"
    },
    {
        "question_id": "e90425ac05a15dc145bbf3034e78b56e7cec36ac",
        "doc_id": "1803.09000",
        "question": "what dataset did they use?"
    },
    {
        "question_id": "b677952cabfec0150e028530d5d4d708d796eedc",
        "doc_id": "1803.09000",
        "question": "what was their model's f1 score?"
    },
    {
        "question_id": "d7799d26fe39302c4aff5b530aa691e8653fffe8",
        "doc_id": "1803.09000",
        "question": "what are the state of the art models?"
    },
    {
        "question_id": "23252644c04a043f630a855b563666dd57179d98",
        "doc_id": "2002.00175",
        "question": "What are the other two Vietnamese datasets?"
    },
    {
        "question_id": "2f75b0498cf6a1fc35f1fb1cac44fc2fbd3d7878",
        "doc_id": "2002.00175",
        "question": "Which English dataset do they evaluate on?"
    },
    {
        "question_id": "0d3193d17c0a4edc8fa9854f279c2a1b878e8b29",
        "doc_id": "2002.00175",
        "question": "What neural network models do they use in their evaluation?"
    },
    {
        "question_id": "b424ad7f9214076b963a0077d7345d7bb5a7a205",
        "doc_id": "2002.00175",
        "question": "Do they use crowdsourcing for the captions?"
    },
    {
        "question_id": "0dfe43985dea45d93ae2504cccca15ae1e207ccf",
        "doc_id": "2002.00175",
        "question": "What methods are used to build two other Vietnamese datasets?"
    },
    {
        "question_id": "8276671a4d4d1fbc097cd4a4b7f5e7fadd7b9833",
        "doc_id": "2002.00175",
        "question": "What deep neural network models are used in evaluation?"
    },
    {
        "question_id": "79885526713cc16eb734c88ff1169ae802cad589",
        "doc_id": "2002.00175",
        "question": "How do the authors evaluate datasets using models trained on different datasets?"
    },
    {
        "question_id": "101d7a355e8bf6d1860917876ee0b9971eae7a2f",
        "doc_id": "1611.04887",
        "question": "Do they report results only for English data?"
    },
    {
        "question_id": "4288621e960ffbfce59ef1c740d30baac1588b9b",
        "doc_id": "1611.04887",
        "question": "What conclusions do the authors draw from their experiments?"
    },
    {
        "question_id": "c3befe7006ca81ce64397df654c31c11482dafbe",
        "doc_id": "1611.04887",
        "question": "In what way does each classifier evaluate one of the syntactic or social properties which are salient for a tweet?"
    },
    {
        "question_id": "0a70af6ba334dfd3574991b1dd06f54fc6a700f2",
        "doc_id": "1910.01160",
        "question": "What nuances between fake news and satire were discovered?"
    },
    {
        "question_id": "98b97d24f31e9c535997e9b6cb126eb99fc72a90",
        "doc_id": "1910.01160",
        "question": "What empirical evaluation was used?"
    },
    {
        "question_id": "71b07d08fb6ac8732aa4060ae94ec7c0657bb1db",
        "doc_id": "1910.01160",
        "question": "What is the baseline?"
    },
    {
        "question_id": "812c974311747f74c3aad23999bfef50539953c8",
        "doc_id": "1910.01160",
        "question": "Which linguistic features are used?"
    },
    {
        "question_id": "180c7bea8caf05ca97d9962b90eb454be4176425",
        "doc_id": "1910.01160",
        "question": "What contextual language model is used?"
    },
    {
        "question_id": "da4d07645edaf7494a8cb5216150a00690da01f7",
        "doc_id": "1910.10670",
        "question": "What does the cache consist of?"
    },
    {
        "question_id": "c0cebef0e29b9d13c165b6f19f6ca8393348c671",
        "doc_id": "1910.10670",
        "question": "What languages is the model tested on?"
    },
    {
        "question_id": "5695908a8c6beb0e3863a1458a1b93aab508fd34",
        "doc_id": "1910.10670",
        "question": "What is a personalized language model?"
    },
    {
        "question_id": "415014a5bcd83df52c9307ad16fab1f03d80f705",
        "doc_id": "1605.05156",
        "question": "What syntactic and semantic features are proposed?"
    },
    {
        "question_id": "b79c85fa84712d3028cb5be2af873c634e51140e",
        "doc_id": "1605.05156",
        "question": "Which six speech acts are included in the taxonomy?"
    },
    {
        "question_id": "dc473819b196c0ea922773e173a6b283fa778791",
        "doc_id": "1605.05156",
        "question": "what classifier had better performance?"
    },
    {
        "question_id": "9207f19e65422bdf28f20e270ede6c725a38e5f9",
        "doc_id": "1605.05156",
        "question": "how many tweets were labeled?"
    },
    {
        "question_id": "8ddf78dbdc6ac964a7102ae84df18582841f2e3c",
        "doc_id": "1605.05156",
        "question": "how many annotators were there?"
    },
    {
        "question_id": "079e654c97508c521c07ab4d24cdaaede5602c61",
        "doc_id": "1605.05156",
        "question": "who labelled the tweets?"
    },
    {
        "question_id": "7efbd9adbc403de4be6b1fb1999dd5bed9d6262c",
        "doc_id": "1605.05156",
        "question": "what are the proposed semantic features?"
    },
    {
        "question_id": "95bbd91badbfe979899cca6655afc945ea8a6926",
        "doc_id": "1605.05156",
        "question": "what syntactic features are proposed?"
    },
    {
        "question_id": "76ae794ced3b5ae565f361451813f2f3bc85b214",
        "doc_id": "1605.05156",
        "question": "what datasets were used?"
    },
    {
        "question_id": "568466c62dd73a025bfd9643417cdb7a611f23a1",
        "doc_id": "1909.12016",
        "question": "Which data-selection algorithms do they use?"
    },
    {
        "question_id": "3a19dc6999aeb936d8a1c4509ebd5bfcda50f0f1",
        "doc_id": "1909.12016",
        "question": "How are the artificial sentences generated?"
    },
    {
        "question_id": "338a3758dccfa438a52d173fbe23a165ef74a0f0",
        "doc_id": "1909.12016",
        "question": "What domain is their test set?"
    },
    {
        "question_id": "b948bb86855b2c0bfc8fad88ff1e29cd94bb6ada",
        "doc_id": "1905.05644",
        "question": "what are the evaluation metrics used?"
    },
    {
        "question_id": "157284acedf13377cbc6d58c8f3648d3a62f5db5",
        "doc_id": "1905.05644",
        "question": "what other training procedures were explored?"
    },
    {
        "question_id": "dfd07a8e2de80c3a8d075a0f400fb13a1f1d4c60",
        "doc_id": "1710.07394",
        "question": "Did they try Roberta?"
    },
    {
        "question_id": "ee19fd54997f2eec7c87c7d4a2169026fe208285",
        "doc_id": "1909.02322",
        "question": "Do they compare to previous work?"
    },
    {
        "question_id": "74fcb741d29892918903702dbb145fef372d1de3",
        "doc_id": "1909.02322",
        "question": "What is the model trained?"
    },
    {
        "question_id": "de0d135b94ba3b3a4f4a0fb03df38a84f9dc9da4",
        "doc_id": "1909.02322",
        "question": "How large is the dataset used?"
    },
    {
        "question_id": "108f99fcaf620fab53077812e8901870896acf36",
        "doc_id": "2003.07568",
        "question": "What kind of evaluations do they use to evaluate dialogue?"
    },
    {
        "question_id": "6c8dc31a199b155e73c84173816c1e252137a0af",
        "doc_id": "2003.07568",
        "question": "By how much do their cross-lingual models lag behind other models?"
    },
    {
        "question_id": "7125db8334a7efaf9f7753f2c2f0048a56e74c49",
        "doc_id": "2003.07568",
        "question": "Which translation pipelines do they use to compare against?"
    },
    {
        "question_id": "43729be0effb5defc62bae930ceacf7219934f1e",
        "doc_id": "2003.07568",
        "question": "Which languages does their newly created dataset contain?"
    },
    {
        "question_id": "95083d486769b9b5e8c57fe2ef1b452fc3ea5012",
        "doc_id": "1602.07776",
        "question": "what state of the art models do they compare to?"
    },
    {
        "question_id": "6a20a3220c4edad758b912e2d3e5b99b0b295d96",
        "doc_id": "1805.04579",
        "question": "How exactly do they weigh between different statistical models?"
    },
    {
        "question_id": "c2745e44ebe7dd57126b784ac065f0b7fc2630f1",
        "doc_id": "1805.04579",
        "question": "Do they compare against state-of-the-art summarization approaches?"
    },
    {
        "question_id": "d5dcc89a08924bed9772bc431090cbb52fb7836f",
        "doc_id": "1805.04579",
        "question": "What showed to be the best performing combination of semantic and statistical model on the summarization task in terms of ROUGE score?"
    },
    {
        "question_id": "c9eae337edea0edb12030a7d4b01c3a3c73c16d3",
        "doc_id": "1806.04387",
        "question": "What evaluation was performed on the output?"
    },
    {
        "question_id": "9f1d81b2a6fe6835042a5229690e1951b97ff671",
        "doc_id": "1806.04387",
        "question": "Where did the joke data come from?"
    },
    {
        "question_id": "fae930129c2638ba6f9c9b3383e85aa130a73876",
        "doc_id": "1806.04387",
        "question": "What type of quotes is this system trying to generate?"
    },
    {
        "question_id": "7caeb5ef6f2985b2cf383cd01765d247c936605f",
        "doc_id": "1910.01992",
        "question": "What other model inference optimization schemes do the authors explore?"
    },
    {
        "question_id": "1fcd25e9a63a53451cac9ad2b8a1b529aff44a97",
        "doc_id": "1910.01992",
        "question": "On what dataset is model trained/tested?"
    },
    {
        "question_id": "9262292ca4cc78de515b5617f6a91e540eb2678c",
        "doc_id": "1705.10754",
        "question": "What discriminating features are discovered?"
    },
    {
        "question_id": "d796a251792eca01cea31ba5cf3e54ff9acf543f",
        "doc_id": "1705.10754",
        "question": "What results are obtained on the alternate datasets?"
    },
    {
        "question_id": "3d5b4aa1ce99903b1fcd257c1e394f7990431d13",
        "doc_id": "1810.13414",
        "question": "what ontologies did they use?"
    },
    {
        "question_id": "fcf9377fc3fce529d4bab1258db3f46b15ae5872",
        "doc_id": "1911.09709",
        "question": "Which works better according to human evaluation, the concurrent or the modular system?"
    },
    {
        "question_id": "5422a3f2a083395416d6f99c57d28335eb2e44e1",
        "doc_id": "1911.09709",
        "question": "Were the Wikipedia edits that removed framings, presuppositions and attitudes from biased sentences a Wiki community effort, or were annotators trained to do it?"
    },
    {
        "question_id": "7c2d6bc913523d77e8fdc82c60598ee95b445d84",
        "doc_id": "1911.09709",
        "question": "How is subjective text automatically neutralized?"
    },
    {
        "question_id": "89414ef7fcb2709c47827f30a556f543b9a9e6e0",
        "doc_id": "1908.08917",
        "question": "How does this research compare to research going on in the US and USSR at this time?"
    },
    {
        "question_id": "faffcc6ef27c1441e6528f924e320368430d8da3",
        "doc_id": "1908.08917",
        "question": "What is the reason this research was not adopted in the 1960s?"
    },
    {
        "question_id": "afad388a0141bdda5ca9586803ac53d5f10f41f6",
        "doc_id": "1908.08917",
        "question": "What is included in the cybernetic methods mentioned?"
    },
    {
        "question_id": "baaa6ad7148b785429a20f38786cd03ab9a2646e",
        "doc_id": "1908.08917",
        "question": "What were the usual logical approaches of the time period?"
    },
    {
        "question_id": "de346decb1fbca8746b72c78ea9d1208902f5e0a",
        "doc_id": "1908.08917",
        "question": "What language was this research published in?"
    },
    {
        "question_id": "0715d510359eb4c851cf063c8b3a0c61b8a8edc0",
        "doc_id": "1906.00424",
        "question": "What is the extractive technique used for summarization?"
    },
    {
        "question_id": "4e106b03cc2f54373e73d5922e97f7e5e9bf03e4",
        "doc_id": "1906.00424",
        "question": "How big is the dataset?"
    },
    {
        "question_id": "c7ffef8bf0100eb6148bd932d0409b21759060b1",
        "doc_id": "1707.06519",
        "question": "Which datasets do they use?"
    },
    {
        "question_id": "1ff0ffeb2d0b2e150abdb2f559d8b31f4dd8aa2c",
        "doc_id": "1707.06519",
        "question": "How do they compare representations performance obtained from a naive encoder versus ones learned from large amount of source language data?"
    },
    {
        "question_id": "3cc0d773085dc175b85955e95911a2cfaab2cdc4",
        "doc_id": "1707.06519",
        "question": "Which pairs of languages do they consider similar enough to capture phonetic structure?"
    },
    {
        "question_id": "ead7704a64447dccd504951618d3be463eba86bf",
        "doc_id": "1807.03674",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "8476d0bf5962f4ed619a7b87415ebe28c38ce296",
        "doc_id": "1807.03674",
        "question": "Do they use machine learning?"
    },
    {
        "question_id": "bbfe7e131ed776c85f2359b748db1325386c1af5",
        "doc_id": "1807.03674",
        "question": "What are the ICD-10 codes?"
    },
    {
        "question_id": "7546125f43eec5b09a3368c95019cb2bf1478255",
        "doc_id": "1605.04278",
        "question": "How do they think this treebank will support research on second language acquisition?"
    },
    {
        "question_id": "e96b0d64c8d9fdd90235c499bf1ec562d2cbb8b2",
        "doc_id": "1605.04278",
        "question": "What are their baseline models?"
    },
    {
        "question_id": "576a3ed6e4faa4c3893db632e97a52ac6e864aac",
        "doc_id": "1605.04278",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "73c535a7b46f0c2408ea2b1da0a878b376a2bca5",
        "doc_id": "1605.04278",
        "question": "Did they use crowdsourcing to annotate the dataset?"
    },
    {
        "question_id": "e057fa254ea7a4335de22fd97a0f08814b88aea4",
        "doc_id": "1804.08000",
        "question": "What is the architecture of the model?"
    },
    {
        "question_id": "134a66580c363287ec079f353ead8f770ac6d17b",
        "doc_id": "1804.08000",
        "question": "What fine-grained semantic types are considered?"
    },
    {
        "question_id": "610fc593638c5e9809ea9839912d0b282541d42d",
        "doc_id": "1804.08000",
        "question": "What hand-crafted features do other approaches use?"
    },
    {
        "question_id": "52faf319e37aa15fff1ab47f634a5a584dc42e75",
        "doc_id": "1708.00549",
        "question": "What types of commonsense knowledge are they talking about?"
    },
    {
        "question_id": "0c7cb3010ed92b8d46583a67e72946a6c0115f1f",
        "doc_id": "1708.00549",
        "question": "What do they mean by intrinsic geometry of spaces of learned representations?"
    },
    {
        "question_id": "cd82bdaa0c94330f8cccfb1c59b4e6761a5a4f4d",
        "doc_id": "1911.01214",
        "question": "what crowdsourcing platform did they use?"
    },
    {
        "question_id": "753a187c1dd8d96353187fbb193b5f86293a796c",
        "doc_id": "1911.01214",
        "question": "did they crowdsource annotations?"
    },
    {
        "question_id": "29794bda61665a1fbe736111e107fd181eacba1b",
        "doc_id": "1911.01214",
        "question": "where does their data come from?"
    },
    {
        "question_id": "dd80a38e578443496d3720d883ad194ce82c5f39",
        "doc_id": "1911.01214",
        "question": "which existing corpora do they compare with?"
    },
    {
        "question_id": "9a9774eacb8f75bcfa07a4e60ed5eb02646467e3",
        "doc_id": "1911.01214",
        "question": "what is the size of their corpus?"
    },
    {
        "question_id": "4ed58d828cd6bb9beca1471a9fa9f5e77488b1d1",
        "doc_id": "1911.01214",
        "question": "which architectures did they experiment with?"
    },
    {
        "question_id": "de580e43614ee38d2d9fc6263ff96e6ca2b54eb5",
        "doc_id": "1911.01214",
        "question": "what domains are present in the corpus?"
    },
    {
        "question_id": "ae89eed483c11ccd70a34795e9fe416af8a35da2",
        "doc_id": "1911.01214",
        "question": "what was the inter-annotator agreement?"
    },
    {
        "question_id": "3d73cb92d866448ec72a571331967da5d34dfbb1",
        "doc_id": "1910.09916",
        "question": "What language model is trained?"
    },
    {
        "question_id": "708f5f83a3c356b23b27a9175f5c35ac00cdf5db",
        "doc_id": "1910.09916",
        "question": "What machine learning models are considered?"
    },
    {
        "question_id": "9240ee584d4354349601aeca333f1bc92de2165e",
        "doc_id": "1910.09916",
        "question": "What is the agreement of the dataset?"
    },
    {
        "question_id": "b1d255f181b18f7cf8eb3dd2369a082a2a398b7b",
        "doc_id": "1802.09059",
        "question": "How long is their dataset?"
    },
    {
        "question_id": "4ae0b50c88a174cfc283b90cd3c9407de13fd370",
        "doc_id": "1802.09059",
        "question": "Do they use pretrained word embeddings?"
    },
    {
        "question_id": "a18d74109ed55ed14c33913efa62e12f207279c0",
        "doc_id": "1802.09059",
        "question": "How many layers does their model have?"
    },
    {
        "question_id": "1d6d21043b9fd0ed3ccccdc6317dcf5a1347ef03",
        "doc_id": "1802.09059",
        "question": "What metrics do they use?"
    },
    {
        "question_id": "d97843afec733410d2c580b4ec98ebca5abf2631",
        "doc_id": "1607.00167",
        "question": "What is the timeframe of the current events?"
    },
    {
        "question_id": "813a8156f9ed8ead53dda60ef54601f6ca8076e9",
        "doc_id": "1607.00167",
        "question": "What model was used for sentiment analysis?"
    },
    {
        "question_id": "dd807195d10c492da2b0da8b2c56b8f7b75db20e",
        "doc_id": "1607.00167",
        "question": "How many tweets did they look at?"
    },
    {
        "question_id": "aa287673534fc05d8126c8e3486ca28821827034",
        "doc_id": "1607.00167",
        "question": "What language are the tweets in?"
    },
    {
        "question_id": "78f8dad0f1acf024f69b3218b2d204b8019bb0d2",
        "doc_id": "1901.05415",
        "question": "how is user satisfaction estimated?"
    },
    {
        "question_id": "73a5783cad4ed468a8dbb31b5de2c618ce351ad1",
        "doc_id": "1901.05415",
        "question": "by how much did performance improve?"
    },
    {
        "question_id": "803babb71e1bdaf507847d6c712585f4128e9f47",
        "doc_id": "1910.14589",
        "question": "what baseline models are trained?"
    },
    {
        "question_id": "5fd112980d0dd7f7ce30e6273fe6e7b230b13225",
        "doc_id": "1910.14589",
        "question": "what dataset was used?"
    },
    {
        "question_id": "eaae11ffd4ff955de2cd6389b888f5fd2c660a32",
        "doc_id": "1910.14589",
        "question": "what are the human evaluation metrics?"
    },
    {
        "question_id": "290ebf0d1c49b67a6d1858366be751d89086a78b",
        "doc_id": "1910.14589",
        "question": "what automatic evaluation is performed?"
    },
    {
        "question_id": "806fefe0e331ddb3c17245d6a9fa7433798e367f",
        "doc_id": "1910.14589",
        "question": "what are the existing online systems?"
    },
    {
        "question_id": "1fa9b6300401530738995f14a37e074c48bc9fd8",
        "doc_id": "1809.03695",
        "question": "In what language are the captions written in?"
    },
    {
        "question_id": "9d98975ab0b75640b2c83e29e1438c76a959fbde",
        "doc_id": "1809.03695",
        "question": "What is the average length of the captions?"
    },
    {
        "question_id": "cc8bcea4052bf92f249dda276acc5fd16cac6fb4",
        "doc_id": "1809.03695",
        "question": "Does each image have one caption?"
    },
    {
        "question_id": "35f48b8f73728fbdeb271b170804190b5448485a",
        "doc_id": "1809.03695",
        "question": "What is the size of the dataset?"
    },
    {
        "question_id": "16edc21a6abc89ee2280dccf1c867c2ac4552524",
        "doc_id": "1809.03695",
        "question": "What is the source of the images and textual captions?"
    },
    {
        "question_id": "17fd6deb9e10707f9d1b70165dedb045e1889aac",
        "doc_id": "1908.11053",
        "question": "What are their evaluation metrics?"
    },
    {
        "question_id": "c4a3f270e942803dab9b40e5e871a2e8886ce444",
        "doc_id": "1908.11053",
        "question": "Are their formal queries tree-structured?"
    },
    {
        "question_id": "1faccdc78bbd99320c160ac386012720a0552119",
        "doc_id": "1908.11053",
        "question": "What knowledge base do they rely on?"
    },
    {
        "question_id": "804466848f4fa1c552f0d971dce226cd18b9edda",
        "doc_id": "1908.11053",
        "question": "How do they recover from noisy entity linking?"
    },
    {
        "question_id": "8d683d2e1f46626ceab60ee4ab833b50b346c29e",
        "doc_id": "1908.11053",
        "question": "What datasets do they evaluate on?"
    },
    {
        "question_id": "049415676f8323f4af16d349f36fbcaafd7367ae",
        "doc_id": "2003.03728",
        "question": "By how much do they improve on domain classification?"
    },
    {
        "question_id": "fee498457774d9617068890ff29528e9fa05a2ac",
        "doc_id": "2003.03728",
        "question": "Which dataset do they evaluate on?"
    },
    {
        "question_id": "c626637ed14dee3049b87171ddf326115e59d9ee",
        "doc_id": "2003.03728",
        "question": "How does their approach work for domains with few overlapping utterances? "
    },
    {
        "question_id": "b160bfb341f24ae42a268aa18641237a4b3a6457",
        "doc_id": "2003.03728",
        "question": "How do they decide by how much to decrease confidences of incorrectly predicted domains?"
    },
    {
        "question_id": "fcdbaa08cccda9968f3fd433c99338cc60f596a7",
        "doc_id": "1611.04234",
        "question": "What is F-score obtained?"
    },
    {
        "question_id": "2e4688205c8e344cded7a053b6014cce04ef1bd5",
        "doc_id": "1611.04234",
        "question": "What is the state-of-the-art?"
    },
    {
        "question_id": "fc436a4f3674e42fb280378314bfe77ba0c99f2e",
        "doc_id": "1611.04234",
        "question": "Which Chinese social media platform does the data come from?"
    },
    {
        "question_id": "a71fb012631e6a8854d5945b6d0ab2ab8e7b7ee6",
        "doc_id": "1611.04234",
        "question": "What dataset did they use?"
    },
    {
        "question_id": "ed15a593d64a5ba58f63c021ae9fd8f50051a667",
        "doc_id": "2001.05540",
        "question": "Is this model trained in unsuperized manner?"
    },
    {
        "question_id": "e86fb784011de5fda6ff8ccbe4ee4deadd7ee7d6",
        "doc_id": "2001.05540",
        "question": "How much is BELU score difference between proposed approach and insertion-only method?"
    },
    {
        "question_id": "267d70d9f3339c56831ea150d2213643fbc5129b",
        "doc_id": "1805.11850",
        "question": "What is the performance of NJM?"
    },
    {
        "question_id": "477da8d997ff87400c6aad19dcc74f8998bc89c3",
        "doc_id": "1805.11850",
        "question": "How are the results evaluated?"
    },
    {
        "question_id": "4485e32052741972877375667901f61e602ec4de",
        "doc_id": "1805.11850",
        "question": "How big is the self-collected corpus?"
    },
    {
        "question_id": "df4895c6ae426006e75c511a304d56d37c42a1c7",
        "doc_id": "1805.11850",
        "question": "How is the funny score calculated?"
    },
    {
        "question_id": "47ffc9811b037613c9c4d1ec1e4f13c08396ed1c",
        "doc_id": "1603.03876",
        "question": "What datasets are used?"
    },
    {
        "question_id": "7b76b8b69246525a48c0a8ca0c42db3319cd10a5",
        "doc_id": "1704.08390",
        "question": "What size ngram models performed best? e.g. bigram, trigram, etc."
    },
    {
        "question_id": "8b1af67e3905244653b4cf66ba0acec8d6bff81f",
        "doc_id": "1704.08390",
        "question": "How were the ngram models used to generate predictions on the data?"
    },
    {
        "question_id": "9a7aeecbecf5e30ffa595c233fca31719c9b429f",
        "doc_id": "1704.08390",
        "question": "What package was used to build the ngram language models?"
    },
    {
        "question_id": "3605ea281e72e9085a0ac0a7270cef25fc23063f",
        "doc_id": "1704.08390",
        "question": "What rank did the language model system achieve in the task evaluation?"
    },
    {
        "question_id": "21f6cb3819c85312364dd17dd4091df946591ef0",
        "doc_id": "1704.08390",
        "question": "What were subtasks A and B?"
    },
    {
        "question_id": "143409d16125790c8db9ed38590a0796e0b2b2e2",
        "doc_id": "1805.06648",
        "question": "What dimensions do the considered embeddings have?"
    },
    {
        "question_id": "8ba582939823faae6822a27448ea011ab6b90ed7",
        "doc_id": "1805.06648",
        "question": "How are global structures considered?"
    },
    {
        "question_id": "946d7c877d363f549f84e9500c852dce70ae5d36",
        "doc_id": "1911.01940",
        "question": "How many GPUs do they use for this task?"
    },
    {
        "question_id": "26e32f24fe0c31ef25de78935daa479534b9dd58",
        "doc_id": "1911.01940",
        "question": "Do they use all the hidden layer representations?"
    },
    {
        "question_id": "573b8b1ad919d3fd0ef7df84e55e5bfd165b3e84",
        "doc_id": "1909.07873",
        "question": "Do they manually check all adversarial examples that fooled some model for potential valid examples?"
    },
    {
        "question_id": "07d98dfa88944abd12acd45e98fb7d3719986aeb",
        "doc_id": "1909.07873",
        "question": "Are all generated examples semantics-preserving perturbations to the original text?"
    },
    {
        "question_id": "3a40559e5a3c2a87c7b9031c89e762b828249c05",
        "doc_id": "1909.07873",
        "question": "What is success rate of fooling tested models in experiments?"
    },
    {
        "question_id": "5db47bbb97282983e10414240db78154ea7ac75f",
        "doc_id": "1909.07873",
        "question": "What models are able to be fooled for AG's news corpus news categorization task by this approach?"
    },
    {
        "question_id": "c589d83565f528b87e355b9280c1e7143a42401d",
        "doc_id": "1909.07873",
        "question": "What models are able to be fooled for IMDB sentiment classification task by this approach?"
    },
    {
        "question_id": "7f90e9390ad58b22b362a57330fff1c7c2da7985",
        "doc_id": "1909.07873",
        "question": "Do they use already trained model on some task in their reinforcement learning approach?"
    },
    {
        "question_id": "3e3e45094f952704f1f679701470c3dbd845999e",
        "doc_id": "1909.07873",
        "question": "How does proposed reinforcement learning based approach generate adversarial examples in black-box settings?"
    },
    {
        "question_id": "19578949108ef72603afe538059ee55b4dee0751",
        "doc_id": "1906.01749",
        "question": "Do they use pretrained embeddings in their model?"
    },
    {
        "question_id": "44435fbd4087fa711835d267036b6a1f82336a22",
        "doc_id": "1906.01749",
        "question": "What results are obtained by their model?"
    },
    {
        "question_id": "86656aae3c27c6ea108f5600dd09ab7e421d876a",
        "doc_id": "1906.01749",
        "question": "What sources do the news come from?"
    },
    {
        "question_id": "22488c8628b6db5fd708b6471c31a8eac31f66df",
        "doc_id": "1906.01749",
        "question": "What is the size of Multi-news dataset?"
    },
    {
        "question_id": "afeceee343360d3fe715f405dac7760d9a6754a7",
        "doc_id": "1805.04833",
        "question": "What human evaluation metrics do they look at?"
    },
    {
        "question_id": "cc3dd701f3a674618de95a4196e9c7f4c8fbf1e5",
        "doc_id": "1805.04833",
        "question": "Which automated evaluation metrics are used?"
    },
    {
        "question_id": "d66550f65484696c1284903708b87809ea705786",
        "doc_id": "1805.04833",
        "question": "What baselines do they compare against?"
    },
    {
        "question_id": "29ba93bcd99c2323d04d4692d3672967cca4915e",
        "doc_id": "1805.04833",
        "question": "Do they use pre-trained embeddings like BERT?"
    },
    {
        "question_id": "804bf5adc6dc5dd52f8079cf041ed3a710e03f8a",
        "doc_id": "1805.04833",
        "question": "What model is used to generate the premise?"
    },
    {
        "question_id": "f2dba5bf75967407cce5d0a9c2618269225081f5",
        "doc_id": "1805.04833",
        "question": "Are the stories in the dataset fictional stories?"
    },
    {
        "question_id": "b783ec5cb9ad595da7db2c0ddf871152ae382c5f",
        "doc_id": "1805.04833",
        "question": "Where are the stories collected from?"
    },
    {
        "question_id": "ae2142ee9e093ce485025168f4bcb3da4602739d",
        "doc_id": "1810.02268",
        "question": "did they collect their own contrastive test set?"
    },
    {
        "question_id": "ebe1084a06abdabefffc66f029eeb0b69f114fd9",
        "doc_id": "1810.02268",
        "question": "what are the baselines?"
    },
    {
        "question_id": "cfdd583d01abaca923f5c466bb20e1d4b8c749ff",
        "doc_id": "1810.02268",
        "question": "what context aware models were experimented?"
    },
    {
        "question_id": "554d798e4ce58fd30820200c474d7e796dc8ba89",
        "doc_id": "1810.02268",
        "question": "what languages did they experiment on?"
    },
    {
        "question_id": "800fcd8b08d36c5276f9e5e1013208d41b46de59",
        "doc_id": "1805.04558",
        "question": "why do they think sentiment features do not result in improvement?"
    },
    {
        "question_id": "cdbbba22e62bc9402aea74ac5960503f59e984ff",
        "doc_id": "1805.04558",
        "question": "what was the size of the datasets?"
    },
    {
        "question_id": "301a453abaa3bc15976817fefce7a41f3b779907",
        "doc_id": "1805.04558",
        "question": "what were the evaluation metrics?"
    },
    {
        "question_id": "f3673f6375f065014e8e4bb8c7adf54c1c7d7862",
        "doc_id": "1805.04558",
        "question": "what were their results on both tasks?"
    },
    {
        "question_id": "0bd3bea892c34a3820e98c4a42cdeda03753146b",
        "doc_id": "1805.04558",
        "question": "what domain-specific features did they train on?"
    },
    {
        "question_id": "8cf5abf0126f19253930478b02f0839af28e4093",
        "doc_id": "1805.04558",
        "question": "what are the sentiment features used?"
    },
    {
        "question_id": "d211a37830c59aeab4970fdb2e03d9b7368b421c",
        "doc_id": "1805.04558",
        "question": "what surface-form features were used?"
    },
    {
        "question_id": "83db51da819adf6faeb950fe04b4df942a887fb5",
        "doc_id": "1809.08899",
        "question": "what dataset is used?"
    },
    {
        "question_id": "7e7471bc24970c6f23baff570be385fd3534926c",
        "doc_id": "1809.08899",
        "question": "what neural network models are used?"
    },
    {
        "question_id": "ec5e84a1d1b12f7185183d165cbb5eae66d9833e",
        "doc_id": "1809.08899",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "7f958017cbb08962c80e625c2fd7a1e2375f27a3",
        "doc_id": "1809.08899",
        "question": "What baseline model is used?"
    },
    {
        "question_id": "4130651509403becc468bdbe973e63d3716beade",
        "doc_id": "1809.08899",
        "question": "What type of neural network models are used?"
    },
    {
        "question_id": "6edef748370e63357a57610b5784204c9715c0b4",
        "doc_id": "1809.08899",
        "question": "How is validity identified and what metric is used to quantify it?"
    },
    {
        "question_id": "6b302280522c350c4d1527d8c6ebc5b470f9314c",
        "doc_id": "1809.08899",
        "question": "How is severity identified and what metric is used to quantify it?"
    },
    {
        "question_id": "7da138ec43a88ea75374c40e8491f7975db29480",
        "doc_id": "1809.08899",
        "question": "How is urgency identified and what metric is used to quantify it?"
    },
    {
        "question_id": "8e113fd9661bc8af97e30c75a20712f01fc4520a",
        "doc_id": "1911.10401",
        "question": "What are the baseline models?"
    },
    {
        "question_id": "35e0e6f89b010f34cfb69309b85db524a419c862",
        "doc_id": "1911.10401",
        "question": "How are the three different forms defined in this work?"
    },
    {
        "question_id": "992e67f706c728bc0e534f974c1656da10e7a724",
        "doc_id": "1911.10401",
        "question": "What datasets are used for training and testing?"
    },
    {
        "question_id": "61e96abdc924c34c6b82a587168ea3d14fe792d1",
        "doc_id": "1911.10401",
        "question": "Does approach handle overlapping forms (e.g., metaphor and irony)?"
    },
    {
        "question_id": "ee8a77cddbe492c686f5af3923ad09d401a741b5",
        "doc_id": "1911.10401",
        "question": "Does this work differentiate metaphor(technique) from irony and sarcasm (purpose)? "
    },
    {
        "question_id": "4a201b8b9cc566b56aedb5ab45335f202bc41845",
        "doc_id": "1803.07828",
        "question": "What is the new metric?"
    },
    {
        "question_id": "6a90135bd001be69a888076aff1b149b78adf443",
        "doc_id": "1803.07828",
        "question": "How long do other state-of-the-art models take to process the same amount of data?"
    },
    {
        "question_id": "1f40adc719d8ccda81e7e90525b577f5698b5aad",
        "doc_id": "1803.07828",
        "question": "What context is used when computing the embedding for an entity?"
    },
    {
        "question_id": "59f41306383dd6e201bded0f1c7c959ec4f61c5a",
        "doc_id": "1808.07733",
        "question": "What logic rules can be learned using ELMo?"
    },
    {
        "question_id": "b3432f52af0b95929e6723dd1f01ce029d90a268",
        "doc_id": "1808.07733",
        "question": "Does Elmo learn all possible logic rules?"
    },
    {
        "question_id": "b537832bba2eb6d34702a9d71138e661c05a7c3a",
        "doc_id": "1906.01076",
        "question": "What text classification tasks are considered?"
    },
    {
        "question_id": "1002bd01372eba0f3078fb4a951505278ed45f2e",
        "doc_id": "1906.01076",
        "question": "Do they compare against other models?"
    },
    {
        "question_id": "3450723bf66956486de777f141bde5073e4a7694",
        "doc_id": "1906.01076",
        "question": "What is episodic memory?"
    },
    {
        "question_id": "78c7318b2218b906a67d8854f3e511034075f79a",
        "doc_id": "1909.03087",
        "question": "Which dialogue data do they use to evaluate on?"
    },
    {
        "question_id": "697c5d2ba7e019ddb91a1de5031a90fe741f2468",
        "doc_id": "1909.03087",
        "question": "How much faster are pairwise annotations than other annotations?"
    },
    {
        "question_id": "7561bd3b8ba7829b3a01ff07f9f3e93a7b8869cc",
        "doc_id": "2002.06851",
        "question": "What extractive models were trained on this dataset?"
    },
    {
        "question_id": "a3ba21341f0cb79d068d24de33b23c36fa646752",
        "doc_id": "2002.06851",
        "question": "What abstractive models were trained?"
    },
    {
        "question_id": "96295e1fe8713417d2b4632438a95d23831fbbdc",
        "doc_id": "2002.06851",
        "question": "Do the reviews focus on a specific video game domain?"
    },
    {
        "question_id": "5bfbc9ca7fd41be9627f6ef587bb7e21c7983be0",
        "doc_id": "2002.06851",
        "question": "What is the size of this dataset?"
    },
    {
        "question_id": "2fec84a62b4028bbe6500754d9c058eefbc24d9a",
        "doc_id": "2004.04060",
        "question": "What is the performance of large state-of-the-art models on these datasets?"
    },
    {
        "question_id": "2803709fba74e6098aae145abcbf0e9a3f4c35e5",
        "doc_id": "2004.04060",
        "question": "What is used as a baseline model?"
    },
    {
        "question_id": "ec39120fb879ae10452d3f244e1e32237047005a",
        "doc_id": "2004.04060",
        "question": "How do they build gazetter resources from Wikipedia knowlege base?"
    },
    {
        "question_id": "7d539258b948cd5b5ad1230a15e4b739f29ed947",
        "doc_id": "2003.08437",
        "question": "What inter-annotator agreement did they obtain?"
    },
    {
        "question_id": "9c1f70affc87024b4280f0876839309b8dddd579",
        "doc_id": "2003.08437",
        "question": "How did they annotate the corpus?"
    },
    {
        "question_id": "2694a679a703ccd6139897e4d9ff8e053dabd0f2",
        "doc_id": "2003.08437",
        "question": "What is the size of the corpus?"
    },
    {
        "question_id": "9a6bf1d481e6896eef9f8fed835d9d29658ede36",
        "doc_id": "1911.06919",
        "question": "Is proposed approach compared to some baselines?"
    },
    {
        "question_id": "4999da863ecbd40378505bfb1f4e395061a3f559",
        "doc_id": "1911.06919",
        "question": "What datasets are used for this tasks?"
    },
    {
        "question_id": "3098793595252039f363ee1150d4ea956f2504b8",
        "doc_id": "1911.06919",
        "question": "How big are improvements on these tasks?"
    },
    {
        "question_id": "55e3daecaf8030ed627e037992402dd0a7dd67ff",
        "doc_id": "1709.06136",
        "question": "By how much do they improve upon supervised traning methods?"
    },
    {
        "question_id": "5522a9eeb06221722052e3c38f9b0d0dbe7c13e6",
        "doc_id": "1709.06136",
        "question": "Do they jointly optimize both agents?"
    },
    {
        "question_id": "30870a962cf88ac8c8e6b7b795936fd62214f507",
        "doc_id": "1709.06136",
        "question": "Which neural network architecture do they use for the dialog agent and user simulator?"
    },
    {
        "question_id": "7ece07a84635269bb19796497847e4517d1e3e61",
        "doc_id": "1709.06136",
        "question": "Do they create the basic dialog agent and basic user simulator separately?"
    },
    {
        "question_id": "468eb961215a554ace8088fa9097a7ad239f2d71",
        "doc_id": "2004.04478",
        "question": "What datasets are available for CDSA task?"
    },
    {
        "question_id": "57d07d2b509c5860880583efe2ed4c5620a96747",
        "doc_id": "2004.04478",
        "question": "What two novel metrics proposed?"
    },
    {
        "question_id": "d126d5d6b7cfaacd58494f1879547be9e91d1364",
        "doc_id": "2004.04478",
        "question": "What similarity metrics have been tried?"
    },
    {
        "question_id": "7dca806426058d59f4a9a4873e9219d65aea0987",
        "doc_id": "2004.04478",
        "question": "What 20 domains are available for selection of source domain?"
    },
    {
        "question_id": "2f142cd11731d29d0c3fa426e26ef80d997862e0",
        "doc_id": "1909.01720",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "ce23849e9e9a22626965f1ca8ca948a5c87280e9",
        "doc_id": "1909.01720",
        "question": "What are the hyperparameter setting of the MTL model?"
    },
    {
        "question_id": "d9a45fea8539aac01dec01f29b7d04b44b9c2ca6",
        "doc_id": "1909.01720",
        "question": "What architecture does the rest of the multi-task learning setup use?"
    },
    {
        "question_id": "246e924017c48fa1f069361c44133fdf4f0386e1",
        "doc_id": "1909.01720",
        "question": "How is the selected sharing layer trained?"
    },
    {
        "question_id": "065623cc1d5f5b19ec1f84d286522fc2f805c6ce",
        "doc_id": "1706.04206",
        "question": "What supervised machine learning models do they use?"
    },
    {
        "question_id": "5c17559749810c67c50a7dbe34580d5e3b4f9acb",
        "doc_id": "1706.04206",
        "question": "Does the supervised machine learning approach outperform previous work?"
    },
    {
        "question_id": "1c0a575e289eb486d3e6375d6f783cc2bf18adf9",
        "doc_id": "1706.04206",
        "question": "How large is the released data set?"
    },
    {
        "question_id": "4efe0d62bba618803ec12b63f32debb8b757dd68",
        "doc_id": "1706.04206",
        "question": "What is an example of a condition-action pair?"
    },
    {
        "question_id": "deb0c3524a3b3707e8b20abd27f54ad6188d6e4e",
        "doc_id": "1607.07514",
        "question": "did they experiment with other languages?"
    },
    {
        "question_id": "d7e43a3db8616a106304ac04ba729c1fee78761d",
        "doc_id": "1607.07514",
        "question": "by how much did their system outperform previous tasks?"
    },
    {
        "question_id": "0ba8f04c3fd64ee543b9b4c022310310bc5d3c23",
        "doc_id": "1607.07514",
        "question": "what are the previous state of the art for sentiment categorization?"
    },
    {
        "question_id": "b7d02f12baab5db46ea9403d8932e1cd1b022f79",
        "doc_id": "1607.07514",
        "question": "what are the previous state of the art for tweet semantic similarity?"
    },
    {
        "question_id": "fa800a21469a70fa6490bfc67cabdcc8bf086fb5",
        "doc_id": "1902.06734",
        "question": "Is the dataset used in other work?"
    },
    {
        "question_id": "6883767bbdf14e124c61df4f76335d3e91bfcb03",
        "doc_id": "1902.06734",
        "question": "What is the drawback to methods that rely on textual cues?"
    },
    {
        "question_id": "11679d1feba747c64bbbc62939a20fbb69ada0f3",
        "doc_id": "1902.06734",
        "question": "What community-based profiling features are used?"
    },
    {
        "question_id": "97055ab0227ed6ac7a8eba558b94f01867bb9562",
        "doc_id": "1610.07149",
        "question": "Were human evaluations conducted?"
    },
    {
        "question_id": "3e23cc3c5e4d5cec51d158130d6aeae120e94fc8",
        "doc_id": "1610.07149",
        "question": "What datasets are used?"
    },
    {
        "question_id": "bfcbb47f3c54ee1a459183e04e4c5a41ac9ae83b",
        "doc_id": "1610.07149",
        "question": "How does inference time compare to other methods?"
    },
    {
        "question_id": "1128a600a813116cba9a2cf99d8568ae340f327a",
        "doc_id": "1802.08969",
        "question": "What datasets do they use in the experiment?"
    },
    {
        "question_id": "d64fa192a7e9918c6a22d819abad581af0644c7d",
        "doc_id": "1802.08969",
        "question": "What new tasks do they use to show the transferring ability of the shared meta-knowledge?"
    },
    {
        "question_id": "788f70a39c87abf534f4a9ee519f6e5dbf2543c2",
        "doc_id": "1802.08969",
        "question": "What kind of meta learning algorithm do they use?"
    },
    {
        "question_id": "5cc5e2db82f5d40a5244224dad94da50b4f673db",
        "doc_id": "1804.03839",
        "question": "What does the human-in-the-loop do to help their system?"
    },
    {
        "question_id": "ab975efc916c34f55e1144b1d28e7dfdc257e371",
        "doc_id": "1804.03839",
        "question": "Which dataset do they use to train their model?"
    },
    {
        "question_id": "e7ce612f53e9be705cdb8daa775eae51778825ef",
        "doc_id": "1804.03839",
        "question": "Can their approach be extended to eliminate racial or ethnic biases?"
    },
    {
        "question_id": "6c5a64b5150305c584326882d37af5b0e58de2fd",
        "doc_id": "1804.03839",
        "question": "How do they evaluate their de-biasing approach?"
    },
    {
        "question_id": "fe6181ab0aecf5bc8c3def843f82e530347d918b",
        "doc_id": "1911.09753",
        "question": "What are the baseline models?"
    },
    {
        "question_id": "0b1b8e1b583242e5be9b7be73160630a0d4a96b2",
        "doc_id": "1911.09753",
        "question": "What image caption datasets were used in this work?"
    },
    {
        "question_id": "830f9f9499b06fb4ac3ce2f2cf035127b4f0ec63",
        "doc_id": "1911.09753",
        "question": "How long does it take to train the model on the mentioned dataset? "
    },
    {
        "question_id": "a606bffed3bfeebd1b66125be580f908244e5d92",
        "doc_id": "1911.09753",
        "question": "How big is the human ratings dataset?"
    },
    {
        "question_id": "ab895ed198374f598e13d6d61df88142019d13b8",
        "doc_id": "1908.05803",
        "question": "What is the strong baseline model used?"
    },
    {
        "question_id": "8795bb1f874e5f3337710d8c3d5be49e672ab43a",
        "doc_id": "1908.05803",
        "question": "What crowdsourcing platform did they obtain the data from?"
    },
    {
        "question_id": "fe1a74449847755cd7a46647cc9d384abfee789e",
        "doc_id": "1901.03253",
        "question": "Where can I access the dataset?"
    },
    {
        "question_id": "425d17465ff91019eb87c28ff3942f781ba1bbcb",
        "doc_id": "1901.03253",
        "question": "Did they release their dataset?"
    },
    {
        "question_id": "08561f6ba578ce8f8d284abf90f5b24eb1f804d3",
        "doc_id": "1901.03253",
        "question": "Did they use Amazon Mechanical Turk to collect data?"
    },
    {
        "question_id": "ec2045e0da92989642a5b5f2b1130c8bd765bcc5",
        "doc_id": "1901.03253",
        "question": "Did they use The Onion as their dataset?"
    },
    {
        "question_id": "2929e92f9b4939297b4d0f799d464d46e8d52063",
        "doc_id": "1908.11046",
        "question": "Why is improvement on OntoNotes significantly smaller compared to improvement on WNUT 2017?"
    },
    {
        "question_id": "1dcfcfa46dbcffc2fc7be92dd57df9620258097b",
        "doc_id": "1908.11046",
        "question": "How is \"complexity\" and \"confusability\" of entity mentions defined in this work?"
    },
    {
        "question_id": "77bbe1698e001c5889217be3164982ea36e85752",
        "doc_id": "1908.11046",
        "question": "What are the baseline models?"
    },
    {
        "question_id": "04796aaa59eeb2176339c0651838670fd916074d",
        "doc_id": "1906.11085",
        "question": "what boosting techniques were used?"
    },
    {
        "question_id": "ebb33f3871b8c2ffd2c451bc06480263b8e870e0",
        "doc_id": "1906.11085",
        "question": "did they experiment with other text embeddings?"
    },
    {
        "question_id": "afd1c482c311e25fc42b9dd59cdc32ac542f5752",
        "doc_id": "1906.11085",
        "question": "what is the size of this improved dataset?"
    },
    {
        "question_id": "ae1c4f9e33d0cd64d9a313c318ad635620303cdd",
        "doc_id": "1906.11085",
        "question": "how was the new dataset collected?"
    },
    {
        "question_id": "7066f33c373115b1ead905fe70a1e966f77ebeee",
        "doc_id": "1906.11085",
        "question": "who annotated the new dataset?"
    },
    {
        "question_id": "018b81f810a39b3f437a85573d24531efccd835f",
        "doc_id": "1906.11085",
        "question": "what shortcomings of previous datasets are mentioned?"
    },
    {
        "question_id": "c5ea4da3c760ba89194ad807bc1ef60e1761429f",
        "doc_id": "1602.07563",
        "question": "What evidence is presented that humans perceive the sentiment classes as ordered?"
    },
    {
        "question_id": "4a093a9af4903a59057a4372ac1b01603467ca58",
        "doc_id": "1602.07563",
        "question": "What size of dataset is sufficiently large for the model performance to approach the inter-annotator agreement?"
    },
    {
        "question_id": "f4e16b185b506713ff99acc4dbd9ec3208e4997b",
        "doc_id": "1602.07563",
        "question": "Which measures of inter-annotator agreement are used?"
    },
    {
        "question_id": "4683812cba21c92319be68c03260b5a8175bbb6e",
        "doc_id": "1602.07563",
        "question": "What statistical test(s) is used to compare the top classification models?"
    },
    {
        "question_id": "1acfbdc34669cf19a778aceca941543f11b9a861",
        "doc_id": "1808.04122",
        "question": "What size filters do they use in the convolution layer?"
    },
    {
        "question_id": "864295caceb1e15144c1746ab5671d085d7ff7a1",
        "doc_id": "1808.04122",
        "question": "By how much do they outperform state-of-the-art models on knowledge graph completion?"
    },
    {
        "question_id": "acac0606aab83cae5d13047863c7af542d58e54c",
        "doc_id": "1908.07491",
        "question": "is this the first dataset with a grading scaling rather than binary?"
    },
    {
        "question_id": "2ee4ecf98ef7d02c9e4d103968098fe35f067bbb",
        "doc_id": "1908.07491",
        "question": "what are the existing datasets for this task?"
    },
    {
        "question_id": "82f8843b59668567bba09fc8f93963ca7d1fe107",
        "doc_id": "1908.07491",
        "question": "what is the size of the introduced dataset?"
    },
    {
        "question_id": "376e8ed6e039e07c892c77b7525778178d56acb7",
        "doc_id": "1908.07491",
        "question": "did they crowdsource annotations?"
    },
    {
        "question_id": "4de6bcddd46726bf58326304b0490fdb9e7e86ec",
        "doc_id": "1908.07491",
        "question": "how was labeling done?"
    },
    {
        "question_id": "e831ce6c406bf5d1c493162732e1b320abb71b6f",
        "doc_id": "1908.07491",
        "question": "where does their dataset come from?"
    },
    {
        "question_id": "634a071b13eb7139e77872ecfdc135a2eb2f89da",
        "doc_id": "1908.07491",
        "question": "what are the baselines?"
    },
    {
        "question_id": "8861138891669a45de3955c802c55a37be717977",
        "doc_id": "1908.07491",
        "question": "what tools did they use?"
    },
    {
        "question_id": "2895a3fc63f6f403445c11043460584e949fb16c",
        "doc_id": "1710.07695",
        "question": "what do they mean by description length?"
    },
    {
        "question_id": "1e7e3f0f760cd628f698b73d82c0f946707855ca",
        "doc_id": "1710.07695",
        "question": "do they focus on english verbs?"
    },
    {
        "question_id": "64632981279c7aa16ffc1a44ffc31f4520f5559e",
        "doc_id": "1710.07695",
        "question": "what evaluation metrics are used?"
    },
    {
        "question_id": "94d794df4a3109522c2ea09dad5d40e55d35df51",
        "doc_id": "1907.00168",
        "question": "Which neural machine translation model was used?"
    },
    {
        "question_id": "044c66c6b7ff7378682f24887b05e1af79dcd04f",
        "doc_id": "1907.00168",
        "question": "What position did this entry finish in, in the overall shared task?"
    },
    {
        "question_id": "903ac8686ed7e6e3269a5d863f06ff11c50e49e8",
        "doc_id": "1907.00168",
        "question": "What are the restrictions of the restricted track?"
    },
    {
        "question_id": "ab95ca983240ad5289c123a2774f8e0db424f4a1",
        "doc_id": "1907.00168",
        "question": "What does BEA stand for?"
    },
    {
        "question_id": "58df55002fbcba76b9aeb2181d78378b8c01a827",
        "doc_id": "1910.09942",
        "question": "Which part of their architecture provides the most speedup in comparison to existing approaches?"
    },
    {
        "question_id": "7a60f29e28063f50c2a7afd1c2a7668fb615cd53",
        "doc_id": "1910.09942",
        "question": "Do they consistently outperform existing systems in terms of accuracy?"
    },
    {
        "question_id": "c27b885b1e38542244f52056abf288b2389b9fc6",
        "doc_id": "1905.01347",
        "question": "How do they determine demographics on an image?"
    },
    {
        "question_id": "1ce6c09cf886df41a3d3c52ce82f370c5a30334a",
        "doc_id": "1905.01347",
        "question": "Do they assume binary gender?"
    },
    {
        "question_id": "5429add4f166a3a66bec2ba22232821d2cbafd62",
        "doc_id": "1905.01347",
        "question": "What is the most underrepresented person group in ILSVRC?"
    },
    {
        "question_id": "654306d26ca1d9e77f4cdbeb92b3802aa9961da1",
        "doc_id": "1912.07076",
        "question": "By how much did the new model outperform multilingual BERT?"
    },
    {
        "question_id": "5a7d1ae6796e09299522ebda7bfcfad312d6d128",
        "doc_id": "1912.07076",
        "question": "What previous proposed methods did they explore?"
    },
    {
        "question_id": "bd191d95806cee4cf80295e9ce1cd227aba100ab",
        "doc_id": "1912.07076",
        "question": "What was the new Finnish model trained on?"
    },
    {
        "question_id": "7f11f128fd39b8060f5810fa84102f000d94ea33",
        "doc_id": "1909.04242",
        "question": "What is the performance improvement of their method over state-of-the-art models on the used datasets? "
    },
    {
        "question_id": "2a55076a66795793d79a3edfae1041098404fbc3",
        "doc_id": "1909.04242",
        "question": "Could the proposed training framework be applied to other NLP problems?"
    },
    {
        "question_id": "ecaa10a2d9927fa6ab6a954488f12aa6b42ddc1a",
        "doc_id": "1909.04242",
        "question": "How does the proposed training framework mitigate the bias pattern?"
    },
    {
        "question_id": "8b49423b7d1fa834128aa5038aa16c6ef3fdfa32",
        "doc_id": "1909.04242",
        "question": "Which datasets do they use in the cross-dataset evaluation?"
    },
    {
        "question_id": "8d3f79620592d040f9f055b4fce0f73cc45aab63",
        "doc_id": "1711.04964",
        "question": "How much improvement is given on RACE by their introduced approach?"
    },
    {
        "question_id": "54002c15493d4082d352a66fb9465d65bfe9ddca",
        "doc_id": "1911.03977",
        "question": "What are special architectures this review focuses on that are related to multimodal fusion?"
    },
    {
        "question_id": "e4ea0569b637d5f56f63e933b8f269695fe1a926",
        "doc_id": "1910.07154",
        "question": "What baseline did they use?"
    },
    {
        "question_id": "e3c44964eb6ddc554901244eb6595f26a9bae47e",
        "doc_id": "1910.07154",
        "question": "What is the threshold?"
    },
    {
        "question_id": "905a8d775973882227549e960c7028e4a3561752",
        "doc_id": "1910.07154",
        "question": "How was the masking done?"
    },
    {
        "question_id": "76f90c88926256e7f90d2104a88acfdd7fc5475e",
        "doc_id": "1910.07154",
        "question": "How large is the FEVER dataset?"
    },
    {
        "question_id": "9b6339e24f58b576143d2adf599cfc4a31fd3b0c",
        "doc_id": "2003.11531",
        "question": "What state-of-the-art tagging model did they use?"
    },
    {
        "question_id": "1f2952cd1dc0c891232fa678b6c219f6b4d31958",
        "doc_id": "2004.02334",
        "question": "Which vocabulary size was the better performer?"
    },
    {
        "question_id": "23fe8431058f2a7b7588745766fc715f271aad07",
        "doc_id": "2004.02334",
        "question": "Which languages are explored?"
    },
    {
        "question_id": "e5b2eb6a49c163872054333f8670dd3f9563046a",
        "doc_id": "2004.02334",
        "question": "What datasets are used in the paper?"
    },
    {
        "question_id": "73760a45b23b2ec0cab181f82953fb296bb6cd19",
        "doc_id": "2004.02334",
        "question": "What vocabulary sizes are explored?"
    },
    {
        "question_id": "ec990c16896793a819766bc3168c02556ef69971",
        "doc_id": "2004.02334",
        "question": "What vocabulary size was the best performer?"
    },
    {
        "question_id": "11c4071d9d7efeede84f47892b1fa0c6a93667eb",
        "doc_id": "2004.02334",
        "question": "What datasets do they look at?"
    },
    {
        "question_id": "9aa751aebf6a449d95fb04ceec71688f2ed2cea2",
        "doc_id": "2004.02334",
        "question": "Which vocab sizes did they analyze?"
    },
    {
        "question_id": "eb653a5c59851eda313ece0bcd8c589b6155d73e",
        "doc_id": "1908.02284",
        "question": "what are the baselines?"
    },
    {
        "question_id": "0caa3162abe588f576a568d63ab9fd0e9c46ceda",
        "doc_id": "1908.02284",
        "question": "what results do they achieve?"
    },
    {
        "question_id": "cbe42bf7c99ee248cdb2c5d6cf86b41106e66863",
        "doc_id": "1908.02284",
        "question": "what chinese dialects are explored?"
    },
    {
        "question_id": "643527e94e8eed1e2229915fcf8cd74d769173fc",
        "doc_id": "1908.09283",
        "question": "What are the evaluation metrics used?"
    },
    {
        "question_id": "bfd55ae9630a08a9e287074fff3691dfbffc3258",
        "doc_id": "1908.09283",
        "question": "What are the baselines?"
    },
    {
        "question_id": "3a06d40a4bf5ba6e26d9138434e9139a014deb40",
        "doc_id": "1908.09283",
        "question": "Which language learning datasets are used?"
    },
    {
        "question_id": "458e5ed506883bfec6623102ec9f43c071f0616f",
        "doc_id": "1801.05617",
        "question": "What are their baselines?"
    },
    {
        "question_id": "85ab5f773b297bcf48a274634d402a35e1d57446",
        "doc_id": "1801.05617",
        "question": "Do they report the annotation agreement?"
    },
    {
        "question_id": "5154f63c50729b8ac04939588c2f5ffeb916e3df",
        "doc_id": "1801.05617",
        "question": "How long is the test dataset for Dutch?"
    },
    {
        "question_id": "2aeabec8a734a6e8ca9e7a308dd8c9a1011b3d6e",
        "doc_id": "1801.05617",
        "question": "How long is the training dataset for English?"
    },
    {
        "question_id": "f2b8a2ed5916d75cf568a931829a5a3cde2fc345",
        "doc_id": "1801.05617",
        "question": "What features are used?"
    },
    {
        "question_id": "c0af44ebd7cd81270d9b5b54d4a40feed162fa54",
        "doc_id": "1801.05617",
        "question": "What is the source of the data?"
    },
    {
        "question_id": "3e6b6820e7843209495b4f9a72177573afaa4bc3",
        "doc_id": "1906.01010",
        "question": "What conclusions do the authors draw about the aspects and mechanisms of personal recovery in bipolar disorder?"
    },
    {
        "question_id": "a926d71e6e58066d279d9f7dc3210cd43f410164",
        "doc_id": "1906.01010",
        "question": "What languages were included in this multilingual population?"
    },
    {
        "question_id": "3d547a7dda18a2dd5dc89f12d25d7fe782d66450",
        "doc_id": "1906.01010",
        "question": "What computational linguistic methods were used for the analysis?"
    },
    {
        "question_id": "4a32adb0d54da90434d5bd1c66cc03a7956d12a0",
        "doc_id": "1906.01010",
        "question": "Was permission sought from the bipolar patients to use this data?"
    },
    {
        "question_id": "c17ece1dad42d92c78fca2e3d8afa9a20ff19598",
        "doc_id": "1906.01010",
        "question": "How are the individuals with bipolar disorder identified?"
    },
    {
        "question_id": "1a1d94c981c58e2f2ee18bdfc4abc69fd8f15e14",
        "doc_id": "1911.12848",
        "question": "Which languages do they explore?"
    },
    {
        "question_id": "77a331d4d909d92fab9552b429adde5379b2ae69",
        "doc_id": "1706.02222",
        "question": "How significant is the performance compared to LSTM model?"
    },
    {
        "question_id": "516b691ef192f136bb037c12c3c9365ef5a6604c",
        "doc_id": "1706.02222",
        "question": "How does the introduced model combine the both factors?"
    },
    {
        "question_id": "c53b036eff430a9d0449fb50b8d2dc9d2679d9fe",
        "doc_id": "1706.02222",
        "question": "How much improvement do the introduced model achieve compared to the previous models?"
    },
    {
        "question_id": "20632fc4d2b693b5aabfbbc99ee5c1e9fc485dea",
        "doc_id": "1912.02866",
        "question": "What are the parts of the \"multimodal\" resources?"
    },
    {
        "question_id": "a57e266c936e438aeeab5e8d20d9edd1c15a32ee",
        "doc_id": "1912.02866",
        "question": "Are annotators familiar with the science topics annotated?"
    },
    {
        "question_id": "27356a99290fcc01e3e5660af3405d2a6c6f6e7c",
        "doc_id": "1912.02866",
        "question": "How are the expert and crowd-sourced annotations compared to one another?"
    },
    {
        "question_id": "6e37f43f4f54ffc77c785d60c6058fbad2147922",
        "doc_id": "1912.02866",
        "question": "What platform do the crowd-sourced workers come from?"
    },
    {
        "question_id": "fff1ed2435ba622d884ecde377ff2de127167638",
        "doc_id": "1912.02866",
        "question": "Who are considered trained experts?"
    },
    {
        "question_id": "8f6b11413a19fe4639b3fba88fc6b3678286fa0c",
        "doc_id": "1911.09241",
        "question": "what are all the datasets they experiment with?"
    },
    {
        "question_id": "141f23e87c10c2d54d559881e641c983e3ec8ef3",
        "doc_id": "1911.09241",
        "question": "what was the baseline model?"
    },
    {
        "question_id": "65c7a2b734dab51c4c81f722527424ff33b023f8",
        "doc_id": "1610.06510",
        "question": "Which translation model do they employ?"
    },
    {
        "question_id": "11ef46187a5bf15e89d63220fdeaecbeb92d818e",
        "doc_id": "1610.06510",
        "question": "Which datasets do they experiment on?"
    },
    {
        "question_id": "45aab23790161cbc55f78e16fdf5678a3f5b4b92",
        "doc_id": "1610.06510",
        "question": "Which other units of text do they experiment with (apart from BPE and ortographic syllables)?"
    },
    {
        "question_id": "bf5e80f1ab4eae2254b4f4d7651969a3cf945fb4",
        "doc_id": "1610.06510",
        "question": "How many steps of BPE do they experiment with?"
    },
    {
        "question_id": "e0c80d31d590df46d33502169b1d32f0aa1ea6e3",
        "doc_id": "1907.08540",
        "question": "what user traits are taken into account?"
    },
    {
        "question_id": "7a8b24062a5bb63a8b4c729f6247a7fd2fec7f07",
        "doc_id": "1907.08540",
        "question": "does incorporating user traits help the task?"
    },
    {
        "question_id": "cab082973e1648b0f0cc651ab4e0298a5ca012b5",
        "doc_id": "1907.08540",
        "question": "how many activities are in the dataset?"
    },
    {
        "question_id": "1cc394bdfdfd187fc0af28500ad47a0a764d5645",
        "doc_id": "1907.08540",
        "question": "who annotated the datset?"
    },
    {
        "question_id": "16cc37e4f8e2db99eaf89337a3d9ada431170d5b",
        "doc_id": "1907.08540",
        "question": "how were the data instances chosen?"
    },
    {
        "question_id": "cc78a08f5bfe233405c99cb3dac1f11f3a9268b1",
        "doc_id": "1907.08540",
        "question": "what social media platform was the data collected from?"
    },
    {
        "question_id": "2711ae6dd532d136295c95253dbf202e37ecd3e7",
        "doc_id": "1909.06708",
        "question": "How do you know the word alignments are correct?"
    },
    {
        "question_id": "96356c1affc56178b3099ce4b4aece995032e0ff",
        "doc_id": "1909.06708",
        "question": "How slow is the unparallelizable ART model in the first place?  "
    },
    {
        "question_id": "92fc94a4999d1b25a0593904025eb7b8953bb28b",
        "doc_id": "1909.06708",
        "question": "What metric is used to measure translation accuracy?"
    },
    {
        "question_id": "e56c1f0e9eabda41f929d0dfd5cfa50edd69fa89",
        "doc_id": "1909.06708",
        "question": "Were any datasets other than WMT used to test the model?"
    },
    {
        "question_id": "a86758696926f2db71f982dc1a4fa4404988544e",
        "doc_id": "1909.06708",
        "question": "Are the results applicable to other language pairs than German-English?"
    },
    {
        "question_id": "cf82251a6a5a77e29627560eb7c05c3eddc20825",
        "doc_id": "2004.04498",
        "question": "How does lattice rescoring improve inference?"
    },
    {
        "question_id": "b1fe6a39b474933038b44b6d45e5ca32af7c3e36",
        "doc_id": "2004.04498",
        "question": "What three languages are used in the translation experiments?"
    },
    {
        "question_id": "919681faa9731057b3fae5052b7da598abd3e04b",
        "doc_id": "2004.04498",
        "question": "What metrics are used to measure bias reduction?"
    },
    {
        "question_id": "2749fb1725a2c4bdba5848e2fc424a43e7c4be51",
        "doc_id": "2004.04498",
        "question": "How is the set of trusted, gender-balanced examples selected?"
    },
    {
        "question_id": "eccbbe3684d0cf6b794cb4eef379bb1c8bcc33bf",
        "doc_id": "1910.03355",
        "question": "What previous approaches are presented for comparison?"
    },
    {
        "question_id": "a3705b53c6710b41154c65327b7bbec175bdfae7",
        "doc_id": "1910.03355",
        "question": "What kind of data is used to train the model?"
    },
    {
        "question_id": "b62b7ec5128219f04be41854247d5af992797937",
        "doc_id": "1910.03355",
        "question": "Does proposed approach use neural networks?"
    },
    {
        "question_id": "e8fa4303b36a47a5c87f862458442941bbdff7d9",
        "doc_id": "1910.03355",
        "question": "What machine learning techniques are used in the model architecture?"
    },
    {
        "question_id": "51e9f446d987219bc069222731dfc1081957ce1f",
        "doc_id": "1910.03355",
        "question": "What language(s) is the model tested on?"
    },
    {
        "question_id": "3699927c6c1146f5057576034d226a99946d52cb",
        "doc_id": "1902.08830",
        "question": "what languages did they evaluate on?"
    },
    {
        "question_id": "6606160e210d05b94f7cbd9c5ff91947339f9d02",
        "doc_id": "1902.08830",
        "question": "were these categories human evaluated?"
    },
    {
        "question_id": "0dc9050c832a6091bc9db3f7fa7be72139f51177",
        "doc_id": "1902.08830",
        "question": "do language share categories? "
    },
    {
        "question_id": "f1a50f88898556ecdba8e9cac13ae54c11835945",
        "doc_id": "1909.03553",
        "question": "How many general qualitative statements are in dataset?"
    },
    {
        "question_id": "ef6304512652ba56bd13dbe282a5ce1d41a4f171",
        "doc_id": "1909.03553",
        "question": "What are state-of-the-art models on this dataset?"
    },
    {
        "question_id": "72dbdd11b655b25b2b254e39689a7d912f334b71",
        "doc_id": "1909.03553",
        "question": "How are properties being compared annotated?"
    },
    {
        "question_id": "42eb7c5311fc1ac0344f0b38d3184ccd4faad3be",
        "doc_id": "2004.01820",
        "question": "What agreement measure is used?"
    },
    {
        "question_id": "8d14dd9c67d71494b4468000ff9683afdd11af7e",
        "doc_id": "2004.01820",
        "question": "Do they report the annotation agreement?"
    },
    {
        "question_id": "b857f3e3f1dad5df55f69d062978967fe023ac6f",
        "doc_id": "2004.01820",
        "question": "How many annotators participated?"
    },
    {
        "question_id": "5a473f86052cf7781dfe40943ddf99bc9fe8a4e4",
        "doc_id": "2004.01820",
        "question": "What social-network features are used?"
    },
    {
        "question_id": "235c7c7ca719068136928b18e19f9661e0f72806",
        "doc_id": "2004.01820",
        "question": "What are the five factors considered?"
    },
    {
        "question_id": "c87966e7f497975b76a60f6be50c33d296a4a4e7",
        "doc_id": "2004.01820",
        "question": "How is cyberbullying defined?"
    },
    {
        "question_id": "b78bb6fe817c2d4bc69236df998f546e94c3ee21",
        "doc_id": "1704.06851",
        "question": "Is the performance improvement (with and without affect attributes) statistically significant?"
    },
    {
        "question_id": "1a419468d255d40ae82ed7777618072a48f0091b",
        "doc_id": "1704.06851",
        "question": "How to extract affect attributes from the sentence?"
    },
    {
        "question_id": "bda21bfb2dd74085cbc355c70dab5984ef41dba7",
        "doc_id": "1906.04236",
        "question": "How many actions are present in the dataset?"
    },
    {
        "question_id": "c2497552cf26671f6634b02814e63bb94ec7b273",
        "doc_id": "1906.04236",
        "question": "How many videos did they use?"
    },
    {
        "question_id": "441a2b80e82266c2cc2b306c0069f2b564813fed",
        "doc_id": "1906.04236",
        "question": "What unimodal algorithms do they compare with?"
    },
    {
        "question_id": "e462efb58c71f186cd6b315a2d861cbb7171f65b",
        "doc_id": "1906.04236",
        "question": "What platform was used for crowdsourcing?"
    },
    {
        "question_id": "84f9952814d6995bc99bbb3abb372d90ef2f28b4",
        "doc_id": "1906.04236",
        "question": "What language are the videos in?"
    },
    {
        "question_id": "5364fe5f256f1263a939e0a199c3708727ad856a",
        "doc_id": "1906.04236",
        "question": "How long are the videos?"
    },
    {
        "question_id": "05118578b46e9d93052e8a760019ca735d6513ab",
        "doc_id": "1611.01884",
        "question": "How do they perform semi-supervised learning?"
    },
    {
        "question_id": "31b9337fdfbbc33fc456552ad8c355d836d690ff",
        "doc_id": "1611.01884",
        "question": "What are the five evaluated tasks?"
    },
    {
        "question_id": "bab4e8881f4d75e266bce6fbfa4c3bcd3eacf30f",
        "doc_id": "1708.03312",
        "question": "what are the state of the art models?"
    },
    {
        "question_id": "130d73400698e2b3c6860b07f2e957e3ff022d48",
        "doc_id": "1909.00871",
        "question": "How is cluster purity measured?"
    },
    {
        "question_id": "7e9aec2bdf4256c6249cad9887c168d395b35270",
        "doc_id": "1909.00871",
        "question": "What was the previous state of the art for bias mitigation?"
    },
    {
        "question_id": "1acf06105f6c1930f869347ef88160f55cbf382b",
        "doc_id": "1909.00871",
        "question": "How are names paired in the Names Intervention?"
    },
    {
        "question_id": "9ce90f4132b34a328fa49a63e897f376a3ad3ca8",
        "doc_id": "1909.00871",
        "question": "Which tasks quantify embedding quality?"
    },
    {
        "question_id": "3138f916e253abed643d3399aa8a4555b2bd8c0f",
        "doc_id": "1909.00871",
        "question": "What empirical comparison methods are used?"
    },
    {
        "question_id": "80bb07e553449bde9ac0ff35fcc718d7c161f2d4",
        "doc_id": "2003.12660",
        "question": "How long is their dataset?"
    },
    {
        "question_id": "c8f8ecac23a991bceb8387e68b3b3f2a5d8cf029",
        "doc_id": "2003.12660",
        "question": "What metrics are used?"
    },
    {
        "question_id": "28847b20ca63dc56f2545e6f6ec3082d9dbe1b3f",
        "doc_id": "2003.12660",
        "question": "What is the best performing system?"
    },
    {
        "question_id": "2d5d0b0c54105717bf48559b914fefd0c94964a6",
        "doc_id": "2003.12660",
        "question": "What tokenization methods are used?"
    },
    {
        "question_id": "dd81f58c782169886235c48b8f9a08e0954dd3ae",
        "doc_id": "2003.12660",
        "question": "What baselines do they propose?"
    },
    {
        "question_id": "a3e7d7389228a197c8c44e0c504a791b60f2c80d",
        "doc_id": "1909.00091",
        "question": "How do they decide what is the semantic concept label of particular cluster?"
    },
    {
        "question_id": "8b4bd0a962241ea548752212ebac145e2ced7452",
        "doc_id": "1909.00091",
        "question": "How do they discover coherent word clusters?"
    },
    {
        "question_id": "d39059340a79bdc0ebab80ad3308e3037d7d5773",
        "doc_id": "1909.00091",
        "question": "How big are two introduced datasets?"
    },
    {
        "question_id": "31d4b0204702907dc0cd0f394cf9c984649e1fbf",
        "doc_id": "1909.00091",
        "question": "What are strong baselines authors used?"
    },
    {
        "question_id": "d922eaa5aa135c1ae211827c6a599b4d69214563",
        "doc_id": "1905.09439",
        "question": "Do they treat differerent turns of conversation differently when modeling features?"
    },
    {
        "question_id": "ff668c7e890064756cdd2f9621e1cedb91eef1d0",
        "doc_id": "1905.09439",
        "question": "How do they bootstrap with contextual information?"
    },
    {
        "question_id": "d3cfbe497a30b750a8de3ea7f2cecf4753a4e1f9",
        "doc_id": "1905.09439",
        "question": "Which word embeddings do they utilize for the EmoContext task?"
    },
    {
        "question_id": "0c247a04f235a4375dd3b0fd0ce8d0ec72ef2256",
        "doc_id": "1910.12203",
        "question": "What other evaluation metrics are reported?"
    },
    {
        "question_id": "66dfcdab1db6a8fcdf392157a478b4cca0d87961",
        "doc_id": "1910.12203",
        "question": "What out of domain scenarios did they evaluate on?"
    },
    {
        "question_id": "7ef34b4996ada33a4965f164a8f96e20af7470c0",
        "doc_id": "1910.12203",
        "question": "What was their state of the art accuracy score?"
    },
    {
        "question_id": "6e80386b33fbfba8bc1ab811a597d844ae67c578",
        "doc_id": "1910.12203",
        "question": "Which datasets did they use?"
    },
    {
        "question_id": "1c182b4805b336bd6e1a3f43dc84b07db3908d4a",
        "doc_id": "1910.12203",
        "question": "What are the neural baselines mentioned?"
    },
    {
        "question_id": "539f5c27e1a2d240e52b711d0a50a3a6ddfa5cb2",
        "doc_id": "1911.04128",
        "question": "How do they deal with imbalanced datasets?"
    },
    {
        "question_id": "aa7c5386aedfb13a361a2629b67cb54277e208d2",
        "doc_id": "1911.04128",
        "question": "What models do they compare to?"
    },
    {
        "question_id": "9b3371dcd855f1d3342edb212efa39dfc9142ae3",
        "doc_id": "1911.04128",
        "question": "What text preprocessing tasks do they focus on?"
    },
    {
        "question_id": "b02a6f59270b8c55fa4df3751bcb66fca2371451",
        "doc_id": "1911.04128",
        "question": "What news sources did they get the dataset from?"
    },
    {
        "question_id": "3a3c372b6d73995adbdfa26103c85b32d071ff10",
        "doc_id": "1911.04128",
        "question": "Did they collect their own corpus?"
    },
    {
        "question_id": "3b8da74f5b359009d188cec02adfe4b9d46a768f",
        "doc_id": "2002.08902",
        "question": "what evaluation metrics did they use?"
    },
    {
        "question_id": "6bce04570d4745dcfaca5cba64075242308b65cf",
        "doc_id": "2002.08902",
        "question": "what was the baseline?"
    },
    {
        "question_id": "37e6ce5cfc9d311e760dad8967d5085446125408",
        "doc_id": "2002.08902",
        "question": "what were roberta's results?"
    },
    {
        "question_id": "6683008e0a8c4583058d38e185e2e2e18ac6cf50",
        "doc_id": "2002.08902",
        "question": "which was the worst performing model?"
    },
    {
        "question_id": "54b25223ab32bf8d9205eaa8a570e99c683f0077",
        "doc_id": "1909.13466",
        "question": "What baselines do they compare to?"
    },
    {
        "question_id": "e5be900e70ea86c019efb06438ba200e11773a7c",
        "doc_id": "1909.13466",
        "question": "What training set sizes do they use?"
    },
    {
        "question_id": "b36a8a73b3457a94203eed43f063cb684a8366b7",
        "doc_id": "1909.13466",
        "question": "What languages do they experiment with?"
    },
    {
        "question_id": "25e6ba07285155266c3154d3e2ca1ae05c2f7f2d",
        "doc_id": "2004.03090",
        "question": "Which baselines did they compare to?"
    },
    {
        "question_id": "d68cc9aaf0466b97354600a5646c3be4512fc096",
        "doc_id": "2004.03090",
        "question": "What dialog tasks was it experimented on?"
    },
    {
        "question_id": "d038e5d2a6f85e68422caaf8b96cb046db6599fa",
        "doc_id": "2004.03090",
        "question": "How was annotation done?"
    },
    {
        "question_id": "c66e0aa86b59bbf9e6a1dc725fb9785473bfa137",
        "doc_id": "2004.03090",
        "question": "Which news outlets did they focus on?"
    },
    {
        "question_id": "369d7bc5351409910c7a5e05c0cbb5abab8e50ec",
        "doc_id": "2004.03090",
        "question": "Do the interviews fall under a specific news category? "
    },
    {
        "question_id": "b9d9803ba24127f91ba4d7cff4da11492da20f09",
        "doc_id": "2004.03090",
        "question": "Which baselines did they compare to?"
    },
    {
        "question_id": "7625068cc22a095109580b83eff48616387167c2",
        "doc_id": "2004.03090",
        "question": "Which dialog tasks did they experiment on?"
    },
    {
        "question_id": "be0b438952048fe6bb91c61ba48e529d784bdcea",
        "doc_id": "2004.03090",
        "question": "Did they use crowdsourcing for annotations?"
    },
    {
        "question_id": "a97137318025a6642ed0634f7159255270ba3d4f",
        "doc_id": "2004.03090",
        "question": "Were annotations done manually?"
    },
    {
        "question_id": "a24b2269b292fd0ee81d50303d1315383c594382",
        "doc_id": "2004.03090",
        "question": "Which news sources do the transcripts come from?"
    },
    {
        "question_id": "65e26b15e087bedb6e8782d91596b35e7454b16b",
        "doc_id": "1611.01116",
        "question": "Do they evaluate binary paragraph vectors on a downstream task?"
    },
    {
        "question_id": "a8f189fad8b72f8b2b4d2da4ed8475d31642d9e7",
        "doc_id": "1611.01116",
        "question": "How do they show that binary paragraph vectors capture semantics?"
    },
    {
        "question_id": "eafea4a24d103fdecf8f347c7d84daff6ef828a3",
        "doc_id": "1611.01116",
        "question": "Which training dataset do they use?"
    },
    {
        "question_id": "e099a37db801718ab341ac9a380a146c7452fd21",
        "doc_id": "1611.01116",
        "question": "Do they analyze the produced binary codes?"
    },
    {
        "question_id": "87c00edc497274ae6a972c3097818de85b1b384f",
        "doc_id": "1909.08250",
        "question": "How does the sentence construction component work?"
    },
    {
        "question_id": "de4e949c6917ff6933f5fa2a3062ba703aba014c",
        "doc_id": "1909.08250",
        "question": "What are two use cases that demonstrate capability of created system?"
    },
    {
        "question_id": "2c4db4398ecff7e4c1c335a2cb3864bfdc31df1a",
        "doc_id": "1910.03177",
        "question": "How is the GPU-based self-critical Reinforcement Learning model designed?"
    },
    {
        "question_id": "4738158f92b5b520ceba6207e8029ae082786dbe",
        "doc_id": "1910.03177",
        "question": "What are the previous similar models the authors are referring to?"
    },
    {
        "question_id": "4dadde7c61230553ef14065edd8c1c7e41b9c329",
        "doc_id": "1910.03177",
        "question": "What was previous state of the art on factored dataset?"
    },
    {
        "question_id": "e2c8d7f3ef5913582503e50244ca7158d0a62c42",
        "doc_id": "1908.09892",
        "question": "Does single-language BERT outperform multilingual BERT?"
    },
    {
        "question_id": "654fe0109502f2ed2dc8dad359dbbce4393e03dc",
        "doc_id": "1908.09892",
        "question": "What types of agreement relations do they explore?"
    },
    {
        "question_id": "2849c2944c47cf1de62b539c5d3c396a3e8d283a",
        "doc_id": "1904.09131",
        "question": "What is the accuracy of this model compared to sota?"
    },
    {
        "question_id": "13fb28e8b7f34fe600b29fb842deef75608c1478",
        "doc_id": "1603.09381",
        "question": "By how much did their model outperform baselines?"
    },
    {
        "question_id": "d5bce5da746a075421c80abe10c97ad11a96c6cd",
        "doc_id": "1603.09381",
        "question": "Which baselines did they compare against?"
    },
    {
        "question_id": "930733efb3b97e1634b4dcd77123d4d5731e8807",
        "doc_id": "1603.09381",
        "question": "What was their performance on this task?"
    },
    {
        "question_id": "11f9c207476af75a9272105e646df02594059c3f",
        "doc_id": "1603.09381",
        "question": "What dataset did they use to evaluate?"
    },
    {
        "question_id": "b32de10d84b808886d7a91ab0c423d4fc751384c",
        "doc_id": "1603.09381",
        "question": "How did they obtain part-of-speech tags?"
    },
    {
        "question_id": "9ea3669528c2b295f21770cb7f70d0c4b4389223",
        "doc_id": "1708.05482",
        "question": "what was their system's f1 score?"
    },
    {
        "question_id": "9863f5765ba70f7ff336a580346ef70205abbbd8",
        "doc_id": "1708.05482",
        "question": "what were the baselines?"
    },
    {
        "question_id": "ced63053eb631c78a4ddd8c85ec0f3323a631a54",
        "doc_id": "1708.05482",
        "question": "what emotion cause dataset was used?"
    },
    {
        "question_id": "f13a5b6a67a9b10fde68e8b33792879b8146102c",
        "doc_id": "1708.05482",
        "question": "what lexical features are extracted?"
    },
    {
        "question_id": "67c16ba64fe27838b1034d15194c07a9c98cdebe",
        "doc_id": "1708.05482",
        "question": "what word level sequences features are extracted?"
    },
    {
        "question_id": "7de0b2df60d3161dd581ed7915837d460020bc11",
        "doc_id": "1909.06502",
        "question": "How many tags are included in the ENE tag set?"
    },
    {
        "question_id": "0a3a7e412682ce951329c37b06343d2114acad9d",
        "doc_id": "1909.06502",
        "question": "Does the paper evaluate the dataset for smaller NE tag tests? "
    },
    {
        "question_id": "b569827ecd04ae8757dc3c9523ab97e3f47a6e00",
        "doc_id": "1911.11899",
        "question": "Is their gating mechanism specially designed to handle one sentence bags?"
    },
    {
        "question_id": "0d42bd759c84cbf3a293ab58283a3d0d5e27d290",
        "doc_id": "1911.11899",
        "question": "Do they show examples where only one sentence appears in a bag and their method works, as opposed to using selective attention?"
    },
    {
        "question_id": "9f1e60ee86a5c46abe75b67ef369bf92a5090568",
        "doc_id": "1911.11899",
        "question": "By how much do they outperform previous state-of-the-art in terms of top-n precision?"
    },
    {
        "question_id": "c7d3bccee59ab683e6bf047579bc6eab9de9d973",
        "doc_id": "1705.10272",
        "question": "What deep learning models do they plan to use?"
    },
    {
        "question_id": "376c6c74f008bb79a0dd9f073ac7de38870e80ad",
        "doc_id": "1705.10272",
        "question": "What baseline, if any, is used?"
    },
    {
        "question_id": "c59d67930edd3d369bd51a619849facdd0770644",
        "doc_id": "1705.10272",
        "question": "How are the language models used to make predictions on humorous statements?"
    },
    {
        "question_id": "9d6b2672b11d49c37a6bfb06172d39742d48aef4",
        "doc_id": "1705.10272",
        "question": "What type of language models are used? e.g. trigrams, bigrams?"
    },
    {
        "question_id": "7ff7c286d3118a8be5688e2d18e9a56fe83679ad",
        "doc_id": "1812.00382",
        "question": "Which model architecture do they opt for?"
    },
    {
        "question_id": "1ecbbb60dc44a701e9c57c22167dd412711bb0be",
        "doc_id": "1812.00382",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "592df9831692b8fde213257ed1894344da3e0594",
        "doc_id": "1812.00382",
        "question": "Which setup proves to be the hardest: cross-topic, cross-domain, cross-temporal, or across annotators?"
    },
    {
        "question_id": "6822ca5f7a19866ffc3c985b790a4aadcecf2d1c",
        "doc_id": "1812.00382",
        "question": "Which weak signal data do they use?"
    },
    {
        "question_id": "60e6296ca2a697892bd67558a21a83ef01a38177",
        "doc_id": "1812.00382",
        "question": "Do they compare their semantic feature approach to lexical approaches?"
    },
    {
        "question_id": "a51c680a63ee393792d885f66de75484dc6bc9bc",
        "doc_id": "1809.05807",
        "question": "Is an ablation test performed?"
    },
    {
        "question_id": "e752dc4d721a2cf081108b6bd71e3d10b4644354",
        "doc_id": "1809.05807",
        "question": "What statistical test is performed?"
    },
    {
        "question_id": "f1f7a040545c9501215d3391e267c7874f9a6004",
        "doc_id": "1612.09535",
        "question": "what dataset was used?"
    },
    {
        "question_id": "b6f4fd6bc76bfcbc15724a546445908afa6d922c",
        "doc_id": "1612.09535",
        "question": "by how much did their model improve over current alternatives?"
    },
    {
        "question_id": "3614c1f1435b7c1fd1f7f0041219eebf5bcff473",
        "doc_id": "1612.09535",
        "question": "did they experiment with other languages besides portuguese?"
    },
    {
        "question_id": "c316d7d0c80b8f720ff90a8bb84a8b879a3ef7ea",
        "doc_id": "1612.09535",
        "question": "how many rules did they use?"
    },
    {
        "question_id": "76e17e648a4d1f386eb6bf61b0c24f134af872be",
        "doc_id": "1808.07231",
        "question": "What other scenarios can the bias mitigation methods be applied to?"
    },
    {
        "question_id": "7572f6e68a2ed2c41b87c5088ba8680afa0c0a0b",
        "doc_id": "1808.07231",
        "question": "Are the three bias mitigation methods combined in any model?"
    },
    {
        "question_id": "5d2bbcc3aa769e639dc21893890bc36b76597a33",
        "doc_id": "1808.07231",
        "question": "Which of the three bias mitigation methods is most effective?"
    },
    {
        "question_id": "4ddc53afffaf1622d97695347dd1b3190d156dee",
        "doc_id": "1808.07231",
        "question": "What model architectures are used?"
    },
    {
        "question_id": "5d93245832d90b31aee42ea2bf1e7704c22ebeca",
        "doc_id": "1808.07231",
        "question": "What pre-trained word embeddings are used?"
    },
    {
        "question_id": "c0dbf3f1957f3bff3ced5b48aff60097f3eac7bb",
        "doc_id": "1808.07231",
        "question": "What metrics are used to measure gender biases?"
    },
    {
        "question_id": "d6a815d24c46557827d8aca65d3ffd008ac1bc07",
        "doc_id": "1803.02914",
        "question": "Which movie subtitles dataset did they use?"
    },
    {
        "question_id": "bc05503eef25c732f1785e29d59b6022f12ba094",
        "doc_id": "1903.10318",
        "question": "What other evaluation metrics did they use other than ROUGE-L?"
    },
    {
        "question_id": "a6603305f4fd3dd0010ac31243c40999a116537e",
        "doc_id": "1903.10318",
        "question": "Do they encode sentences separately or together?"
    },
    {
        "question_id": "2ba4477d597b1fd123d14be07a7780ccb5c4819b",
        "doc_id": "1903.10318",
        "question": "How do they use BERT to encode the whole text?"
    },
    {
        "question_id": "027814f3a879a6c7852e033f9d99519b8729e444",
        "doc_id": "1903.10318",
        "question": "What is the ROUGE-L score of baseline method?"
    },
    {
        "question_id": "00df1ff914956d4d23299d02fd44e4c985bb61fa",
        "doc_id": "1903.10318",
        "question": "Which is the baseline method?"
    },
    {
        "question_id": "a398c9b061f28543bc77c2951d0dfc5d1bee9e87",
        "doc_id": "1808.00957",
        "question": "Which dataset do they use?"
    },
    {
        "question_id": "dae9caf8434ce43c9bc5913ebf062bc057a27cfe",
        "doc_id": "1808.00957",
        "question": "By how much do they outperform previous state-of-the-art approaches?"
    },
    {
        "question_id": "e9b6b14b8061b71d73a73d8138c8dab8eda4ba3f",
        "doc_id": "1808.00957",
        "question": "Do they analyze attention outputs to determine which terms in general contribute to clickbait titles?"
    },
    {
        "question_id": "0bb97991fc297aa5aed784568de52d5b9121f920",
        "doc_id": "1708.07690",
        "question": "what state of the art methods are compared to?"
    },
    {
        "question_id": "7ba6330d105f49c7f71dba148bb73245a8ef2966",
        "doc_id": "1708.07690",
        "question": "what are the performance metrics?"
    },
    {
        "question_id": "157de5175259d6f25db703efb299f948dae597b7",
        "doc_id": "1708.07690",
        "question": "what is the original model they refer to?"
    },
    {
        "question_id": "cf3fab54b2b289b66e7dba4706c47a62569627c5",
        "doc_id": "1708.07690",
        "question": "how are sentences selected prior to making the summary?"
    },
    {
        "question_id": "1f053f338df6d238cb163af1a0b1b073e749ed8a",
        "doc_id": "1806.09652",
        "question": "Do they evaluate their parallel sentence generation?"
    },
    {
        "question_id": "fb06ed5cf9f04ff2039298af33384ca71ddbb461",
        "doc_id": "1806.09652",
        "question": "How much data do they manage to gather online?"
    },
    {
        "question_id": "754d7475b8bf50499ed77328b4b0eeedf9cb2623",
        "doc_id": "1806.09652",
        "question": "Which models do they use for phrase-based SMT?"
    },
    {
        "question_id": "1d10e069b4304fabfbed69acf409f0a311bdc441",
        "doc_id": "1806.09652",
        "question": "Which models do they use for NMT?"
    },
    {
        "question_id": "718c0232b1f15ddb73d40c3afbd6c5c0d0354566",
        "doc_id": "1806.09652",
        "question": "What are the BLEU performance improvements they achieve?"
    },
    {
        "question_id": "e6204daf4efeb752fdbd5c26e179efcb8ddd2807",
        "doc_id": "1601.03313",
        "question": "how did they measure grammatical correctness?"
    },
    {
        "question_id": "95c3907c5e8f57f239f3b031b1e41f19ff77924a",
        "doc_id": "1601.03313",
        "question": "how was quality of sentence transition measured?"
    },
    {
        "question_id": "b900122c7d6c2d6161bfca8a95eae11952d1cb58",
        "doc_id": "1601.03313",
        "question": "what is the size of the dataset?"
    },
    {
        "question_id": "5206b6f40a91fc16179829041c1139a6c6d91ce7",
        "doc_id": "1601.03313",
        "question": "what manual evaluation is presented?"
    },
    {
        "question_id": "863b3f29f8c59f224b4cbdb5f1097b45a25f1d88",
        "doc_id": "2001.04346",
        "question": "What kind of baseline model do they compare against?"
    },
    {
        "question_id": "e4cbfabf4509ae0f476f950c1079714a9cd3814e",
        "doc_id": "2001.04346",
        "question": "Do they analyze which types of sentences/reviews are useful or not?"
    },
    {
        "question_id": "7a84fed904acc1e0380deb6e5a2e1daacfb5907a",
        "doc_id": "2001.04346",
        "question": "Which set of datasets do they use?"
    },
    {
        "question_id": "f264612db9096caf938bd8ee4085848143b34f81",
        "doc_id": "1605.05166",
        "question": "what elements of each profile did they use?"
    },
    {
        "question_id": "da0a2195bbf6736119ff32493898d2aadffcbcb8",
        "doc_id": "1605.05166",
        "question": "Does this paper discuss the potential these techniques have for invading user privacy?"
    },
    {
        "question_id": "f5513f9314b9d7b41518f98c6bc6d42b8555258d",
        "doc_id": "1605.05166",
        "question": "How is the gold standard defined?"
    },
    {
        "question_id": "a77d38427639d54461ae308f3045434f81e497d0",
        "doc_id": "1906.08871",
        "question": "what eeg features were used?"
    },
    {
        "question_id": "010fd15696580d9924ac0275a4ff269005e5808d",
        "doc_id": "1906.08871",
        "question": "what were the baselines?"
    },
    {
        "question_id": "d36a6447bfe58204e0d29f9213d84be04d875624",
        "doc_id": "1906.08871",
        "question": "what dataset was used?"
    },
    {
        "question_id": "f7a27de3eb6447377eb48ef6d2201205ff943751",
        "doc_id": "2004.02214",
        "question": "Is there a metric that also rewards good stylistic response?"
    },
    {
        "question_id": "2df3cd12937591481e85cf78c96a24190ad69e50",
        "doc_id": "2004.02214",
        "question": "What are existing baseline models on these benchmark datasets?"
    },
    {
        "question_id": "fcb0ac1934e2fd9f58f4b459e6853999a27844f9",
        "doc_id": "2004.02214",
        "question": "Which two languages are the experiments conducted on?"
    },
    {
        "question_id": "fc9aa04de4018b7d55e19a39663a2e9837328de7",
        "doc_id": "2004.02214",
        "question": "What three benchmark datasets are used?"
    },
    {
        "question_id": "3e4e415e346a313f5a7c3764fe0f51c11f51b071",
        "doc_id": "1902.10246",
        "question": "What language is the model tested on?"
    },
    {
        "question_id": "d622564b250cffbb9ebbe6636326b15ec3c622d9",
        "doc_id": "1902.10246",
        "question": "How much lower is the computational cost of the proposed model?"
    },
    {
        "question_id": "4367617c0b8c9f33051016e8d4fbb44831c54d0f",
        "doc_id": "1902.10246",
        "question": "What is the state-of-the-art model?"
    },
    {
        "question_id": "2c60628d54f2492e0cbf0fb8bacd8e54117f0c18",
        "doc_id": "1902.10246",
        "question": "What is a pseudo language model?"
    },
    {
        "question_id": "7239c02a0dcc0c3c9d9cddb5e895bcf9cfcefee6",
        "doc_id": "1602.03483",
        "question": "Which data sources do they use?"
    },
    {
        "question_id": "9dcc10a4a325d4c9cb3bb8134831ee470be47e93",
        "doc_id": "1602.03483",
        "question": "Which tasks do they evaluate supervised systems on?"
    },
    {
        "question_id": "31236a876277c6e1c80891a3293c105a1b1be008",
        "doc_id": "1602.03483",
        "question": "How do they evaluate domain portability?"
    },
    {
        "question_id": "19ebfba9aa5a9596b09a0cfb084ff8ebf24a3b91",
        "doc_id": "1602.03483",
        "question": "Which unsupervised representation-learning objectives do they introduce?"
    },
    {
        "question_id": "8b8adb1d5a1824c8995b3eba668745c44f61c9c6",
        "doc_id": "1903.04329",
        "question": "Is this analysis performed only on English data?"
    },
    {
        "question_id": "88d1bd21b53b8be4f9d3cb26ecc3cbcacffcd63e",
        "doc_id": "1903.04329",
        "question": "Do the authors offer any hypothesis for why the parameters of Zipf's law and Heaps' law differ on Twitter?"
    },
    {
        "question_id": "74cef0205e0f31d0ab28d0e4d96c1e8ef62d4cce",
        "doc_id": "1903.04329",
        "question": "What explanation do the authors offer for the super or sublinear urban scaling?"
    },
    {
        "question_id": "200c37060d037dee33f3b7c8b1a2aaa58376566e",
        "doc_id": "1903.04329",
        "question": "Do the authors give examples of the core vocabulary which follows the scaling relationship of the bulk text?"
    },
    {
        "question_id": "4dc4180127761e987c1043d5f8b94512bbe74d4f",
        "doc_id": "1603.09405",
        "question": "By how much do they outperform existing methods?"
    },
    {
        "question_id": "420862798054f736128a6f0c4393c7f9cc648b40",
        "doc_id": "1603.09405",
        "question": "Which datasets do they evaluate on?"
    },
    {
        "question_id": "ad8411edf11d3429c9bdd08b3e07ee671464d73c",
        "doc_id": "1603.09405",
        "question": "Do they separately evaluate performance of their learned representations (before forwarding them to the CNN layer)?"
    },
    {
        "question_id": "c97306c1be5d59cf27b1054adfa8f1da47d292ce",
        "doc_id": "1810.02229",
        "question": "What are the contributions of this paper?"
    },
    {
        "question_id": "e42916924b69cab1df25d3b4e6072feaa0ba8084",
        "doc_id": "1810.02229",
        "question": "What are the baselines this paper uses?"
    },
    {
        "question_id": "079ca5810060e1cdc12b5935d8c248492f0478b9",
        "doc_id": "1810.02229",
        "question": "Can the model be extended to other languages?"
    },
    {
        "question_id": "0035b351df63971ec57e36d4bfc6f7594bed41ae",
        "doc_id": "1802.09233",
        "question": "How is the data labeled?"
    },
    {
        "question_id": "2b021e1486343d503bab26c2282f56cfdab67248",
        "doc_id": "1802.09233",
        "question": "What is the best performing model?"
    },
    {
        "question_id": "e801b6a6048175d3b1f3440852386adb220bcb36",
        "doc_id": "1802.09233",
        "question": "How long is the dataset?"
    },
    {
        "question_id": "52868394eb2b3b37eb5f47f51c06ad53061f4495",
        "doc_id": "2002.06854",
        "question": "How did they obtain the dataset?"
    },
    {
        "question_id": "59dc6b1d3da74a2e67a6fb1ce940b28d9e3d8de0",
        "doc_id": "2002.06854",
        "question": "Are the recommendations specific to a region?"
    },
    {
        "question_id": "713e1c7b0ab17759ba85d7cd2041e387831661df",
        "doc_id": "2002.06854",
        "question": "Did they experiment on this dataset?"
    },
    {
        "question_id": "a526c63fc8dc1b79702b481b77e3922d7002d973",
        "doc_id": "1706.08568",
        "question": "Are answers in this dataset guaranteed to be substrings of the text? If not, what is the coverage of answers being substrings?"
    },
    {
        "question_id": "0f9678e11079ee9ea1a1ce693f017177dd495ee5",
        "doc_id": "1706.08568",
        "question": "How much is the gap between pretraining on SQuAD and not pretraining on SQuAD?"
    },
    {
        "question_id": "d576af4321fe71ced9e521df1f3fe1eb90d2df2d",
        "doc_id": "2002.10210",
        "question": "How much better are the results of the new model compared to competitive methods?"
    },
    {
        "question_id": "fd651d19046966ca65d4bcf6f6ae9c66cdf13777",
        "doc_id": "2002.10210",
        "question": "What are the metrics used for benchmarking methods?"
    },
    {
        "question_id": "08b77c52676167af72581079adf1ca2b994ce251",
        "doc_id": "2002.10210",
        "question": "What are other competitive methods?"
    },
    {
        "question_id": "89fa14a04008c93907fa13375f9e70b655d96209",
        "doc_id": "2002.10210",
        "question": "What is the size of built dataset?"
    },
    {
        "question_id": "0d9241e904bd2bbf5b9a6ed7b5fc929651d3e28e",
        "doc_id": "1910.13793",
        "question": "Do the QA tuples fall under a specific domain?"
    },
    {
        "question_id": "95646d0ac798dcfc15b43fa97a1908df9f7b9681",
        "doc_id": "1910.13793",
        "question": "What is the baseline model?"
    },
    {
        "question_id": "12dc04e0ec1d3ba5ec543069fe457dfa4a1cac07",
        "doc_id": "1910.13793",
        "question": "How large is the corpus of QA tuples?"
    },
    {
        "question_id": "647f6e6b168ec38fcdb737d3b276f78402282f9d",
        "doc_id": "1910.13793",
        "question": "What corpus did they use?"
    },
    {
        "question_id": "ce8d8de78a21a3ba280b658ac898f73d0b52bf1b",
        "doc_id": "1910.10487",
        "question": "What are possible future improvements for the proposed method(s)?"
    },
    {
        "question_id": "e069fa1eecd711a573c0d5c83a3493f5f04b1d8a",
        "doc_id": "1910.10487",
        "question": "What is the percentage change in performance for the better model when compared to the baseline?"
    },
    {
        "question_id": "8db11d9166474a0e98b99ac7f81d1f14539d79ec",
        "doc_id": "1910.10487",
        "question": "Which of the two design architectures has better performance?"
    },
    {
        "question_id": "c30b0d6b23f0f01573eea315176c5ffe4e0c6b5c",
        "doc_id": "1910.02677",
        "question": "How large is the test set?"
    },
    {
        "question_id": "311f9971d61b91c7d76bba1ad6f038390977a8be",
        "doc_id": "1910.02677",
        "question": "What does SARI measure?"
    },
    {
        "question_id": "23cbf6ab365c1eb760b565d8ba51fb3f06257d62",
        "doc_id": "1910.02677",
        "question": "What are the baseline models?"
    },
    {
        "question_id": "11360385dff0a9d7b8f4b106ba2b7fe15ca90d7c",
        "doc_id": "1912.11585",
        "question": "What was the baseline?"
    },
    {
        "question_id": "875fbf4e5f93c3da63e28a233ce1d8405c7dfe63",
        "doc_id": "1912.11585",
        "question": "What dataset was used in this challenge?"
    },
    {
        "question_id": "56b66d19dbc5e605788166e168f36d25f5beb774",
        "doc_id": "1912.11585",
        "question": "Which subsystem outperformed the others?"
    },
    {
        "question_id": "1a6156189297b2fe17f174ef55cbd20341bb7dbf",
        "doc_id": "1611.06322",
        "question": "What previous methods do they compare against?"
    },
    {
        "question_id": "3319d56556ae1597a86384057db0831e32774b90",
        "doc_id": "1611.06322",
        "question": "What is their evaluation metric?"
    },
    {
        "question_id": "8cbe3fa4ec0f66071e3d6b829b09b6395b631c44",
        "doc_id": "1611.06322",
        "question": "Are their methods fully supervised?"
    },
    {
        "question_id": "85e417231a4bbb6691f7a89bd81710525f8fec4c",
        "doc_id": "1611.06322",
        "question": "Do they build a dataset of rumors?"
    },
    {
        "question_id": "57ee20f494d8ce3fae46028c3f3551d180dba3e0",
        "doc_id": "1611.06322",
        "question": "What languages do they evaluate their methods on?"
    },
    {
        "question_id": "2974237446d04da33b78ce6d22a477cdf80877b7",
        "doc_id": "1611.06322",
        "question": "How do they define rumors?"
    },
    {
        "question_id": "bd53399be8ff59060792da4c8e42a7fc1e6cbd85",
        "doc_id": "2003.13028",
        "question": "What is the previous state-of-the-art?"
    },
    {
        "question_id": "a7313c29b154e84b571322532f5cab08e9d49e51",
        "doc_id": "2003.13028",
        "question": "What is the architecture of the decoder?"
    },
    {
        "question_id": "cfe21b979a6c851bdafb2e414622f61e62b1d98c",
        "doc_id": "2003.13028",
        "question": "What is the architecture of the encoder?"
    },
    {
        "question_id": "3e3d123960e40bcb1618e11999bd2031ccc1d155",
        "doc_id": "2003.13028",
        "question": "What are the languages of the datasets?"
    },
    {
        "question_id": "2e37eb2a2a9ad80391e57acb53616eab048ab640",
        "doc_id": "2003.13028",
        "question": "What is the architecture of the saliency model?"
    },
    {
        "question_id": "2b32cf05c5e736f764ceecc08477e20ab9f2f5d7",
        "doc_id": "1908.06493",
        "question": "Does the paper report F1-scores with and without post-processing for the second task?"
    },
    {
        "question_id": "014a3aa07686ee18a86c977bf0701db082e8480b",
        "doc_id": "1908.06493",
        "question": "What does post-processing do to the output?"
    },
    {
        "question_id": "6e6d64e2cb7734599890fff3f10c18479756d540",
        "doc_id": "1908.06493",
        "question": "Do they test any neural architecture?"
    },
    {
        "question_id": "8675d39f1647958faab7fa40cdaab207d4fe5a29",
        "doc_id": "1908.06493",
        "question": "Is the performance of a Naive Bayes approach evaluated?"
    },
    {
        "question_id": "c2ce25878a17760c79031a426b6f38931cd854b2",
        "doc_id": "2003.11528",
        "question": "What is the source of the training/testing data?"
    },
    {
        "question_id": "1d263356692ed8cdee2a13f103a82d98f43d66eb",
        "doc_id": "2003.11528",
        "question": "What are the types of Chinese poetry that are generated?"
    },
    {
        "question_id": "9465d96a1368299fd3662d91aa94ba85347b4ccd",
        "doc_id": "2003.07459",
        "question": "What is the performance of the best model?"
    },
    {
        "question_id": "e8c3f59313df20db0cdd49b84a37c44da849fe17",
        "doc_id": "2003.07459",
        "question": "What are the models tested on the dataset?"
    },
    {
        "question_id": "f61268905626c0b2a715282478a5e373adda516c",
        "doc_id": "2003.07459",
        "question": "Which method best performs on the offensive language identification task?"
    },
    {
        "question_id": "d9949dd4865e79c53284932d868ca8fd10d55e70",
        "doc_id": "2003.07459",
        "question": "Did they use crowdsourcing for the annotations?"
    },
    {
        "question_id": "de689a17b0b9fb6bbb80e9b85fb44b36b56de2fd",
        "doc_id": "2003.07459",
        "question": "How many annotators did they have?"
    },
    {
        "question_id": "5a90871856beeefaa69a1080e1b3c8b5d4b2b937",
        "doc_id": "2003.07459",
        "question": "Is the dataset balanced?"
    },
    {
        "question_id": "6cb3007a09ab0f1602cdad20cc0437fbdd4d7f3e",
        "doc_id": "2003.07459",
        "question": "What models do they experiment on?"
    },
    {
        "question_id": "ec8f39d32084996ab825debd7113c71daac38b06",
        "doc_id": "1606.07043",
        "question": "How do they incorporate expert knowledge into their topic model?"
    },
    {
        "question_id": "a67a2d9acad1787b636ca2681330f4c29a0b0254",
        "doc_id": "1606.07043",
        "question": "On which corpora do they evaluate?"
    },
    {
        "question_id": "1efaf3bcd66d1b6bdfb124f0cec0cfeee27e6124",
        "doc_id": "1606.07043",
        "question": "Do they compare against popular topic models, such as LDA?"
    },
    {
        "question_id": "d38745a3910c380e6df97c7056a5dd9643fd365b",
        "doc_id": "1606.02601",
        "question": "Do they compare to other models that include subword information such as fastText?"
    },
    {
        "question_id": "2b75df325c98b761faf2fecf6e71ac7366eb15ea",
        "doc_id": "1606.02601",
        "question": "Is there a difference between the model's performance for morphologically impoverished and morphologically complex languages?"
    },
    {
        "question_id": "649e77ac2ecce42ab2efa821882675b5a0c993cb",
        "doc_id": "1606.02601",
        "question": "What languages do they apply the model to?"
    },
    {
        "question_id": "0bc305d6b90f77f835bc4c904b22a4be07f963b2",
        "doc_id": "1606.02601",
        "question": "How are the embeddings evaluated in the human judgement comparison?"
    },
    {
        "question_id": "a8e5e10d13b3f21dd11e8eb58e30cc25efc56e93",
        "doc_id": "1708.09025",
        "question": "How many domains do they create ontologies for?"
    },
    {
        "question_id": "949a2bc34176e47a4d895bcc3223f2a960f15a81",
        "doc_id": "1708.09025",
        "question": "Do they separately extract topic relations and topic hierarchies in their model?"
    },
    {
        "question_id": "70abb108c3170e81f8725ddc1a3f2357be5a4959",
        "doc_id": "1708.09025",
        "question": "How do they measure the usefulness of obtained ontologies compared to domain expert ones?"
    },
    {
        "question_id": "ce504a7ee2c1f068ef4dde8d435245b4e77bb0b5",
        "doc_id": "1708.09025",
        "question": "How do they obtain syntax from raw documents in hrLDA?"
    },
    {
        "question_id": "34e9e54fa79e89ecacac35f97b33ef3ca3a00f85",
        "doc_id": "1909.02855",
        "question": "What are the three SOTA models evaluated?"
    },
    {
        "question_id": "6e63db22a2a34c20ad341eb33f3422f40d0001d3",
        "doc_id": "1909.02855",
        "question": "What is the morphological constraint added?"
    },
    {
        "question_id": "f3b4e52ba962a0004064132d123fd9b78d9e12e2",
        "doc_id": "1907.12984",
        "question": "Which datasets do they evaluate on?"
    },
    {
        "question_id": "ea6edf45f094586caf4684463287254d44b00e95",
        "doc_id": "1907.12984",
        "question": "Do they compare against a system that does not use streaming text, but has the entire text at disposal?"
    },
    {
        "question_id": "ba406e07c33a9161e29c75d292c82a15503beae5",
        "doc_id": "1907.12984",
        "question": "Does larger granularity lead to better translation quality?"
    },
    {
        "question_id": "a786cceba4372f6041187c426432853eda03dca6",
        "doc_id": "1910.02001",
        "question": "What is the state-of-the-art?"
    },
    {
        "question_id": "a837dcbd339e27a974e28944178c790a5b0b37c0",
        "doc_id": "1910.02001",
        "question": "How large is the dataset?"
    },
    {
        "question_id": "c135e1f8ecaf7965f6a6d3e30b537eb37ad74230",
        "doc_id": "1910.02001",
        "question": "How are labels for trolls obtained?"
    },
    {
        "question_id": "16a10c1681dc5a399b6d34b4eed7bb1fef816dd0",
        "doc_id": "1910.02001",
        "question": "Do they only look at tweets?"
    },
    {
        "question_id": "73d87f6ead32653a518fbe8cdebd81b4a3ffcac0",
        "doc_id": "1704.03279",
        "question": "What were the performance results of their network?"
    },
    {
        "question_id": "fda47c68fd5f7b44bd539f83ded5882b96c36dd7",
        "doc_id": "1704.03279",
        "question": "What were the baselines?"
    },
    {
        "question_id": "643645e02ffe8fde45918615ec92013a035d1b92",
        "doc_id": "1704.03279",
        "question": "What dataset is used?"
    },
    {
        "question_id": "a994cc18046912a8c9328dc572f4e4310736c0e2",
        "doc_id": "1704.03279",
        "question": "Do they explore other language pairs?"
    },
    {
        "question_id": "d418bf6595b1b51a114f28ac8a6909c278838aeb",
        "doc_id": "1908.10149",
        "question": "What QA system was used in this work?"
    },
    {
        "question_id": "6d6b0628d8a942c57d7af1447a563021be79bc64",
        "doc_id": "1908.10149",
        "question": "Is the re-ranking approach described in this paper a transductive learning technique?"
    },
    {
        "question_id": "b21245212244ad7adf7d321420f2239a0f0fe56b",
        "doc_id": "1908.10149",
        "question": "How big is the test set used for evaluating the proposed re-ranking approach?"
    },
    {
        "question_id": "230ff86b7b90b87c33c53014bb1e9c582dfc107f",
        "doc_id": "1805.11937",
        "question": "What morphological typologies are considered?"
    },
    {
        "question_id": "dc23006d67f20f430f1483398de4a89c0be4efe2",
        "doc_id": "1805.11937",
        "question": "Does the model consider both derivational and inflectional morphology?"
    },
    {
        "question_id": "887d7f3edf37ccc6bf2e755dae418b04d2309686",
        "doc_id": "1805.11937",
        "question": "What type of morphological features are used?"
    },
    {
        "question_id": "65b39676db60f914f29f74b7c1264422ee42ad5c",
        "doc_id": "1611.00440",
        "question": "what are the other methods they compare to?"
    },
    {
        "question_id": "a2baa8e266318f23f43321c4b2b9cf467718c94a",
        "doc_id": "1611.00440",
        "question": "what preprocessing method is introduced?"
    },
    {
        "question_id": "d2d9c7177728987d9e8b0c44549bbe03c8c00ef2",
        "doc_id": "1906.05685",
        "question": "What evaluation metrics did they use?"
    },
    {
        "question_id": "6657ece018b1455035421b822ea2d7961557c645",
        "doc_id": "1906.05685",
        "question": "What NMT techniques did they explore?"
    },
    {
        "question_id": "175cddfd0bcd77b7327b62f99e57d8ea93f8d8ba",
        "doc_id": "1906.05685",
        "question": "What was their best performing model?"
    },
    {
        "question_id": "f0afc116809b70528226d37190e8e79e1e9cd11e",
        "doc_id": "1906.05685",
        "question": "What datasets did they use?"
    },
    {
        "question_id": "97ff88c31dac9a3e8041a77fa7e34ce54eef5a76",
        "doc_id": "1702.02367",
        "question": "How well does their model perform on the recommendation task?"
    },
    {
        "question_id": "272defe245d1c5c091d3bc51399181da2da5e5f0",
        "doc_id": "1702.02367",
        "question": "Which knowledge base do they use to retrieve facts?"
    },
    {
        "question_id": "860257956b83099cccf1359e5d960289d7d50265",
        "doc_id": "1702.02367",
        "question": "Which neural network architecture do they use?"
    },
    {
        "question_id": "756a8a9125e6984e0ca768b653c6c760efa3db66",
        "doc_id": "1905.00840",
        "question": "What was their accuracy score?"
    },
    {
        "question_id": "fe52b093735bb456d7e699aa9a2b806d2b498ba0",
        "doc_id": "1905.00840",
        "question": "What are the state-of-the-art systems?"
    },
    {
        "question_id": "7748c072e07d6c6db5a34be38b4a5e97ac6d7999",
        "doc_id": "1905.00840",
        "question": "What dataset did they evaluate on?"
    },
    {
        "question_id": "a17fc7b96753f85aee1d2036e2627570f4b50c30",
        "doc_id": "1910.07973",
        "question": "Do they report results only on English data?"
    },
    {
        "question_id": "c6170bb09ba2a416f8fa9b542f0ab05a64dbf2e4",
        "doc_id": "1910.07973",
        "question": "What is the BM25 baseline?"
    },
    {
        "question_id": "fe080c6393f126b55ae456b81133bfc8ecbe85c2",
        "doc_id": "1910.07973",
        "question": "Which BERT layers were combined to boost performance?"
    },
    {
        "question_id": "53a8c3cf22d6bf6477bc576a85a83d8447ee0484",
        "doc_id": "1910.07973",
        "question": "Which NLI data was used to improve the quality of the embeddings?"
    },
    {
        "question_id": "3a33512d253005ac280ee9ca4f9dfa69aa38d48f",
        "doc_id": "1910.07973",
        "question": "Which four QA datasets are examined?"
    },
    {
        "question_id": "f7f2968feb28c2907266c892f051ae9f7d6286e6",
        "doc_id": "1910.07973",
        "question": "Which two tasks from SentEval are the sentence embeddings evaluated against?"
    },
    {
        "question_id": "4beb50ba020f624446ff1ef5bf4adca5ed318b98",
        "doc_id": "1909.08357",
        "question": "What languages are evaluated?"
    },
    {
        "question_id": "9bf60073fbb69fbf860196513fc6fd2f466535f6",
        "doc_id": "1909.08357",
        "question": "Does the training of ESuLMo take longer compared to ELMo?"
    },
    {
        "question_id": "7d503b3d4d415cf3e91ab08bd5a1a2474dd1047b",
        "doc_id": "1909.08357",
        "question": "How long is the vocabulary of subwords?"
    },
    {
        "question_id": "95af7aaea3ce9dab4cf64e2229ce9b98381dd050",
        "doc_id": "1612.03762",
        "question": "Did they test the idea that the system reduces the time needed to encode ADR reports on real pharmacologists? "
    },
    {
        "question_id": "ab37ae82e38f64d3fa95782f2c791488f26cd43f",
        "doc_id": "1612.03762",
        "question": "Do the authors offer a hypothesis as to why the system performs better on short descriptions than longer ones?"
    },
    {
        "question_id": "6c9b3b2f2e5aac1de1cbd916dc295515301ee2a2",
        "doc_id": "1612.03762",
        "question": "What are the steps in the MagiCoder algorithm?"
    },
    {
        "question_id": "71413505d7d6579e2a453a1f09f4efd20197ab4b",
        "doc_id": "1612.03762",
        "question": "How is the system constructed to be linear in the size of the narrative input and the terminology?"
    },
    {
        "question_id": "a222dc5d804a7b453a0f7fbc1d6c1b165a3ccdd6",
        "doc_id": "1911.01248",
        "question": "What is the Semantic Web?"
    },
    {
        "question_id": "179bc57b7b5231ea6ad3e93993a6935dda679fa2",
        "doc_id": "1704.04451",
        "question": "Do they compare against Reinforment-Learning approaches?"
    },
    {
        "question_id": "a59e86a15405c8a11890db072b99fda3173e5ab2",
        "doc_id": "1704.04451",
        "question": "How long is the training dataset?"
    },
    {
        "question_id": "9489b0ecb643c1fc95c001c65d4e9771315989aa",
        "doc_id": "1704.04451",
        "question": "What dataset do they use?"
    },
    {
        "question_id": "397a1e851aab41c455c2b284f5e4947500d797f0",
        "doc_id": "1911.10742",
        "question": "How big is the ANTISCAM dataset? "
    },
    {
        "question_id": "cc8b4ed3985f9bfbe1b5d7761b31d9bd6a965444",
        "doc_id": "1911.10742",
        "question": "How is intent annotated?"
    },
    {
        "question_id": "f7662b11e87c1e051e13799413f3db459ac3e19c",
        "doc_id": "1911.10742",
        "question": "What are the baselines outperformed by this work?"
    },
    {
        "question_id": "b584739622d0c53830e60430b13fd3ae6ff43669",
        "doc_id": "1911.10742",
        "question": "What are the evaluation metrics and criteria used to evaluate the model performance?"
    },
    {
        "question_id": "84765903b8c7234ca2919d0a40e3c6a5bcedf45d",
        "doc_id": "1911.04873",
        "question": "Do any of the models use attention?"
    },
    {
        "question_id": "38363a7ed250bc729508c4c1dc975696a65c53cb",
        "doc_id": "1911.04873",
        "question": "What translation models are explored?"
    },
    {
        "question_id": "e862ebfdb1b3425af65fec81c8984edca6f89a76",
        "doc_id": "1911.04873",
        "question": "What is symbolic rewriting?"
    },
    {
        "question_id": "c0120d339fcdb3833884622e532e7513d1b2c7dd",
        "doc_id": "1912.00342",
        "question": "Is some baseline method trained on new dataset?"
    },
    {
        "question_id": "f52c9744a371104eb2677c181a7004f7a77d9dd3",
        "doc_id": "1912.00342",
        "question": "What potential applications are demonstrated?"
    },
    {
        "question_id": "867b1bb1e6a38de525be7757d49928a132d0dbd8",
        "doc_id": "1912.00342",
        "question": "What method is proposed to mitigate class imbalance in final dataset?"
    },
    {
        "question_id": "9686f3ff011bc6e3913c329c6a5671932c27e63e",
        "doc_id": "1906.08584",
        "question": "What architecture is used in the encoder?"
    },
    {
        "question_id": "f8edc911f9e16559506f3f4a6bda74cde5301a9a",
        "doc_id": "1802.06053",
        "question": "By how much they outperform the baseline?"
    },
    {
        "question_id": "8c288120139615532838f21094bba62a77f92617",
        "doc_id": "1802.06053",
        "question": "How long are the datasets?"
    },
    {
        "question_id": "a464052fd11af1d2d99e407c11791269533d43d1",
        "doc_id": "1802.06053",
        "question": "What bayesian model is trained?"
    },
    {
        "question_id": "5f6c1513cbda9ae711bc38df08fe72e3d3028af2",
        "doc_id": "1802.06053",
        "question": "What low resource languages are considered?"
    },
    {
        "question_id": "e25b73f700e8c958b64951f14a71bc60d225125c",
        "doc_id": "2004.02105",
        "question": "How much improvement is there in the BLEU score?"
    },
    {
        "question_id": "908ba58d26d15c14600623498d4e86c9b73b14b2",
        "doc_id": "2004.02105",
        "question": "What is the established approach used for comparison?"
    },
    {
        "question_id": "3e0fd1a3944e207edbbe7c7108239dbaf3bccd4f",
        "doc_id": "2004.02105",
        "question": "What are the five domains?"
    },
    {
        "question_id": "c0847af3958d791beaa14c4040ada2d364251c4d",
        "doc_id": "2004.02105",
        "question": "Which pre-trained language models are used?"
    },
    {
        "question_id": "35c01dc0b50b73ee5ca7491d7d373f6e853933d2",
        "doc_id": "1912.06203",
        "question": "Which dataset do they use for text altering attributes matching to image parts?"
    },
    {
        "question_id": "c077519ea42c9649fb78da34485de2262a0df779",
        "doc_id": "1912.06203",
        "question": "Is it possible for the DCM module to correct text-relevant content?"
    }
]