{
  "metrics": {
    "precision": "63.3",
    "recall": "61.1",
    "macro_f1": "54.1",
    "accuracy": "54.6"
  },
  "binary_mode": true,
  "prediction_distribution": {
    "Refuted": 329,
    "Supported": 895
  },
  "detailed_results": [
    {
      "index": 0,
      "claim": "The models using BoC outperform models using BoW as well as ASM features.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1,
      "claim": "[CONTINUE] OD significantly outperforms OD-parse: We observe that compared to OD-parse, OD is much more accurate.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 2,
      "claim": "Table 4: Comparison of per-document accuracy (% ) by different systems for top 1, 3 and 5 words of abstractive sentences.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 3,
      "claim": "The UnsupEmb baseline performs rather poorly on both POS and SEM tagging.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 4,
      "claim": "In particular, we see that hate speech and harassment are particularly difficult to detect.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 5,
      "claim": "The results prove the effectiveness of word-level attention to exploit the local interactions in link prediction task.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 6,
      "claim": "The average number of tokens per tweet is not 22.3, per sentence is not 13.6 and average scope length is not 2.9.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 7,
      "claim": "[CONTINUE] When comparing between M2 and M3, between M4 and M5, and between M6 and M7, we find that the addition of the language modeling loss reduces PP, sometimes at a slight cost of semantic preser...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 8,
      "claim": "2018) or reinforcement learning with additional dataset-specific heuristics (Kryscinski et\\xa0al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 9,
      "claim": "Table 1: In all language pairs, the best correlation is achieved by our word mover metrics that use a BERT pretrained on MNLI as the embedding generator and PMeans to aggregate the embeddings from dif...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 10,
      "claim": "We hypothesize that the gating mechanism can better capture longdistance dependencies between nodes far apart in the graph.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 11,
      "claim": "the model converged and yielded high performance, verifying the efficacy of the implicit answer vector representation for matching word meanings",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 12,
      "claim": "[CONTINUE] Pretraining the HAN models, although intuitively promising, yields only comparable results with those without.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 13,
      "claim": "While Glorot achieves slightly better results on BShift and TopConst, CMOW's ability to memorize word content is not improved by our initialization strategy.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 14,
      "claim": "In Table 5, it can be seen that generative pretraining via language modeling does not account for a considerable amount of performance, constituting 44.32% of the overall performance (a boost of 42.67...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 9 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 15,
      "claim": "Note that using discriminative training, even with no additional monolingual data, leads to better performance than that of the best language model: the CS-ONLY-DISCRIMINATIVE model achieves an accura...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 16,
      "claim": "[CONTINUE] The effectiveness of our hierarchical attention design is proved by an accuracy drop of 1.95% after removing residual connections and the hierarchical stack of our attention modules.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 17,
      "claim": "This observation concurs with the performance boost for this model across the two datasets and shows that using a more advanced architecture with more parameters results in larger improvements using t...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 18,
      "claim": "for example, for those rewards, models learn the ROUGE reward much better than the full extent of system-level ROUGE correlation as shown in Table\u00a01, which will also increase system-level ROUGE.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 19,
      "claim": "Mentions of time are not specific of complaints (been, still, on, days, Temporal References cluster).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 20,
      "claim": "This is especially true in the case of DAN where we see a large increase as the decoder repeatedly predicts words having high sentiment value.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 21,
      "claim": ", then randomly selected 5 examples were manually examined to understand the sources of errors, which was helpful in identifying issues in cue detection.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 22,
      "claim": "the utterance in the first premise \u201cThe woman went down into the cellar\u201d leads BERT-large to produce \u201cThe woman entered the cellar\u201d and to choose the distractor rather than the correct premise.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 23,
      "claim": "Our models DCGCN(single) and DCGCN(ensemble)consist of full GCN layers, removing the burden of employing a recurrent encoder to extract non-local contextual information in the bottom layers.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 24,
      "claim": "AME performs better than FME model on both symmetric and asymmetric modes, which shows the advantage of finetuning word embeddings during training.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 25,
      "claim": "On the contrary, we found the quality of 3-step NLDs is relatively higher than the others.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 26,
      "claim": "Our joint model does not outperform all the base [CONTINUE] The results do not reconfirm that the lemma baseline, when combined with effective topic clustering, is a strong baseline for CD event coref...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 27,
      "claim": "BI+IS with EWC-adapted models gives a 0.9 / 3.4 BLEU loss over the strong uniform EWC ensemble, and a 2.4 / 10.2 overall BLEU loss over the approach described in Freitag and Al-Onaizan (2016).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 28,
      "claim": "More importantly, their G-Pre and G-Rec scores are all above .50, which means that more than half of the good summaries identified by the metrics are actually good, and more than 50%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 29,
      "claim": "One interpretation for this difference is that under the simulated conversations with random reward function, GP-MBCM does not align well with the different human users.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 30,
      "claim": "Again, when ROUGE is used as rewards, the generated summaries have higher ROUGE scores.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 31,
      "claim": "This suggests that graph encoders based on gating mechanisms are very effective in text generation models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 32,
      "claim": "the models more often fail to realise part of the MR, rather than hallucinating additional information.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 33,
      "claim": "These results show significant performance improvement by using Predicate Schemas knowledge on hard coreference problems.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 34,
      "claim": "This is expected, since the questions in the SQuAD and QA-SRL datasets tend to be very different (more declarative in the former, more interrogative in the latter).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 35,
      "claim": "The AAS method with weights wAC=1 and wAD=105 shows the lowest WER and DCE.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 36,
      "claim": "The most representative models are ELMO, GPT, BERT and its variants, and XLNET.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 37,
      "claim": "In general, increasing the number of GCN layers from 2 to 9 boosts the model performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 38,
      "claim": "The results of CLUSTER+KCP indicate that pre-clustering of documents to topics is not beneficial, performing substantially worse than our joint model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 39,
      "claim": "[CONTINUE] We found that innovations are helpful in both early and late fusion frameworks, while late fusion performs better on average.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 40,
      "claim": "[CONTINUE] As these models use object detectors pretrained on Pascal-VOC , they have somewhat higher performance on classes that are common to both Flickr30k and Pascal-VOC (\"animals\", \"people\" and \"v...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 41,
      "claim": "Coverage helps the model improve its EM by 1.5 and its F1 by 0.5.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 42,
      "claim": "models with NSP performance drop a lot when trained with COPA.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 43,
      "claim": "The hybrid model is able to repair this deficit, reducing the difference to 8%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 44,
      "claim": "our framework captures more information about the intended semantic feature.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 45,
      "claim": "We can see that the dual attention model does not work at all and the scores slightly drop.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 46,
      "claim": "in terms of correctness, the averaged Ok rate on all 15 decisions is 44.3%",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 47,
      "claim": "The results in Table 7 show that the proposed method is not as effective as the state of the art BiLSTM model from (Fancellu et al., 2016) on gold negation cues for scope prediction.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 48,
      "claim": "SciBERT does not significantly boost performance for scientific datasets including SciERC and GENIA.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 49,
      "claim": "Our single model is not comparable to the ensemble results of Seq2SeqB and GGNN2Seq, while the number of parameters of our models is only about 1/6 of theirs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 50,
      "claim": "The results for testing on cleaned data (Table 3, top half) confirm the positive impact of cleaned training data and also show that the cleaned test data is more challenging (cf.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 51,
      "claim": "If we check the relative ranks of the good summaries according to the metrics (row 1), for example for ROUGE-SU4, we see that 98.4% of them belong to the top 25% summaries in the metric.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 52,
      "claim": "we observe that MQAN (RAE-based) suffers most without coverage: in all out-of-domain settings it underperforms the original.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 53,
      "claim": "( 2019).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 54,
      "claim": "We notice no significant improvements relative to the baseline showing that self-attention alone does not improve the VQA task.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 55,
      "claim": "Our DKRN agent outperforms all other agents with a large margin.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 56,
      "claim": "When we add multi-factor attention to the baseline BiLSTM-CNN model without the dependency distance-based weight factor in the attention mechanism, we get 0.8% F1 score improvement (A2\u2212A1).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 57,
      "claim": "These results show no significant performance improvement by using Predicate Schemas knowledge on hard coreference problems.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 58,
      "claim": "Also, the performance drop between Cat1/Cat2 and full data indicates that there is a need to design more complicated knowledge schemas and to refine the knowledge acquisition for further performance i...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 59,
      "claim": "In contrast, models in the lower portion (7-12) involve dialogue states, which is estimated using Belief Tracker, or in other words, by models in the upper portion.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 60,
      "claim": "Audio2vec works better than chance and mean MFCC on paraphrase retrieval, but does not correlate with the visual space.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 61,
      "claim": "Also, we notice a drop in performance between PG-original, and PG-MMR (which takes the pre-trained PG-original and applies MMR on top of the model).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 62,
      "claim": "their informative and match scores are higher than ours since they prioritize the dialog turn to show referents, while we take into account various factors in dialog quality.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 63,
      "claim": "As a result, the recursive approach performs better than the folding technique for the training task.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 64,
      "claim": "The evaluation results shown in Table 2 indicate that the annotated NLDs are of high quality (Reachability), and each NLD is properly derived from supporting documents (Derivability).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 65,
      "claim": "TF and DF achieved almost the same values of precision, recall and f-measure using the English corpora, achieving the same value of precision (P=0.0150) and f-measure (F=0.0293) when using the Europar...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 66,
      "claim": "Table 5 summarizes the above experimental results on the affected domain in terms of the number of dialog turns, and the numbers of inform, match, and success actions.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 67,
      "claim": "Our model obtains the best performance on three out of the four datasets.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 68,
      "claim": "The first block in Table 6 shows the performance of our two baseline models: multi-layer GCNs with residual connections (GCN+RC) and multi-layer GCNs with both residual connections and layer aggregati...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 69,
      "claim": "The results in Table 4 confirm the findings of the automatic [CONTINUE] metrics: systems trained on the fully cleaned set or the set with cleaned missing slots have nearperfect performance, with the f...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 70,
      "claim": "We see clear benefits of the coverage mechanism in the out-of-domain setting, especially in the low-resource case of QA-SRL.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 71,
      "claim": "It does not improve by over 20% over a state-of-art general coreference system on Winograd and also does not outperform Rahman and Ng (2012) by a margin of 3.3%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 72,
      "claim": "among opinions: We see that OD significantly outperforms the baseline methods and the OD-parse variant [CONTINUE] OD achieves high ARI and Sil scores, [CONTINUE] From the above table, we observe that ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 73,
      "claim": "In particular, our single DCGCN model does not consistently outperform Seq2Seq models when trained without external resources.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 74,
      "claim": "Our joint model improves upon the strong lemma baseline by 3.8 points in CoNLL F1 score.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 75,
      "claim": "Furthermore, we do not see over-fitting in either of the models, even if they are trained on all the data in B-COPA.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 76,
      "claim": "The difference between accuracy on Easy and Hard is less pronounced for RoBERTa, but still suggests some reliance on superficial cues.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 77,
      "claim": "However, in the all questions set which includes a large percentage of questions without concept words (containing antonym words), the proposed model underperforms GloVe",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 78,
      "claim": "The results in Table 2 (top half) for the original setup confirm that the ranking mechanism for TGen is not effective for both WOMs and SER, whereas the SC-LSTM seems to have difficulty scaling to the...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 79,
      "claim": "over the different entity types, our joint model performs best in within-document coreference.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 80,
      "claim": "In general terms, the results displayed in table 1 show that the rejection method cannot reduce the error of the output predictions when applying a pre-trained black-box classification system to a new...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 81,
      "claim": "Under oracle setup, all models are notably improved due to the higher quality of reranked passages, but our model does not achieve statistically significantly better BLEU scores.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 82,
      "claim": "Our KnowComb system does not achieve the same level of performance as the state-of-art general coreference system we base it on.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 83,
      "claim": "the substantial drop in accuracy can be attributed to the different train-test split.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 84,
      "claim": "the performance of Our Model is better than Rank+ExATT at most recall ratios, which indicates the importance of our match function with fine-grained entity identification.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 85,
      "claim": "[CONTINUE] For LOC, it turns out that candidate selection is a bottleneck: when candidate selection was flawless, the models made only about 12% errors, down from about 57%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 86,
      "claim": "[CONTINUE] Sentiment polarity shifters have a high impact on clustering performance of opinion distance: We find that not utilizing the sentiment polarity shifters, especially in case of datasets \"Vid...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 87,
      "claim": "[CONTINUE] OD does not significantly outperform OD-parse: We observe that compared to OD-parse, OD is not significantly more accurate.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 88,
      "claim": "To validate Acc, human annotators were asked to judge the style of 150 transferred sentences. We then compute the percentage of machine and human judgments that match.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 89,
      "claim": "[CONTINUE] Moreover, for TVMAX, automatic metrics results are slightly worse than sparsemax and significantly worse than softmax on MSCOCO and similar on Flickr30k.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 90,
      "claim": "On the other hand, our BiLSTM model using contextualized word representation and PCS only obtained 0.72 F1 score.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 91,
      "claim": "word analogies are especially useful for creating and evaluating continuous vector representations, since the solution of many analogy questions requires vector addition.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 92,
      "claim": "Surprisingly, we observe a decrease of BLEU-2, BLEU-4, ROUGE-2, and METEOR when removing passages from our model input.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 93,
      "claim": "The results also show that it is better to compile knowledge into constraints when the knowledge quality is high than add them as features.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 94,
      "claim": "B-COPA is sufficient for training performance models (e.g., BERT-large), as non-fine-tuned models achieve 66.4% on B-COPA, showing that even structural information captured by BERT is not required for...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 95,
      "claim": "Consequently, with an 8% improvement on average, the hybrid model [CONTINUE] Word Content are increased.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 96,
      "claim": "In [14], they compare the word vectors generated by word2vec to GloVe and word2sense.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 97,
      "claim": "Model wiki.el, trained only on Wikipedia, was the best in the category semantic with no oov words and the overall category with oov words.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 98,
      "claim": "[CONTINUE] Results with BERT show that contextual information is valuable for performance improvement.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 99,
      "claim": "We see different results for Waseem and Hovy (2016) and Waseem (2016).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 100,
      "claim": "It improves by over 20% over a state-of-art general coreference system on Winograd and also outperforms Rahman and Ng (2012) by a margin of 3.3%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 101,
      "claim": "[CONTINUE] The effectiveness of our hierarchical attention design is disproved by an accuracy drop of only 1.95% after removing residual connections and the hierarchical stack of our attention modules...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 102,
      "claim": "Comparing POS and SEM tagging (Table 5), we note that higher layer representations do not necessarily improve SEM tagging, while POS tagging does not peak at layer 1. We noticed no improvements in bot...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 103,
      "claim": "Dual2seq-LinAMR shows much worse performance than our Dual2seq model and significantly outperforms the Seq2seq baseline.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 104,
      "claim": "the final scores (lines 3 and 6 of the table) are the actual numbers reported in the paper (Table 2, right-most column).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 105,
      "claim": "the results of these experiments were statistically significant (t-test, p < .001).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 106,
      "claim": "[CONTINUE] Analyzing Table 3, we can observe that all values of precision using the Portuguese corpora have higher scores when compared with the English corpora.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 107,
      "claim": "shows that humans who participate in the experiment cannot differentiate between the two options in a third of Balanced COPA questions, and hence Balance COPA questions significantly favor one answer ...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 108,
      "claim": "For window-based w2 contexts POS disambiguation yields significantly better F scores on lemmatized targets for VN (p \u2264 .005) with no significant difference for WN-N and WN-V (p \u2248 .05).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 109,
      "claim": "In most setups our average case is better than the former best case.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 110,
      "claim": "Furthermore, the PPO agent performs badly as it fails to ask enough questions to establish proper constraints.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 111,
      "claim": "[CONTINUE] After applying our data augmentation, both the action and slot diversity are improved consistently, [CONTINUE] HDSA has the worse performance and benefits less from data augmentation compar...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 112,
      "claim": "however, the sdp information has a clear positive impact on all the relation types.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 113,
      "claim": "Supervising path attentions (the PRKGC+NS model) is not effective for improving the human interpretability of generated NLDs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 114,
      "claim": "As expected, the average ranking of samegender pairs is significantly higher than that of different-gender pairs, both for German and Italian, while the difference between the sets in English is much ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 115,
      "claim": "On the other hand, choosing the best hypernym worked very well for DocSub which obtained the best precision for the Portuguese corpora.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 116,
      "claim": "The improvement is not significant enough to warrant further research into visual modulation.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 117,
      "claim": "[CONTINUE] We showed that it is possible to improve the feature extraction procedure for the VQA task by adding self-attention modules in the different ResNet blocks.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 118,
      "claim": "In fact, DocSub had worse results in precision only when using Europarl corpus in English, where DF reached best values of precision and f-measure.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 119,
      "claim": "[CONTINUE] Though ALDM obtains a lower inform F1 and match rate than PPO, it gets a slight improvement [CONTINUE] on task success [CONTINUE] Ablation test is investigated in Table 3.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 120,
      "claim": "Moreover, all agents tend to perform better on booking flights, but worse on booking hotels.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 121,
      "claim": "we present BLEU and TER for the REV systems in Table 5, [CONTINUE] While RNN models are the best ones according to the evaluation metrics,",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 122,
      "claim": "[CONTINUE] Wikipedia-PubMed-PMC embeddings (Moen and Ananiadou, 2013) outperforms GloVe (Mikolov et al., 2013a) in the extraction of most relation types (Table 1) [CONTINUE] the combination feature of...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 123,
      "claim": "This indicates that the number of top sessions and the diversity of human responses may suffer from the hand-crafted reward.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 124,
      "claim": "[CONTINUE] Finally, image resizing gives another 4% increase.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 125,
      "claim": "an evaluation of the best joint model on the test dataset with the new evaluation scripts (Teresi et al., 2019) gives 71.2 F1, which is slightly higher than the value reported by the organizers of the...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 126,
      "claim": "Most denying instances get misclassified as querying (see Table 5),",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 127,
      "claim": "the classifier succeeded in effectively reducing the number of false cues, in spite of their unpredictable nature.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 128,
      "claim": "[CONTINUE] When removing sweat smile and confused accuracy decreased.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 129,
      "claim": "We can see that the two policy gradient approaches outperform RL using the discriminative model and the value based RL on the majority of the metrics.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 130,
      "claim": "The resulting cross-dataset improvements on the SNLI and Glockner datasets are larger than those on the SICK dataset.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 131,
      "claim": "This explains why our proposed method achieves the best average reward, and confirms the fact that our proposed policy learns to control the number of turns better than other baselines",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 132,
      "claim": "[CONTINUE] Relation propagation (RelProp) improves relation extraction performance over both pretrained and fine-tuned BERT.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 133,
      "claim": "In addition, our metric also has the highest Pearson correlation with humans.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 134,
      "claim": "Our summaries are notably longer than in other works, about 260 words on average.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 135,
      "claim": "The difference is most prevalent in KP20k, the largest of the four datasets, where our GAN model (at 0.85) is nearly 5% better than both the other baseline models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 136,
      "claim": "The ND classifier had a significant positive effect on F1 for the 'In E+' setting.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 137,
      "claim": "We performed an ablation study on a single model having obtained 69.23% accuracy on the validation set.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 138,
      "claim": "Overall, none of the implementations can improve the performances of base models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 139,
      "claim": "As shown in Table 6, the performance of LRN matches that of ATR and SRU, though LSTM and GRU operate better (+1.05 and +0.79).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 140,
      "claim": "Our models DCGCN(single) and DCGCN(ensemble) do not remove the burden of employing a recurrent encoder to extract non-local contextual information in the bottom layers, as evidenced by the results of ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 141,
      "claim": "Consequently, with an 8% decrease on average, the hybrid model [CONTINUE] Word Content are decreased.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 142,
      "claim": "they report one big advantage of our method, which is increasing performance when the correct answer is missing from the training corpus.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 143,
      "claim": "On 7 out of 11 supervised tasks, the joint model even improves upon the better model, and on SST2, SST5, and MRPC the difference is more than 1 point.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 144,
      "claim": "our model outperforms all the variants significantly under any recall and AUC.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 145,
      "claim": "This suggests that enriching input graphs with the global node and excluding the linear combination can facilitate GCNs to learn better information aggregations, producing more expressive graph repres...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 146,
      "claim": "As the table 4 depicts, the training time increases with the growth of d.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 147,
      "claim": "For all batch sizes, the training throughput on the balanced dataset is the highest, while the throughput on the linear dataset is the lowest.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 148,
      "claim": "It might be that model generalization is improved when the model is initialized with weights that have been fine-tuned to a challenging dataset, even if this dataset comes from a different domain.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 149,
      "claim": "Consequently, with an 8% decrease, CMOW is substantially less linguistically informed than CBOW.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 150,
      "claim": "Therefore, we have strong evidence that our learned reward can be evaluated and optimized over.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 151,
      "claim": "[CONTINUE] Dual2seq is signifi [CONTINUE] cantly better than Seq2seq in both settings, [CONTINUE] In particular, the improvement is much larger under the small-scale setting (+3.2 BLEU) than that unde...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 152,
      "claim": "Results also show the linear combination is more effective than the global node.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 153,
      "claim": "2018).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 154,
      "claim": "[CONTINUE] Analyzing Table 5 we observe that Patt achieves again the best precision values for the English corpora.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 155,
      "claim": "the models more often hallucinate additional information, rather than failing to realise part of the MR.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 156,
      "claim": "This improvement is mainly due to the fact that this model becomes better at predicting entity span boundaries.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 157,
      "claim": "The performance increase between Cat1/Cat2 and full data indicates that the existing knowledge schemas and knowledge acquisition are sufficient for further performance improvement.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 158,
      "claim": "[CONTINUE] We see similar results for Waseem and Hovy (2016) and Waseem (2016).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 159,
      "claim": "our model imparted 62% more relevant information about the words of the English language than GloVe embeddings.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 160,
      "claim": "The largest loss is by 4% on the CoordInv task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 161,
      "claim": "we achieve an increased accuracy of our cue detection classifier in a transductive setting",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 162,
      "claim": "The output of SPINE is not very reliable in the sense that there is no upper bound for distances between vectors and vectors can take any values in R+ for each dimension.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 163,
      "claim": "using different dimensions may affect the accuracy of predictions.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 164,
      "claim": "The smaller performance gap between Easy and Hard subsets indicates that training on BCOPA encourages BERT and RoBERTa to rely less on superficial cues.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 165,
      "claim": "We see that SPINE performs much better on the polarized set than the mixed set, but our model with projected vectors performs better overall, even on the polarized set",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 166,
      "claim": "results demonstrate the efficacy of the proposed two-phase learning scheme.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 167,
      "claim": "When comparing DF model which takes into account only the number of documents that the word occurs, with DocSub which considers the number of shared documents between two words, DF achieved better val...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 168,
      "claim": "This is another evidence of the effectiveness of the multiple-hop distillation with jointly learning agent.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 169,
      "claim": "Our word embedding model performs similar to existing word embedding based algorithms, although there are many hyperparameters, such as N, h, where the number of features selected in the feature set s...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 170,
      "claim": "when we reach 100 episodes or more, our greedy agent matches the performance of the extractive-RL model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 171,
      "claim": "according to the Figure 3, we can see that the policy layer of GPDL is updating faster than other layers.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 172,
      "claim": "from the empirical results, the number of turns taken by the RL policy is very close to that of the human conversations.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 173,
      "claim": "Replacing the cue words in a sentence by the alternatives where they belong to leads to contradictory judgment in 37.5% of all sentences.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 174,
      "claim": "However, the main improvement of SER comes from training on cleaned data with up to 97% error reduction with the ranker and 94% without.11 just cleaning the training data has a much more dramatic effe...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 175,
      "claim": "FME performs better than AME model on both symmetric and asymmetric modes, which shows the advantage of finetuning word embeddings during training.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 176,
      "claim": ", as compared to the original dataset, the balanced dataset requires around two times as many questions to be answered, but has lower inter-annotator agreement and is thus slightly more difficult.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 177,
      "claim": "After integrating Elmo for contextual modeling, the performance of LRN does not reach the best (76.1 EM and 83.83 F1), with GRU and LSTM outperforming it (+0.33EM, +0.71F1).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 178,
      "claim": "RSI  = 89.20 doesn\u2019t meet the requirement, but we measure the distance as 22.00 in the intrusion test, while we have 8 numbers between 119.99 and 120.00",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 9 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 179,
      "claim": "Its productivity of 57.5% expresses that it appears in in correct alternatives 7.5% more often than expected by random chance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 180,
      "claim": "While CMOW-R and CMOW-C perform comparably on most probing tasks, CMOW-C yields 5 points lower scores on WordContent [CONTINUE] and BigramShift.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 181,
      "claim": "The results illustrate the viability of urgency detection in low-supervision settings (with our approach yielding 69.44% F-Measure on Nepal, at 99% significance compared to the Local baseline), with d...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 182,
      "claim": "RoBERTa-large-FT was fine-tuned with a much higher learning rate (1e-5) to prevent an under-optimized model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 183,
      "claim": "Table 1 shows that our proposed token level embedding scheme OntoLSTM-PP outperforms the better variant of our baseline LSTM-PP (with GloVe-retro intialization) by an absolute accuracy difference of 4...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 184,
      "claim": "BoW+GCN, CNN+GCN and BiRNN+GCN refer to employing the following encoders with a GCN layer on top respectively: 1) a bag-of-words encoder, 2) a one-layer CNN, 3) a bidirectional RNN.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 185,
      "claim": "As occurred in the experiment using the top 1,000 words, this experiment also kept TF with the highest values of f-measure for most methods.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 186,
      "claim": "Pretrained Word2Sense embeddings outperform our method, however it has the advantage of training on a larger corpus.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 187,
      "claim": "Crucially, this performance difference holds even on the hard instances, which have been described as better tests of commonsense (Landauer et al., 1998).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 188,
      "claim": "Model wiki.el, trained only on Wikipedia, was the worst almost in every category (and sub-category).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 189,
      "claim": "Table 8 shows the results for the experimental configuration using all available heuristics.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 190,
      "claim": "MLP with BERT as en(2018) coder has the best overall performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 191,
      "claim": "When we increase the DCGCN blocks from 1 to 4, the model performance continues increasing on AMR15 development set.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 192,
      "claim": "Similarly, when DCGCN3 and DCGCN4 contain 18.6M and 18.4M parameters.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 193,
      "claim": "OntoLSTM-PP also outperforms HPCD (full), the previous best result on this dataset.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 194,
      "claim": "For example, GCN+RC+LA (10) achieves a BLEU score of 21.2, which is worse than GCN+RC+LA (9).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 195,
      "claim": "The domain prediction module (DPM) used in our GDPL and GDPL-discr is also trained and tested using their public codes in the end-to-end ALDM.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 196,
      "claim": "However, the overall results in the English language show that, compared to the current state-of-the-art word embeddings models, a subspace was yet to be found that we could improve upon without jeopa...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 197,
      "claim": "[CONTINUE] After applying our data augmentation, both the action and slot diversity are improved consistently, [CONTINUE] HDSA has the better performance and benefits more from data augmentation compa...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 198,
      "claim": "Table II shows that Nepal is roughly balanced, while Kerala is imbalanced.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 199,
      "claim": "We report the two best performance for slot filling, for which we trained one system without ontology and another without ontology and coarse-grained slot types (Acc.)",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 200,
      "claim": "On the WinoCoref dataset, KnowComb does not improve by 15%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 201,
      "claim": "On the other hand, compared to the BiLSTM baseline, PCS introduces significantly more in-scope (1,039) than out-of-scope (298) relations",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 202,
      "claim": "However, EWC does not outperform no-reg and L2 on News, as it only gives a 0.5 BLEU improvement over the baseline News model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 203,
      "claim": "[CONTINUE] As the results of applying the co-occurrence baseline (\u03c1 = 0) shows (Table 2), the semantic relations in this data are not strongly concentrated within a sentence boundary, as evidenced by ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 204,
      "claim": "Compared to the original metapath2vec model with default d, by leveraging the right d, we improve performance at a better cost-efficiency.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 205,
      "claim": "Our joint model does not improve upon the strong lemma baseline by 3.8 points in CoNLL F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 206,
      "claim": "Our agent does not outperform the comparison agents with a large margin.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 207,
      "claim": "As the best comparison model, we investigate ablation models by removing parts from our model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 208,
      "claim": "However, our proposed method has comparable performance with the original GloVe embeddings.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 209,
      "claim": "In general, our principle P2 can improve all the models in any ablative condition (i.e., P1, P2, P1+P2), while P1 does not always lead to an improvement.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 210,
      "claim": "By considering only adjectives, we obtain a measure of the positive and negative score for each sentence before and after fine-tuning.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 211,
      "claim": "[CONTINUE] TRANSFORMER-MULTI is stronger than TRANSFORMER-SINGLE [CONTINUE] .2% overall improvement over TRANSFORMER-SINGLE for the goldtwo-mention task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 212,
      "claim": "In other words, [CONTINUE] However, the results in bottom halves [CONTINUE] of Tables 2 and 3 do not support our hypothesis: we observe no significant effect on SER from cleaning the missed slots.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 213,
      "claim": "Systems A-C are trained without the target type from which they report.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 214,
      "claim": "In terms of relative numbers, the hybrid model improves upon CBOW in all probing tasks but WC and SOMO.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 215,
      "claim": "StateNet PSI does not outperform StateNet, and StateNet PS performs best among all 3 models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 216,
      "claim": "In this task, ATR and SRU outperform LRN in terms of both EM and F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 217,
      "claim": "We then compare BERT and RoBERTa with previous models on the Easy and Hard subsets. As Table 4 shows, previous models perform significantly better on the Easy subset than on the Hard subset, with the ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 218,
      "claim": "[CONTINUE] Negations are uncovered through unigrams (not, no, won't) [CONTINUE] Several unigrams (error, issue, working, fix) [CONTINUE] Words regularly describing negative sentiment or emotions (such...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 219,
      "claim": "[CONTINUE] Turning to SEM tagging (Table 3, second block), representations from layers 1 through 4 only marginally boost the performance to around 87-88%, [CONTINUE] which is not significantly higher ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 220,
      "claim": "Dual2seq is not consistently better than the other systems under all three metrics, [CONTINUE] as OpenNMT-tf and Transformer-tf both outperform Dual2seq in terms of BLEU and Meteor scores.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 221,
      "claim": "This is particularly noteworthy because our user simulator takes a very strict agenda (Section 4.1) compared to that of humans, which is more dynamic and changing as the conversation continues.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 222,
      "claim": "( 2018 )) and the rank correlation between NeuralTD and human summaries is higher than with supervised models.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 223,
      "claim": "WOMs are slightly lower for TGen trained on the cleaned data, except for NIST, which gives more importance to matching less frequent n-grams.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 224,
      "claim": "The summaries generated by our system receive decent ROUGE metrics, but are lower than most of the recent systems, because our learned reward is optimised towards high correlation with human judgement...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 225,
      "claim": "among opinions: We see that OD significantly outperforms the baseline methods and the OD-parse variant [CONTINUE] OD achieves high ARI and Sil scores, [CONTINUE] From the above table, we observe that ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 226,
      "claim": "The AAS method with weights wAC=1 and wAD=105 shows the lowest WER and DCE.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 227,
      "claim": "[CONTINUE] On the other hand, we found the quality of 3-step NLDs is relatively lower than the others.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 228,
      "claim": "When the experiment was repeated so that the finetuning phase included the text-only data, the performance did not return to approximately the same level as without tuning (+multi-modal finetune row i...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 229,
      "claim": "(2017), we find large disparities, with around 5% of tweets in the black-aligned corpus classified as hate speech compared to 2% of those in the white-aligned set.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 230,
      "claim": "The number of examples in our Multi-News dataset is two orders of magnitude larger than previous MDS news data.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 231,
      "claim": "Despite LRN and oLRN having faster training times than SRU (+15%/+6%), SRU still achieves a higher BLEU score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 232,
      "claim": "Additionally, when using bounding box features, sparsemax outperforms softmax, showing that selecting only the bounding boxes of the relevant objects leads to a better answering capability.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 233,
      "claim": "The UnsupEmb baseline performs comparably to the Word2Tag upper bound on both POS and SEM tagging.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 234,
      "claim": "POS-disambiguation does not fragment the vocabulary and consistently increases the coverage with the effect being more pronounced for lemmatized targets.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 235,
      "claim": "In most cases the racial disparities persist, and are generally larger in magnitude than the disparities for other classes.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 236,
      "claim": "CorefProp also improves relation extraction on SciERC.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 237,
      "claim": "LRN obtains an accuracy of 90.49 with BERT, the highest among all models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 238,
      "claim": "G2S-GGNN has 33.5% and 5.2% better entailment performances than S2S, when REF entails GEN and GEN entails REF, respectively.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 239,
      "claim": "The intuition here is that each model is optimizing different signals (lexical matching and type accuracy), which may or may not be independent.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 240,
      "claim": "Table 5 breaks down the results of the different models according to two conditions: when the gold sentence is code-switched, and when the gold sentence is monolingual.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 241,
      "claim": "According to the table, the drop of precision demonstrates that the word-level attention is quite useful.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 242,
      "claim": "While CMOW-R and CMOW-C perform comparably on most probing tasks, CMOW-C yields 5 points higher scores on WordContent and BigramShift.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 243,
      "claim": "We also observe similar trends as before: POS tagging does not benefit from features from the upper layers, while SEM tagging improves with layer 4 representations.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 244,
      "claim": "After removing the graph attention module, our model gives 24.9 BLEU points.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 245,
      "claim": "the distribution on dialog success criteria with ACER has the least bias among all.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 246,
      "claim": "We observe that for the NYT10 dataset, m = {1, 2, 3} gives good performance with m = 1 achieving the highest F1 score.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 247,
      "claim": "(2017).8 Overall both BERT (76.5%) and [CONTINUE] RoBERTa (87.7%) considerably outperform the best previous model (71.4%).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 9 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 248,
      "claim": "In particular, our single DCGCN model consistently outperforms Seq2Seq models by a significant margin when trained without external resources.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 249,
      "claim": "The larger performance gap between Easy and Hard subsets indicates that training on BCOPA encourages BERT and RoBERTa to rely more on superficial cues.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 250,
      "claim": "For German descriptions, The results are 11.05% worse on average compared to (Gella et al., 2017) in symmetric mode.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 251,
      "claim": "In general, the performance increases when we gradually enlarge n and m. For example, when n=1 and m=1, the BLEU score is 17.6; when n=6 and m=6, the BLEU score becomes 22.0.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 252,
      "claim": "However, the sdp information does not have a clear positive impact on all the relation types (Table 1).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 253,
      "claim": "Reward 3, i.e., preference between generated summaries and reference, has slightly higher correlations with system performance than Reward 1, i.e., difference in summary properties from statistical va...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 254,
      "claim": "DCGCN model is able to achieve a competitive BLEU points (33.2) by using 0.3M external data, while GraphLSTM achieves a score of 33.6 by using 2M data and Seq2SeqK achieves a score of 33.8 by using 20...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 11 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 255,
      "claim": "their recall are 0.595, 0.517, and 0.441 on three thresholds 0.1, 0.2 and 0.3 respectively, while our model achieves 0.650, 0.519, 0.422.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 18 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 256,
      "claim": "this impressive improvement comes from the large dataset and considerable time spent on hyperparameter tuning, but only better-than-human results compared to RoBERTa and BERT finetuning.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 257,
      "claim": "acoustic supervision (27.7%) and multi-task learning (26.1%) show higher WER than minimizing DCE (31.1%) and FSEGAN (29.1%).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 258,
      "claim": "Furthermore, it also yields better policy matches, except for PPO, suggesting that GDPL is more compatible with the real users.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 259,
      "claim": "This is particularly true for the BIDAF model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 260,
      "claim": "therefore, the role of attention in link prediction can be explained.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 261,
      "claim": "In addition, our single DCGCN model obtains better results than previous ensemble models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 262,
      "claim": "According to Pearson correlation, gr def model had the highest correlation with human ratings of similarity.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 263,
      "claim": "SciBERT significantly boosts performance for scientific datasets including SciERC and GENIA.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 264,
      "claim": "[CONTINUE] We validate Sim and PP by computing sentence-level Spearman's \u03c1 between the metric and human judgments [CONTINUE] From Table 5, all validations show strong correlations on the Yelp dataset ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 265,
      "claim": "[CONTINUE] Negations are uncovered through unigrams (not, no, won't) [CONTINUE] Several unigrams (error, issue, working, fix) [CONTINUE] However, words regularly describing negative sentiment or emoti...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 266,
      "claim": "We observe that BERT trained on Balanced COPA is more sensitive to a few highly productive superficial cues than BERT trained on original COPA.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 267,
      "claim": "our model has much better quality over the extractive summarization system in three aspects.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 268,
      "claim": "For example, DCGCN4 contains 36 layers.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 269,
      "claim": "We do not have competitive results to Guo et al.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 270,
      "claim": "I examine the results of our findings with regard to the best-performing classifier.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 271,
      "claim": "Several groups of words are much more likely to appear in a complaint, and are used to express complaints per se: about orders or deliveries (in the retail domain), about access (in complaints to serv...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 272,
      "claim": "These results indicate dense connections do play a significant role in our model.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 273,
      "claim": "This empirically shows that compared to recurrent graph encoders, DCGCNs do not necessarily learn better representations for graphs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 274,
      "claim": "[CONTINUE] The most interesting ones are mask, rage, and cry, which significantly increase accuracy.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 275,
      "claim": "Results with BERT show that contextual information is not always valuable for performance improvement.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 276,
      "claim": "The Transformer performs best in terms of R-1 while Hi-MAP outperforms it on R-2 and R-SU.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 277,
      "claim": "Seq2Seq model trained with user annotation is better than Seq2Seq model trained with user and system action annotation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 278,
      "claim": "Considering the two aggregated categories of syntactic and semantic word analogies respectively and both 3CosAdd and 3CosMul metrics, model gr def had the best performance in both cases, even when we ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 279,
      "claim": "we report below both the performance as assessed with automatic evaluation metrics in Table 3 as well as with human evaluations in Tables 4 and 5, to show that the model trained with our objective doe...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 280,
      "claim": "They are effective when the approximate model class is complex and/or the interaction with the environment is infrequent, but become intractable as the interaction becomes more frequent or the state-a...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 281,
      "claim": "LRN is not the fastest model, with ATR outperforming it by 8%\u223c27%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 282,
      "claim": "[CONTINUE] On the contrary, for the linear dataset, the recursive implementation fails to efficiently make use of CPU resources and thus the performance gain provided by increasing the batch size is r...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 283,
      "claim": "Our model outperforms the previous stateof-the-art models on both datasets in terms of F1 score.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 284,
      "claim": "Comparing the 784-dimensional models, CBOW and CMOW do not seem to complement each other.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 285,
      "claim": "In conclusion, these results above can show the ineffectiveness of our DCGCN models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 286,
      "claim": "This indicates that GINs can be employed in tasks where the distribution of node degrees has a long tail.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 287,
      "claim": "Although LSTM and GRU outperform LRN by 0.3\u223c0.9 in terms of accuracy, these recurrent units do not sacrifice running efficiency (about 7%\u223c48%) depending on whether LN and BERT are applied.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 288,
      "claim": "[CONTINUE] On the other hand, the presence of terms that show positive sentiment or emotions (good, great, win, POSEMO, AFFECT, ASSENT) are among the top most distinctive features for a tweet not bein...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 289,
      "claim": "Most denying instances get misclassified as commenting (see Table 5),",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 290,
      "claim": "As can be seen in Table 1, sparsemax and TVMAX achieve better results overall when compared with softmax, indicating that the use of selective attention leads to better captions.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 291,
      "claim": "We find EWC does not outperform the L2 approach.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 292,
      "claim": "there were no adjectives in the questions except for the \"concept\" and \"property\" words, for the adjectives were replaced with prepositional phrases, for instance.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 293,
      "claim": "On the other side, H-CMOW shows, among others, no improvements at BShift.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 294,
      "claim": "In contrast, RoBERTa-large drops only 3.1 points when trained and evaluated on the split from Sap et al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 295,
      "claim": "[CONTINUE] Moreover, for TVMAX, automatic metrics results are slightly worse than sparsemax but still superior to softmax on MSCOCO and similar on Flickr30k.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 296,
      "claim": "The best performing system is KnowComb.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 297,
      "claim": "We divide the dataset into 5 folds according to the users\u2019 identity information (e.g., 619,1802, etc.).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 298,
      "claim": "the major drawback is that the dataset contains only 583 examples in the test set, which makes it difficult to measure model performance robustly.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 299,
      "claim": "The alternative creates a conjoined structure consisting of both the non-polar cue and the target alternative.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 300,
      "claim": "As the table 4 depicts, the precision increases with the growth of d, but the training time also increases.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 301,
      "claim": "This is unexpected as encoding a bigger graph (containing more information) should be easier than encoding smaller graphs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 302,
      "claim": "Surprisingly, S2S has a better performance than G2S-GGNN and G2S-GAT when handling graphs that contain high degree nodes.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 303,
      "claim": "Our model does not improve the results in the translation tasks.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 304,
      "claim": "the best-performing system outperforms the baseline in all cue types with the largest gains of 9.5% and 8.6% on the actual and false cue recall, respectively.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 305,
      "claim": "In total, 1,232 tweets (62.4%) are complaints and 739 are not complaints (37.6%).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 306,
      "claim": "The Wiener filtering method shows lower DCE, but higher WER than no enhancement.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 307,
      "claim": "our method uses the combination of SPINE and Word2Sense to improve the performance of sentiment classification task",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 308,
      "claim": "Furthermore, this bias is seemingly aggravated for fields suggested to be troubled by male stereotypes, such as life and physical sciences, architecture, engineering, computer science and mathematics ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 309,
      "claim": "Still, lemma-based targets significantly7 (p \u2264 .005) outperform type-based targets in terms of F-measure in all cases.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 310,
      "claim": "Table 2 shows that the model with cyclic loss (M2) and the model with cyclic loss, paraphrase loss, and language model loss (M5) both have lower Sim than M0 on both datasets under similar Acc.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 311,
      "claim": "The proposed method outperforms the original embeddings and performs on par with the SOV.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 312,
      "claim": "In addition, the training time results in Table 3 confirm the computational disadvantage of LRN over all other recurrent units, where LRN slows down compared to ATR and SRU by approximately 25%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 313,
      "claim": "As for the micro F1 evaluation metric, our model does not achieve the highest performance (83.54%) on the FNC-1 testing subset.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 314,
      "claim": "This shows that more attention heads, thereby attending to multiple different contexts at once, is important to boosting performance to state-of-the-art results.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 315,
      "claim": "(1) BERT is able to capture the gist of the summaries, and thus is appropriate for predicting good summaries; (2) the sentences in good summaries tend to have high tf-idf similarity to the target arti...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 316,
      "claim": "Despite our system achieving the same level of performance compared to a state-of-art general coreference system, we still observe significant performance improvement on the ACE and OntoNotes datasets...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 317,
      "claim": "Results also show the global node is more effective than the linear combination.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 318,
      "claim": "All fluency problems we found were very slight and no added or wrong-valued slots were found, so missed slots are the main problem.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 319,
      "claim": "The models using BoC do not outperform models using BoW as well as ASM features.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 320,
      "claim": "this shows the importance of the SRBR strategy.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 321,
      "claim": "especially for DAMD, modeling multi-action system responses, which are notoriously rare and difficult to collect, significantly improves the performance.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 322,
      "claim": "it is critical to realize that the conversational negation corpus is indeed a true label corpus, while neither O1 nor O2 are correct.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 323,
      "claim": "Results in Table 7 show that although the accuracy on SNLI is acceptable, gLRN and eLRN perform significantly worse on the PTB task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 324,
      "claim": "It can be noted that the use of discourse markers is crucial for the task since the results after removing them from the dataset is far from optimal.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 325,
      "claim": "1, where the x-axis refers to each metric and the y-axis refers to the number of sessions.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 326,
      "claim": "For Task B, the baseline model outperformed all models trained on the stacked learner when using only plain averaged word embeddings.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 327,
      "claim": "This is evident from the significant drop in ARI score from OD to OD (no polarity shifters) since the only change in those variants is of sentiment polarity shifters.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 328,
      "claim": "It is observed that the former outperforms the latter, indicating the key role of dialogue state estimation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 329,
      "claim": "we present BLEU and TER for the REV systems in Table 5, [CONTINUE] While Transformer models are the best ones according to the evaluation metrics,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 330,
      "claim": "accuracy on average the proposed method outperform other approaches by 2.8% and 2.45% for B-CNN and R-CNN respectively.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 331,
      "claim": "Our joint model does not outperform all the base lines, with a gap of only 10.5 CoNLL F1 points from the last published results (KCP), and only surpassing our strong lemma baseline by 3 points.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 332,
      "claim": "When redundancy removal was applied to LogReg, it produces only marginal improvement.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 333,
      "claim": "( 2018 ) who \\detrained\" their model for RL, our NeuralTD is simpler without any \\reinforcement\\\u2019\\ training.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 334,
      "claim": "there are slight but consistent decreases when comparing to the metric trained using all 2,011 content words",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 335,
      "claim": "on the other hand, we are still noticeably outperformed by Refresh when directly comparing the length of the two summaries.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 336,
      "claim": "Syntactic part-ofspeech features alone obtain higher performance than any sentiment or complaint feature group, showing the syntactic patterns discussed in the previous section hold high predictive ac...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 337,
      "claim": "The proposed CNN-LSTMOur-neg-Ant improves upon the simple CNNLSTM-w/o neg.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 338,
      "claim": "for example, for the system B in Table 2, the input systems are made available to the evaluation system, and this gives [BOLD] MUC-B1 (\u201cE2\u201d, default) a 4.6 precision point advantage over standard MUC-...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 339,
      "claim": "However, while we notice a definite improvement over Peyrard and Gurevych (2018), our results still lack behind the golden-set correlation, suggesting that future work could further improve the capaci...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 340,
      "claim": "In LDC2017T10, G2S-GGNN achieves a BLEU score of 27.87, which is lower than Guo et al. (2019), a state-of-the-art model that does not employ external information.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 341,
      "claim": "[CONTINUE] As a result, our implementation can train input data of balanced trees with greater throughput than input data of unbalanced trees.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 342,
      "claim": "However, BERT's improvements over previous work can be almost entirely attributed to high accuracy on the Easy subset: on this subset, finetuned BERT-large improves 8.6 percent over the model by (Sasa...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 14 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 343,
      "claim": "covering the rare words can boost the performances across different out-of-domain (OOD) datasets significantly",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 344,
      "claim": "The system performs well on synthetic dataset with a minimum of 80% P@1 and 98% P@10.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 345,
      "claim": "G2S-GGNN does not outperform others with the same amount of Gigaword sentences (200K), as shown in Table 3, with a BLEU score of 32.23.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 346,
      "claim": "TF and DF achieved different values of precision, recall and f-measure using the English corpora, with TF achieving a higher precision (P=0.0150) and f-measure (F=0.0293) than DF when using the Europa...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 347,
      "claim": "[CONTINUE] When comparing between M2 and M3, between M4 and M5, and between M6 and M7, we find that the addition of the language modeling loss reduces PP, sometimes at a slight cost of semantic preser...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 348,
      "claim": "[CONTINUE] ACER and PPO obtain high performance in inform F1 and match rate as well.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 349,
      "claim": "these results show that the questionnaire takers had an average accuracy of 98.2% in answering word intrusion questions for words associated with meanings imparted by standard word embeddings,",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 350,
      "claim": "[CONTINUE] Using a greater BiLSTM hidden size did not help the model, [CONTINUE] We found that using 25-dimensional part-ofspeech embeddings slightly improved results, [CONTINUE] Regarding optimizatio...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 351,
      "claim": "[CONTINUE] The improvement from automatic AMR to gold AMR (+0.7 BLEU) is significant, which shows that the translation quality of our model can be further improved with an increase of AMR parsing accu...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 352,
      "claim": "This is expected because SVM is a linear classifier that relies solely on the 1-gram, while neural classifiers like CNNs (row3) and LSTMs (row4) cannot learn non-linear function of n-grams when the n ...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 353,
      "claim": "for example, if the evaluation begins by calculating all analogies using the relation \u2018capital-common-countries\u2019 then in analogy 1, there will be [15_1 + 15_2 + 6_2]/50 = 6.3 out of 50 answers found b...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 16 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 354,
      "claim": "This empirically shows that compared to recurrent graph encoders, DCGCNs can learn better representations for graphs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 355,
      "claim": "The performance of each approach that interacts with the agenda-based user simulator is shown in [CONTINUE] Table 3.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 356,
      "claim": "[CONTINUE] Opinion distance methods generally outperform the competition on both ARI and Silhouette coefficient.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 357,
      "claim": "The complete model has significantly more parameters than the model without graph encoders (57.6M vs 61.7M).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 358,
      "claim": "Unlike the above three models, Word2Sense does not use pretrained vectors.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 359,
      "claim": "However, coverage can compensate for much of the lost performance in each case.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 360,
      "claim": "We found that rephrase disfluencies that contain content words are easier for the model to detect, compared to rephrases with function words only, and error decreases for longer disfluencies.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 361,
      "claim": "We additionally find that supervised BLEU does not show a trade-off with Acc: for a single model type, higher Acc does not necessarily correspond to lower BLEU.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 362,
      "claim": "When humans are asked to choose an option which they believe is more likely to be a correct causal conclusion, 80% select the correct label.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 363,
      "claim": "In other words, [CONTINUE] However, the results in bottom halves [CONTINUE] of Tables 2 and 3 do not support our hypothesis: we observe the main effect on SER from cleaning the missed slots, reducing ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 364,
      "claim": "We believe that this is because \u2018to\u2019 is either a correct or an incorrect cue: to achieve high consistency, an MLM will more likely answer when the cue is \u201cto\u201d, and answer the other alternative, otherw...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 365,
      "claim": "We observe that, let alone a reduction in performance, the obtained scores indicate an almost uniform improvement in the correlation values for the proposed algorithm, outperforming all the alternativ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 366,
      "claim": "We observe an improvement in performance between PG-original and PG-MMR (which takes the pre-trained PG-original and applies MMR on top of the model).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 367,
      "claim": "Tweets in the black-aligned corpus are classified as containing sexism almost twice as frequently and 1.1 times as frequently classified as containing racism and sexism compared to those in the white-...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 368,
      "claim": "In comparison, GDPL is still comparable with ACER and PPO, but does not obtain a better match rate, and even achieves lower task success.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 369,
      "claim": "For example, GCN+RC+LA (10) achieves a BLEU score of 52.9, which is better than GCN+RC+LA (9).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 370,
      "claim": "All G2S models have lower entailment compared to S2S.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 371,
      "claim": "When we add multi-factor attention to the baseline BiLSTM-CNN model without the dependency distance-based weight factor in the attention mechanism, we get 0.4% F1 score decrease (A2\u2212A1).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 372,
      "claim": "The difference between accuracy on Easy and Hard is more pronounced for RoBERTa, suggesting a reliance on superficial cues.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 373,
      "claim": "In general, we found when the parameter budget is the same, shallower DCGCN models can obtain better results than the deeper ones.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 374,
      "claim": "Comparing the 784-dimensional models, again, CBOW and CMOW seem to complement each other.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 375,
      "claim": "However, we define that the \u201c119.99\u201d operator as: if RSI <= 119.99 then RSI meets the requirement.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 376,
      "claim": "The hybrid model does not yield scores close to or even above the better model of the two on all tasks.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 377,
      "claim": "[CONTINUE] The performances of all models decrease as the diameters of the graphs increase.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 378,
      "claim": "For BERT models, after fine-tuning on COPA, RoBERTa-large achieves the best performance on both Easy and Hard.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 379,
      "claim": "[CONTINUE] Table 2 shows that the model with paraphrase loss (M1) slightly improves Sim over M0 on both datasets under similar Acc.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 380,
      "claim": "Interestingly, the error analysis on this dataset revealed that the BiLSTM model was unable to correctly classify one-word scopes.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 381,
      "claim": "The models have worse results when handling sentences with 20 or fewer tokens.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 382,
      "claim": "[CONTINUE] Due to joint training, our hybrid model learns to pick up the best features from CBOW and CMOW simultaneously.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 383,
      "claim": "Despite filtering out multiple hypernyms, the recall values for the Portuguese corpora are still relatively high.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 384,
      "claim": "LRN obtains additional 4 percentage points gain with BERT and reaches an accuracy of around 89.9.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 385,
      "claim": "[CONTINUE] However, words and clusters expressing positive states such as gratitude (thank, great, love) or laughter (lol) are not significantly associated with tweets that are not complaints.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 386,
      "claim": "RANDOM is indeed closer here to the expected 50% and other baselines are closer to gender-parity.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 387,
      "claim": "What we have found is that Google Translate does indeed translate sentences with male pronouns with greater probability than it does either with female or gender-neutral pronouns, in general.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 388,
      "claim": "PCS can detect 4,113 new scope relations, 833 fewer than with gold cues.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 389,
      "claim": "We observe that the transfer baseline that directly uses rationale as augmented supervision (RA-TRANS) underperforms ORACLE by a large margin.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 390,
      "claim": "GDPL outperforms three baselines significantly in all aspects (sign test, p-value < 0.01) except for the quality compared with ACER.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 391,
      "claim": "[CONTINUE] Lemma-based targets without POS disambiguation perform best on WN-N when dependency-based contexts are used; however, the difference to lemmatized and disambiguated targets is not statistic...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 392,
      "claim": "Another interesting fact in Table 1 is that the training throughput on the linear dataset does not scale better than the throughput on the balanced dataset, as the batch size increases.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 393,
      "claim": "Negation can be either clearly express or be subtly used.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 394,
      "claim": "Table 4 shows that GDPL has the smallest KL-divergence to the human on the number of dialog turns over the baselines, which implies that GDPL behaves more like the human.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 395,
      "claim": "This suggests that enriching input graphs with the global node and including the linear combination can facilitate GCNs to learn better information aggregations, producing more expressive graph repres...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 396,
      "claim": "MIL-ND does not achieve higher precision, recall, and F1 than MIL, and using its confidence at test time (\u03c4 MIL-ND, 'All' setting) was not beneficial in terms of precision and F1.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 397,
      "claim": "Table 9: Performance of different models on the neural user simulator.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 398,
      "claim": "For Task B, all models trained on the stacked learner beat the baseline substantially even when using only plain averaged word embeddings.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 399,
      "claim": "LSTM does significantly better than Word2Vec, especially for MEN-TR-3k (0.766 vs. 0.552, p-value<0.00001), RG65 (0.790 vs. 0.744, p-value<0.00001) and MTurk771 (0.682 vs. 0.650, p-value<0.00001).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 22 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 400,
      "claim": "A potential reason is that the RL agent has only learned limited useful signals in some small, low-quality data; therefore, other summarisation signals including aspect modelling, coverage modelling a...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 401,
      "claim": "Our proposed method outperforms Pretrained Word2Sense embeddings, despite the latter having the advantage of training on a larger corpus.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 402,
      "claim": "The ARI and Silhouette coefficients scores of both OD methods (OD-d2v and OD-w2v) are statistically significant (paired t-test) with respect to baselines at significance level 0.005.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 403,
      "claim": "[CONTINUE] Under system setup, our model CANDELA statistically significantly outperforms all comparisons and the retrieval model in all metrics, based on a randomization test (Noreen, 1989) (p < [CONT...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 404,
      "claim": "based on the analysis results, we conclude that the dialog states were successfully retained in the model policy and user simulator, but the source of error lies in the action selection with respect t...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 405,
      "claim": "However, the sdp information has a clear positive impact on all the relation types (Table 1).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 406,
      "claim": "Accordingly, as Table 3 shows for the essay level (paragraph level omitted for space reasons), results are generally stronger: [CONTINUE] as in Eq.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 407,
      "claim": "In contrast, our proposed classifier can almost precisely identify the one-word scope without any syntactic information.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 408,
      "claim": "As shown in Table 8, the S2S baseline outperforms the G2S approaches.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 409,
      "claim": "We suspect that there are not enough data to pretrain the models and that the thread classification task used to pretrain the HAN models may not be sophisticated enough to learn effective thread vecto...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 410,
      "claim": "This strongly indicates that there is a superficial cue that affects model performance, but this cue is not captured by Word frequency",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 411,
      "claim": "We also have competitive results to Guo et al.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 412,
      "claim": "[CONTINUE] When comparing DF model which takes into account only the number of documents that the word occurs, with DocSub which considers the number of shared documents between two words, DocSub achi...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 413,
      "claim": "Despite performing slightly worse than sparsemax under automatic metrics, TVMAX outperforms sparsemax and softmax in the caption human evaluation and the attention relevance human evaluation, reported...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 414,
      "claim": "We have 116,674 tweets, with an average length of 22.3 tokens.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 415,
      "claim": "The results in Table 3 show that translation quality of LRN is slightly worse than that of GRU (-0.02 BLEU).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 416,
      "claim": "Without the coverage mechanism, the result drops by 1.7/2.4 points for B/C scores.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 417,
      "claim": "However, models trained using linguistic features on the training data obtain significantly higher predictive accuracy.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 418,
      "claim": "[CONTINUE] As the results of applying the co-occurrence baseline (\u03c1 = 0) shows (Table 2), the semantic relations in this data are strongly concentrated within a sentence boundary, especially for the r...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 419,
      "claim": "Interestingly, we observe a decrease of ROUGE and METEOR, but a marginal increase of BLEU-2 by removing passages from our model input.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 420,
      "claim": "between all three systems, GloVe ranks last, followed by the original implementation of this model, and finally the optimized implementation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 421,
      "claim": "For Yelp, M0 has better Acc and PP than M1 at comparable semantic similarity.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 422,
      "claim": "on the other hand, neither the distance nor syntactic feature plays an important role in entity coreference performance, which indicates that the relation types of entities provide valuable informatio...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 423,
      "claim": "[CONTINUE] In general terms, the results displayed in table 1 show that the rejection method can reduce the error of the output predictions when applying a pre-trained black-box classification system ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 424,
      "claim": "[CONTINUE] Comparing layers 1 through 4, we see that in 3/5 target languages (Ar, Ru, Zh), POS tagging accuracy peaks at layer 1 and does not improve at higher layers, with some drops at layers 2 and ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 425,
      "claim": "the distribution of dialogue sessions can be seen in Fig.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 426,
      "claim": "The error reduction over the best baseline is 15.08% on average.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 427,
      "claim": "The system does not perform well on synthetic dataset with a minimum of 80% P@1 and 98% P@10.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 428,
      "claim": "These experiments show that the number of factors giving the best performance does not vary depending on the underlying data distribution.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 429,
      "claim": "On the muli-domain dataset, MultiWoZ, our model achieves a joint goal accuracy of 48.79%, which is lower than the previous state-of-the-art.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 430,
      "claim": "BI+IS with EWC-adapted models gives a 0.9 / 3.4 BLEU gain over the strong uniform EWC ensemble, and a 2.4 / 10.2 overall BLEU gain over the approach described in Freitag and Al-Onaizan (2016).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 431,
      "claim": "[CONTINUE] When comparing between M2 and M3, between M4 and M5, and between M6 and M7, we find that the addition of the language modeling loss increases PP, sometimes at a slight cost of semantic pres...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 432,
      "claim": "When the model focuses on \u201cnele\u201d and \u201ctype\u201d, it learns the semantic meaning of them, thus enabling the prediction of triples that have \u201cnele\u201d and \u201ctype\u201d",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 433,
      "claim": "As occurred in the experiment using the top 1,000 words, this experiment also kept TF with the highest values of f-measure for most methods, except for the Portuguese Europarl corpus, where DocSub had...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 434,
      "claim": "[CONTINUE] The results of CLUSTER+KCP again indicate that pre-clustering of documents to topics is beneficial, improving upon the KCP performance by 4.6 points, though still performing substantially w...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 435,
      "claim": "Also, the average human rating for Refresh is not significantly higher (p (cid:28) 0.01) than ExtAbsRL.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 436,
      "claim": "This is because word representation learning approaches, like OIWE-IPG and SOV, do not consider the semantic distance learning issue, which has a significant impact on word similarity task.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 437,
      "claim": "For Yelp, M1 has better Acc and PP than M0 at comparable semantic similarity.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 438,
      "claim": "[CONTINUE] However, our data augmentation technique (NO-TRANSLATIONS) had a significant impact on the final score, reducing it by 0.84 points.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 439,
      "claim": "both (Nguyen et al., 2016) and ours pre-compute a vocabulary of top-K possible responses, which are used as the only acceptable responses.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 440,
      "claim": "The full model gives 25.5 BLEU points on the AMR15 dev set.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 441,
      "claim": "The semantic threshold for OD-d2v is set at 0.3 while for OD-w2v is set at 0.6.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 442,
      "claim": "This suggests that graph encoders based on gating mechanisms are not as effective as other models in text generation models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 443,
      "claim": "However, when gold PP attachment are used, we note a large potential improve [CONTINUE] ment of 10.46 points in PP attachment accuracies (between the PPA accuracy for RBG and RBG + Oracle PP), which c...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 444,
      "claim": "The human evaluation shows that our mirrored instances are not as difficult as the original ones (see Table 3).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 445,
      "claim": "However, the drop in performance on the QA-SRL task, from which the model's weights are initialized, is much smaller with BIDAF (ELMO) than MQAN, and this corroborates the idea that contextualized ELM...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 446,
      "claim": "The PRKGC model gives considerably good results, which indicates the non-triviality of RC-QEDE.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 447,
      "claim": "We hypothesize that the gating mechanism cannot better capture long-distance dependencies between nodes far apart in the graph.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 448,
      "claim": "When trained on the NC-v11 subset, the gap between Seq2seq and Dual2seq under BLEU (around 3 points) is greater than that under Meteor (around 5 points).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 449,
      "claim": "Without knowledge of the input systems, the score of MUC-B1, which most closely follows the MUC scoring methodology (Vilain et al., 1995), was higher than MUC-B1.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 450,
      "claim": "At the same time, the distributional information embedded into the network appears to have acted as a stabilizing force.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 451,
      "claim": "We suspect that two reasons for the performance drop on booking hotels are 1) the vocabularies of booking hotels are more similar to that of others than of booking flights or restaurants, making it ea...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 452,
      "claim": "The gap has become larger when the threshold becomes smaller, since there is much more noises when the score becomes smaller, our capsule net and word-level attention models are more robust to these n...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 453,
      "claim": "The CS-ONLY-DISCRIMINATIVE model is able to prioritize the gold sentence better than all other models, under both conditions.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 454,
      "claim": "For example, we take the triple (nele, type, nele).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 455,
      "claim": "Increasing the window size to 10 increases the F1 score marginally (A3\u2212A4).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 456,
      "claim": "In contrast, the noise-aware model requires more iterations to converge.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 457,
      "claim": "[CONTINUE] For example, using relations generated by TF model using the Europarl corpus, we can understand the MaxDepth as having 789 terms with different values of term frequency, while having 211 th...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 458,
      "claim": "[CONTINUE] For both datasets, our approach substantially outperforms the baselines.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 459,
      "claim": "We additionally find that supervised BLEU shows a trade-off with Acc: for a single model type, higher Acc generally corresponds to lower BLEU.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 460,
      "claim": "RoBERTa, due to its optimizations and higher training data, outperforms the other models by a significant margin, indicating the large potential for models trained on much larger data",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 461,
      "claim": "Overall results show that ATR achieves the best performance and consumes the least training time.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 462,
      "claim": "Therefore, our method covers most contexts where \u201cto\u201d is an",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 463,
      "claim": "GDPL achieves extremely high performance in the task success on account of the substantial improvement in inform F1 and match rate over the baselines.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 464,
      "claim": "2018b; Dong et\\xa0al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 465,
      "claim": "[CONTINUE] In Librispeech + DEMAND, minimizing DCE (15.8%) and FSEGAN (14.9%) achieves a lower WER than acoustic supervision (15.6%) and multi-task learning (14.4%).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 466,
      "claim": "For window-based w2 contexts POS disambiguation yields significantly better F scores on lemmatized targets for VN (p \u2264 .005) with borderline significance for WN-N and WN-V (p \u2248 .05).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 467,
      "claim": "In Italian, we get an increase of 91.67% of the gap with respect to English.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 468,
      "claim": "[CONTINUE] To validate Acc, human annotators were asked to judge the style of 100 transferred sentences [CONTINUE] We then compute the percentage of machine and human judgments that match.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 469,
      "claim": "Additionally, the ensemble DCGCN models achieve 20.5 and 13.1 BLEU points on the En-De and En-Cs tasks, respectively.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 470,
      "claim": "HDSA shows the effectiveness of explicitly capturing intent and dialog history.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 471,
      "claim": "However, the greatest performance increase is seen for the last scenario, which underscores the extent to which the semantic features captured by embeddings can be improved with a reasonable selection...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 472,
      "claim": "Overall results show that LRN achieves competitive performance but consumes the least training time.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 473,
      "claim": "Interestingly, G2S-GGNN has better performance among our models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 474,
      "claim": "with respect to  the efficiency criteria, in which task the dialog systems take shorter time to reach the successful termination in an average and the total dialog time is shorter when averaged across...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 475,
      "claim": "The mechanism successfully alleviates the over-fitting issue caused by the imbalanced two tasks\u2019 sizes.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 476,
      "claim": "[CONTINUE] Surprisingly, GDPL even outperforms human in completing the task, and its average dialog turns are close to those of humans, though GDPL is inferior in terms of match rate.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 477,
      "claim": "HAN models outperform both LogReg and SVM using the current set of features.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 478,
      "claim": "Table 1: In all language pairs, the best correlation is not achieved by our word mover metrics that use a BERT pretrained on MNLI as the embedding generator and PMeans to aggregate the embeddings from...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 479,
      "claim": "Our proposed method does not outperform GloVe in semantic analogy test set and in overall results, while GloVe performs slightly better in syntactic test set.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 480,
      "claim": "Also in cross-document coreference, it achieves the best joint results, except for the CEAF metric.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 481,
      "claim": "BI and IS both individually outperform the oracle for all domains, [CONTINUE] With adaptive decoding, we can assume that a uniform ensemble will always perform better than a single model for any poten...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 482,
      "claim": "The results show that coverage information does not improve the generalization of both examined models across various NLI datasets.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 483,
      "claim": "[CONTINUE] Further, contrary to intuition, the sob emoji contributes less than cry, despite representing a stronger emotion.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 484,
      "claim": "Our text classifiers for identifying negation cues and finding the negation scope are built on top of a BERT classifier",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 485,
      "claim": "we see that in most cases fine-tuning on B-COPA does not help the models\u2019 performance, only their robustness.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 486,
      "claim": "However, CMOW generally outperforms CBOW embeddings.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 487,
      "claim": "The results in the table suggest that cleaning the missing slots did not provide more complex training examples.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 488,
      "claim": "[CONTINUE] Lemmatized targets generally perform better, with the boost being more pronounced on SimVerb.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 489,
      "claim": "When redundancy removal was applied to LogReg, it produces significant improvement.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 490,
      "claim": "We found that innovations are not helpful in both early and late fusion frameworks, and late fusion does not perform better on average.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 491,
      "claim": "Considering the two aggregated categories of syntactic and semantic word analogies respectively and both 3CosAdd and 3CosMul metrics, model cc.el.300 has outperformed all the other models apart from t...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 492,
      "claim": "Replacing the attention normalizing function with softmax operation increases the F1 score marginally (A3\u2212A5).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 493,
      "claim": "For each model we report both perplexity and accuracy (except for discriminative training, where perplexity is not valid), where each of them is reported according to the best performing model on that...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 494,
      "claim": "[CONTINUE] A notable exception is the \"Seanad Abolition\" dataset, where TF-IDF performs relatively better than WMD, Sent2vec and Doc2vec.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 495,
      "claim": "Results presented in Table 7 show that the domain adaptation approach does not significantly boost F1 (t-test, p>0.5) and ROC AUC (0.012).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 496,
      "claim": "when the best model for each dataset is deployed in either setting, our program ablation does best and the non-ablated tree does slightly worse but still significantly outperforms the baseline sentenc...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 497,
      "claim": "In German, we get a reduction of 100%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 498,
      "claim": "Although these four models have the same number of layers, dense connections allow the model to achieve much better performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 499,
      "claim": "The relatively low accuracies of BERT-large, RoBERTa-large and BERT-*-NSP show that these pretrained models are not well-equipped to perform this task \"out-of-the-box\".",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 500,
      "claim": "Longer sentences pose additional challenges to the models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 501,
      "claim": "[CONTINUE] As a result, the folding technique performs better than the recursive approach for the training task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 502,
      "claim": "The largest gain is by 4% on the CoordInv task.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 503,
      "claim": "This is expected as encoding a bigger graph (containing more information) is harder than encoding smaller graphs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 504,
      "claim": "The inferior score on attention relevance shows that TVMAX is worse at selecting the relevant features and its output is less interpretable.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 505,
      "claim": "Row (1)-(7) show each model with different representations on the original dataset.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 506,
      "claim": "We observe that the three settings (n=6, m=3), (n=3, m=6) and (n=6, m=6) give significantly different results for both 1 DCGCN block and 2 DCGCN blocks.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 507,
      "claim": "(2017).8 Overall both BERT (76.5%) and RoBERTa (87.7%) do not outperform the best previous model (71.4%) on Hard instances without superficial cues.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 9 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 508,
      "claim": "In particular, we see that hate speech and harassment are relatively easy to detect.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 509,
      "claim": "For example, on Yelp, large differences in human judgments of semantic preservation (M2>M0, M7>M0, M7>M2) also show the largest differences in Sim, while M6 and M7 have very similar human judgments bu...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 510,
      "claim": "The interpolation weight \u03b1 for the late fusion experiments is high when innovations are used, which further indicates that innovation features are useful in overall prediction.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 511,
      "claim": "The performances of all models increase as the diameters of the graphs increase.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 512,
      "claim": "[CONTINUE] however, GRU yields the best BLEU score of 26.28, outperforming oLRN (+0.45 BLEU).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 513,
      "claim": "It does not come close to VGS on paraphrase retrieval, but it does correlate with the visual modality even better.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 514,
      "claim": "[CONTINUE] However, the results in bottom halves [CONTINUE] of Tables 2 and 3 do not support our hypothesis: we observe no main effect on SER from cleaning the missed slots, with only slight reduction...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 515,
      "claim": "For both Gigaword and NOW datasets (and the corresponding embeddings), using the cosinebased threshold decreases recall and increases precision (differences are statistically significant with t-test, ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 516,
      "claim": "AME outperforms the FME model, confirming the importance of word embeddings adaptation.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 517,
      "claim": "On the other hand, ACER is still subject to trainability limitation due to the lacking of expressivity power in DSTC models.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 518,
      "claim": "[CONTINUE] We observed no advantage to using a hierachical encoder, [CONTINUE] Finally, we see that a 2 layer LSTM performs similarly to either a 4 layer or a 2 layer SRU with a comparable number of p...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 519,
      "claim": "The single DCGCN model achieves a BLEU score of 30.4 and a CHRF++ score of 59.6, outperforming the ensemble approach based on combining five DCGCN models initialized with different random seeds.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 520,
      "claim": "The proposed method does not outperform the original embeddings and performs worse than the SOV.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 521,
      "claim": "This superior confirms the effectiveness of our approach.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 522,
      "claim": "Among all the baselines, GDPL does not obtain the most preference against PPO.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 523,
      "claim": "The DCGCN models do not achieve the highest BLEU points on the En-De and En-Cs tasks, respectively.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 524,
      "claim": "Longer sentences do not pose additional challenges to the models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 525,
      "claim": "our model achieves a slightly better performance in AUC than the baseline models and the proposed model works better.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 526,
      "claim": "This suggests that our models are not capable of capturing better semantic information from the graph generating outputs semantically related to the reference sentences.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 527,
      "claim": "Adding either the global node or the linear combination improves the baseline models with only dense connections.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 528,
      "claim": "We showed that it is not possible to improve the feature extraction procedure for the VQA task by adding self-attention modules in the different ResNet blocks.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 529,
      "claim": "The amount of resources is insufficient for executing forward computations, and therefore our framework does not outperform the folding technique for the inference task with up to 4.93x faster through...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 530,
      "claim": "We observed an advantage to using a hierachical encoder, [CONTINUE] Finally, we see that a 2 layer LSTM performs worse than either a 4 layer or a 2 layer SRU with a comparable number of parameters.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 531,
      "claim": "The second row in Table 3 shows the test accuracy of a system trained without sense priors [CONTINUE] and the third row shows the effect of making the token representations context-insensitive by givi...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 532,
      "claim": "[CONTINUE] Hashtags also have a [CONTINUE] positive effect on classification performance, however it is less significant.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 533,
      "claim": "Each participant evaluates 3 dialog sessions of each model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 534,
      "claim": "Our model (OURS) obtains substantial gains in accuracy over the baselines across all three target aspects.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 535,
      "claim": "With the coverage mechanism, the result drops by 1.7/2.4 points for B/C scores.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 536,
      "claim": "Note that the effectiveness of P1 and P2 are not necessarily additive, as combining P1 and P2 does not always perform the best.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 537,
      "claim": "We observe that for the NYT10 dataset, m = 4 gives the highest F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 538,
      "claim": "The systems trained on the original data or with cleaned added slots clearly perform worse in terms of both semantic accuracy and fluency.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 539,
      "claim": "The Word2Vec embeddings appear to perform better than our method on the random test, although we suspect that the difference is marginal.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 540,
      "claim": "Although SFN requires a large portion of training data to achieve superior performance, we find that combining large amounts of multi-action parallel data can significantly improve the model\u2019s perform...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 541,
      "claim": "For both Gigaword and NOW datasets (and the corresponding embeddings), using the cosinebased threshold increases recall and decreases precision (differences are statistically significant with t-test, ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 542,
      "claim": "Overall, all of the implementations can improve the performances of base models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 543,
      "claim": "In terms of relative numbers, the hybrid model improves upon CBOW in all probing tasks except WC.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 544,
      "claim": "Note that the effectiveness of P1 and P2 are additive, which means combining P1 and P2 performs the best.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 545,
      "claim": "Despite achieving high performance in the task success, GDPL does not show substantial improvement in inform F1 and match rate over the baselines.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 546,
      "claim": "In most setups our best case is not better than the former best case.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 547,
      "claim": "[CONTINUE] As we can observe, it seems that clustering semantically related terms will increase the precision (at least for the top 1,000 terms in the English corpora used in this experiment) as expec...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 548,
      "claim": "Selective attention mechanisms like sparsemax and especially TVMAX do not reduce repetition, as measured by the REP metric reported in Table 1.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 549,
      "claim": "From the table, we can see that our JMEE framework does not achieve the best F1 scores for both trigger classification and argument-related subtasks among all the compared methods.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 550,
      "claim": "The results in Table 2 (top half) for the original setup confirm that the ranking mechanism for TGen is effective for both WOMs and SER, whereas the SC-LSTM seems to have trouble scaling to the E2E da...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 551,
      "claim": "our system also receives the highest rating in 70% of test cases.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 552,
      "claim": "[CONTINUE] EWC models do not perform as well as uniform ensembling, as evidenced by the fact that in some cases, uniform ensembling outperforms the oracle.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 553,
      "claim": "GDPL is better at booking flights and restaurants than finding hotels, even though its SLU precision is comparable to other agents.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 554,
      "claim": "Our model outperforms PG-MMR when trained and tested on the Multi-News dataset.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 555,
      "claim": "However, on the classes like \"clothing\" and \"bodyparts\" our model ZSGNet shows much better performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 556,
      "claim": "The coverage mechanism is not effective in our models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 557,
      "claim": "Similarly, manual features reduce recall, but help the system to improve accuracy and precision (sometimes considerably).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 558,
      "claim": "[CONTINUE] However, the slightly increased invalid response percentage [CONTINUE] We also observe our DAMD model outperforms HDSA in both diversity and appropriateness scores.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 559,
      "claim": "On the three datasets, OD achieves an average weighted F1 score of 0.54, 0.56 and 0.41 respectively compared to the scores of 0.01, -0.01 and 0.07 by OD-parse.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 13 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 560,
      "claim": "[CONTINUE] We notice small improvements relative to the baseline showing that self-attention alone does improve the VQA task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 561,
      "claim": "We don\u2019t evaluate RoBERTa on the 100 instance subset of COPA due to its tendency to pick superficial cues.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 562,
      "claim": "We observe that the results for the UD representation are comparable to the two others.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 563,
      "claim": "Lemmatized targets generally do not perform better, with the boost being more pronounced on SimVerb.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 564,
      "claim": "The hybrid model is not able to repair this deficit, increasing the difference to 8%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 565,
      "claim": "They are 553 true positives, 48 false positives, and 5 false negatives.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 566,
      "claim": "However, the main improvement of SER comes from training on cleaned data with up to 94% error reduction without the ranker and 97% with.11 just cleaning the training data has a much less dramatic effe...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 567,
      "claim": "Though the improvement is slim, it is encouraging to continue researching into visual modulation",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 568,
      "claim": "PPO agent obtains the highest ratio of successful turns, but GDPL outperforms other agents on SLU precision.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 569,
      "claim": "Consequently, with an 8% i is substantially more linguistically informed than CBOW.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 570,
      "claim": "the Pearson correlation coefficients in Table VI present the above-mentioned results with the cosine similarity scores used to compare two word embeddings.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 571,
      "claim": "[CONTINUE] Sentiment polarity shifters have a low impact on clustering performance of opinion distance: We find that not utilizing the sentiment polarity shifters, especially in case of datasets \"Vide...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 572,
      "claim": "Using only one attention head, thereby attending to only one context position at once, degrades the performance to less than the performance of 10 heads using the standard finetuning scheme.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 573,
      "claim": "One reason is that when the reference action sequence is long, the probability of all actions being correct decreases.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 574,
      "claim": "As for Success metric, some ambiguous start location can cause low score.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 575,
      "claim": "[CONTINUE] Wikipedia-PubMed-PMC embeddings (Moen and Ananiadou, 2013) does not outperform GloVe (Mikolov et al., 2013a) in the extraction of most relation types (Table 1) [CONTINUE] the combination fe...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 576,
      "claim": "When increasing the number of terms to 10,000, the DocSub models using Europarl corpora performed better than when using TED Talks corpora.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 577,
      "claim": "humans do poorly on hard instances, which requires deeper inference rather than surface cues, and neural language models largely overcome this difficulty.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 578,
      "claim": "SegMatch works slightly better than Audio2vec according to both criteria.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 579,
      "claim": "The system's official score was 60.9% (micro-F1) [CONTINUE] af [CONTINUE] However, re-scoring our second submission after replacing these 10 files with the ones from our first submission resulted in a...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 580,
      "claim": "All G2S models have [CONTINUE] higher entailment compared to S2S.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 581,
      "claim": "Our model does not outperform PG-MMR when trained and tested on the Multi-News dataset.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 582,
      "claim": "Compared to CMOW, the hybrid model shows significant differences.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 583,
      "claim": "BI+IS decoding with single-domain trained models does not achieve gains over both the naive uniform approach and over oracle single-domain models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 584,
      "claim": "The models in the upper portion (1-6) use only dialogue history and turn-level user goals, which are assumed to be error-free.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 585,
      "claim": "The first subset contains results by our system, second subset contains results by Refresh, and third subset contains results by ExtAbsRL.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 586,
      "claim": "Support Vector Machines (SVM) were used as baseline and the results of other proposed methods have been compared with them.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 587,
      "claim": "Table 5 shows that uniform ensembling outperforms all oracle models except es-en Bio, especially on general domains.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 588,
      "claim": "The results show that it is better to add knowledge as features when the knowledge quality is high than compile them into constraints.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 589,
      "claim": "In general, we found when the parameter budget is the same, deeper DCGCN models can obtain better results than the shallower ones.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 590,
      "claim": "[CONTINUE] When comparing between M2 and M3, between M4 and M5, and between M6 and M7, we find that the addition of the language modeling loss increases PP, sometimes at a slight cost of semantic pres...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 591,
      "claim": "BI and IS both individually outperform the oracle for all but IS-News, [CONTINUE] With adaptive decoding, we do not need to assume whether a uniform ensemble or a single model might perform better for...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 592,
      "claim": "Despite the models having fewer examples of bigger graphs to learn from, this does not lead to worse performance when handling graphs with higher diameters.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 593,
      "claim": "the main challenge of the sentiment classification task is to extract the information from the context.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 594,
      "claim": "Analyzing Table 3, we can observe that all values of precision using the English corpora have higher scores when compared with the Portuguese corpora.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 595,
      "claim": "The contribution of the cue is clear: in particular, the relatively low precision of using the parser introduces more out of scope relations than in-scope.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 596,
      "claim": "The results furthermore show that the sdps based on the Stanford Basic (SB) representation do not provide the best performance, followed by the CoNLL08 representation.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 597,
      "claim": "[CONTINUE] When removing sweat smile and confused accuracy increased,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 598,
      "claim": "The comparison shows the powerful advantage of LSTM embeddings over the standard word embeddings in capturing word semantics, that is, semantic similarity.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 599,
      "claim": "These results demonstrate that NeuralTDabt indeed learns to generate non-extractive summaries and performs better than a regular extractive baseline, which randomly select sentences from the given doc...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 600,
      "claim": "by comparing it to extractive baseline NeuralTD, our proposed abstractive model NeuralTDabt exhibits better performance than extractive baseline, improving 0.9 ROUGE-1 points and 0.3 ROUGE-L points.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 601,
      "claim": "For Waseem (2016) we see that there is a significant difference in the estimated rates at which tweets are classified as racist across groups, with higher rates for the white group.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 602,
      "claim": "This can be attributed to the fact that the proposed approach relies on more than one concept words, while GloVe only uses the representation of the top concept word to classify the image.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 603,
      "claim": "DCGCN model is not able to achieve a competitive BLEU points (33.2) by using 0.3M external data, while GraphLSTM achieves a higher score of 33.6 by using 2M data and Seq2SeqK achieves an even higher s...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 11 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 604,
      "claim": "GGP-MBCM performs best in model 1, but is significantly worse than the other policies in models 2 and 3.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 605,
      "claim": "For example, a is the token with the highest coverage and appears in either a correct alternative or wrong alternative in 21.2% of COPA training instances.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 606,
      "claim": "However, the KL divergence between human dialog policy and RL agents policy is quite large, which means the training has more space to improve.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 607,
      "claim": "we use the sum of the similarities between the question and the two sentences in the passage as a memory initialization",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 608,
      "claim": "For example, a chatbot that generates a confusing or inappropriate response should be assigned a low efficiency score.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 609,
      "claim": "the performance of GloVe and Word2vec remain unchanged if concept words are unseen in the training corpus.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 610,
      "claim": "Our model does not obtain the best performance on three out of the four datasets.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 611,
      "claim": "All other agents outperform our DKRN agent with a large margin.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 612,
      "claim": "However, this alone cannot improve system-level ROUGE to the level that of the ROUGE-based decoder.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 613,
      "claim": "The system's official score was 60.9% (micro-F1) [CONTINUE] af [CONTINUE] Therefore, we report both the official score (from our second submission) and the result of re-scoring our second submission a...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 614,
      "claim": "For German descriptions, The results are 11.05% better on average compared to (Gella et al., 2017) in symmetric mode.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 615,
      "claim": "The proposed CNN-LSTMOur-neg-Ant does not improve upon the simple CNNLSTM-w/o neg.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 616,
      "claim": "When using more natural language text as an additional training resource, the models\u2019 performance is improved dramatically, outperforming the previous state-of-the-art by 10 absolute points.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 617,
      "claim": "[CONTINUE] It can be observed that the learned reward function has good interpretability in that the reward is positive when the dialog gets a full score on each metric, and negative otherwise.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 618,
      "claim": "On the other hand, the presence of terms that show positive sentiment or emotions (good, great, win, POSEMO, AFFECT, ASSENT) are among the least distinctive features for a tweet not being labeled as a...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 619,
      "claim": "For all these systems, a three-sentence summarisation is required; so we set T=3 in our experiment.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 620,
      "claim": "Table 4 shows that LRN has the highest EM/F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 621,
      "claim": "Note that GloVe is the pre-trained word vectors in the very basic representation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 622,
      "claim": "Our single model DCGCN(single) achieves 19.0 and 12.1 BLEU points on the En-De and EnCs tasks, respectively, significantly outperforming all the single models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 623,
      "claim": "The model performs significantly worse when trained with hinge loss instead of cross-entropy loss, indicating the importance of the loss function.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 624,
      "claim": "the results show that InferSent yields the highest correlation between METEOR and human evaluation, in both \u03c1 and r. However, we see that InferSent has the lowest precision on the \u201cgood\u201d summaries and...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 625,
      "claim": "We observe that the transfer baseline that directly uses rationale as augmented supervision (RA-TRANS) outperforms ORACLE in all aspects.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 626,
      "claim": "When using the same amount of 0.2M data, the performance of DCGCN is 4.2 and 3.4 BLEU points higher than Seq2SeqK and GraphLSTM.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 627,
      "claim": "That is, the agent is informative and successful but forgets to ask what type of food users want to order occasionally.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 628,
      "claim": "While Glorot achieves slightly better results on BShift and TopConst, CMOW's ability to memorize word content is improved by a wide [CONTINUE] margin by our initialization strategy.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 629,
      "claim": "The results of CLUSTER+KCP indicate that pre-clustering of documents to topics is not beneficial, performing substantially worse than our joint model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 630,
      "claim": "These experiments show that the number of factors giving the best performance may vary depending on the underlying data distribution.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 631,
      "claim": "The results in Table 3 show that translation quality of LRN is significantly worse than that of GRU (-0.57 BLEU).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 632,
      "claim": "Comparing layers 1 through 4, we see that in 3/5 target languages (Ar, Ru, Zh), POS tagging accuracy peaks at layer 4 and does not improve at lower layers, with some drops at layers 1 and 2.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 633,
      "claim": "As shown in Table 8, G2S approaches outperform the S2S baseline.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 634,
      "claim": "In analogy 2, all relations are different.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 635,
      "claim": "[CONTINUE] Selective attention mechanisms like sparsemax and especially TVMAX reduce repetition, as measured by the REP metric reported in Table 1.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 636,
      "claim": "For example, on AMR17, the single DCGCN model is 1 BLEU point higher than the ensemble model of Seq2SeqB.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 637,
      "claim": "On the same dataset, we have competitive results to Damonte and Cohen (2019).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 638,
      "claim": "The word analogy test was first introduced in [32] to assess the quality of word vectors.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 639,
      "claim": "Adding either the global node or the linear combination does not improve the baseline models with only dense connections.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 640,
      "claim": "[CONTINUE] In addition, other words and clusters expressing positive states such as gratitude (thank, great, love) or laughter (lol) are also distinctive for tweets that are not complaints.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 641,
      "claim": "The first one models the agenda state space as discrete and predefined, while the other agent encodes a stochastic latent space for agenda representation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 642,
      "claim": "We observe that predictive performance is relatively consistent across all domains with two exceptions ('Food & Beverage' consistently shows lower performance, while 'Other' achieves higher performanc...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 643,
      "claim": "Such case is the most difficult task for this model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 644,
      "claim": "We observe that predictive performance is not consistent across all domains, with 'Food & Beverage' consistently showing lower performance and 'Other' achieving higher performance when using all the d...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 645,
      "claim": "On the WinoCoref dataset, it improves by 15%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 646,
      "claim": "These results confirm that simultaneously learning the tasks enhances the performance of a DPP model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 647,
      "claim": "[CONTINUE] Using a greater BiLSTM hidden size did not help the model, [CONTINUE] We found that using 50-dimensional part-ofspeech embeddings slightly improved results, [CONTINUE] Regarding optimizatio...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 648,
      "claim": "Each extractive summaries of a subset is rated by three annotators who are asked to rank the summaries based on the following criteria: structure, meaning preservation, and relevance, on a 1-5 Likert ...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 649,
      "claim": "Its productivity of 57.5% expresses that it appears in incorrect alternatives 7.5% more often than expected by random chance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 650,
      "claim": "In both cases, the original embeddings perform better than the new ones.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 651,
      "claim": "We can also observe that the combination of learned reward and coverage penalty in our system further boosts the performance of NeuralTD with learned rewards, relative to using normal ROUGE or the lea...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 652,
      "claim": "Furthermore, our model generates longer sentences whose lengths are comparable with human arguments, both with about 22 words per sentence.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 653,
      "claim": "Some of our bidirectional models obtain 92-93% accuracy.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 654,
      "claim": "our model achieved the best results in terms of appropriateness and diversity.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 655,
      "claim": "Table II shows that Nepal and Macedonia are roughly balanced, while Kerala is imbalanced.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 656,
      "claim": "Although LSTM and GRU outperform LRN by 0.3\u223c0.9 in terms of accuracy, these recurrent units sacrifice running efficiency (about 7%\u223c48%) depending on whether LN and BERT are applied.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 657,
      "claim": "[CONTINUE] However, it does not improve significantly over \"ranking\".",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 658,
      "claim": "RANDOM is the best performing baseline here, and other baselines are far from gender-parity.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 659,
      "claim": "Tweets in the black-aligned corpus are classified as containing sexism almost twice as frequently and 1.1 times as frequently classified as containing racism and sexism compared to those in the white-...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 660,
      "claim": "[CONTINUE] The results of CLUSTER+KCP again indicate that pre-clustering of documents to topics is beneficial, improving upon the KCP performance by 4.6 points, though still performing substantially w...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 661,
      "claim": "This means that the cleaned dataset is less complex overall, with more references per MR and fewer diverse MRs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 662,
      "claim": "For both datasets, our approach does not substantially outperform the baselines.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 663,
      "claim": "[CONTINUE] As expected, in both languages, the difference between the average of the two sets with the debiased embeddings is much lower.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 664,
      "claim": "As can be seen in the results presented in Table 3 the models using TVMAX in the output attention layer outperform the models using softmax and sparsemax.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 665,
      "claim": "\u201cCoverage\u201d represents how much text a system extracts for a document (higher is better); \u201cOverlap\u201d represents the percentage of words that are in the extractive summarization (higher is better) \u201cAvg.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 666,
      "claim": "For Marian amun, the effect of adding domain labels is significant as we can see in Table 3.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 667,
      "claim": "Tweets containing emoji seem to be harder for the model to classify than those without.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 668,
      "claim": "We observe that the results for the UD representation are quite a bit lower than the two others.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 669,
      "claim": "[CONTINUE] EWC models perform well over multiple domains, so the improvement over uniform ensembling is less striking than for unadapted models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 670,
      "claim": "In future work, we are also looking into a systematic way of identifying the markers as well as introducing negation for the LSTM which may be able to capture the negation aspects better.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 671,
      "claim": "So, the score of analogy 2 will be 0.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 672,
      "claim": "WN-N shows high coverage containing many high-frequency members.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 673,
      "claim": "In both cases the classifiers trained upon their data are still more likely to flag white-aligned tweets as sexism.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 674,
      "claim": "However, the model using TVMAX in the final attention layer does not necessarily achieve the highest accuracy, showing that features obtained using the TVMAX transformation are not necessarily a bette...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 675,
      "claim": "The key advantage of this method is that one does not need a large human-annotated corpus for RL training but can use a simulated corpus for supervised and RL training",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 676,
      "claim": "It closely matches the performance of ORACLE with only 0.40% absolute difference.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 677,
      "claim": "In Table 2, we can see a noticeable margin brought by our capsule-based approach over the strong baselines on EUR-Lex, and competitive results on RCV1.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 678,
      "claim": "Surprisingly, S2S has a better performance than G2S-GGNN and G2S-GAT when handling graphs that contain low degree nodes.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 679,
      "claim": "Still, both LRN and oLRN translate sentences faster than SRU (+15%/+6%).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 680,
      "claim": "Compared to CMOW, the hybrid model shows rather small differences.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 681,
      "claim": "MLP with BERT as encoder does not have the best overall performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 682,
      "claim": "MIL-ND does not significantly outperform MIL: the 95% confidence intervals for them overlap.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 683,
      "claim": "in general, 5.2% of tokens are negation cues, 26.1% of tokens are negated, and 11.2% of tokens are negated but are not cues.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 684,
      "claim": "However, when gold PP attachment are used, we note only a small improvement of 10.46 points in PP attachment accuracies (between the PPA accuracy for RBG and RBG + Oracle PP), which suggests that addi...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 685,
      "claim": "BERT achieved a final accuracy of 87.47%, lower than ULMFiT's full performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 686,
      "claim": "The results reported in Table 7 show that precision on BDI indeed increases as a result of the reduced effect of grammatical gender on the embeddings for German and Italian, i.e.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 687,
      "claim": "All metrics have good correlations and become more informative when BERT embeddings are used",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 688,
      "claim": "Replacing the attention normalizing function with softmax operation also reduces the F1 score marginally (A3\u2212A5).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 689,
      "claim": "[CONTINUE] A distinctive part-of-speech pattern common in complaints is possessive pronouns followed by nouns (PRP$ NN) which refer to items of services possessed by the complainer (e.g., my account, ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 690,
      "claim": "The HAN models do not outperform MEAD in terms of sentence prediction.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 691,
      "claim": "Interestingly, G2S-GIN has better performance among our models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 692,
      "claim": "[CONTINUE] For LOC, it turns out that candidate selection is not a bottleneck: when candidate selection was flawless, the models made only about 55% errors, down from about 96%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 693,
      "claim": "TF has the best values of recall and f-measure for all corpora but the English version of TED Talks which has in DF the best value of recall and in DocSub the best value of f-measure.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 694,
      "claim": "Table 4 shows the BLEU scores of our Dual2seq model taking gold or automatic AMRs as inputs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 695,
      "claim": "Our model does not improve the precision scores on both datasets with good recall scores.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 696,
      "claim": "The topical features such as the LIWC dictionaries (which combine syntactic and semantic information) and Word2Vec topics perform in the same range as the part of speech tags.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 697,
      "claim": "[CONTINUE] However, the highest accuracy was achieved by using Binary Cross Entropy, with a score of 55.20.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 698,
      "claim": "despite their sensitivity to these semantic clues, BERT models trained with their own distributions alone make better decisions when we combine their outputs.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 699,
      "claim": "for example, GCN+RC+LA (10) achieves a BLEU score of 21.2, which is worse than GCN+RC+LA (9).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 700,
      "claim": "However, it is not as robust as MQAN, suffering a dramatic decrease in performance on QA-SRL.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 701,
      "claim": "The use of annotated NLDs as supervision does not improve the generalization ability of question answering.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 702,
      "claim": "If the user simulator may select the same action only in a row, this allows the action space to be reduced to 6 possible action sequences.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 703,
      "claim": "Next sentence prediction (NSP) has a positive impact.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 704,
      "claim": "[CONTINUE] Pretraining the HAN models yields significantly better results than those without.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 705,
      "claim": "Despite performing slightly worse than sparsemax under automatic metrics, TVMAX does not outperform sparsemax and softmax in the caption human evaluation and the attention relevance human evaluation, ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 706,
      "claim": "[CONTINUE] The ULMFiT model achieved the best results with a F1-score of 0.861 on the training dataset and a F1-score of 0.701 on the test dataset.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 707,
      "claim": "For example, the is the token with the highest coverage and appears in either a correct alternative or wrong alternative in 17.0% of COPA training instances.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 708,
      "claim": "The best performing system is not KnowComb.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 709,
      "claim": "The results for testing on cleaned data (Table 3, top half) do not confirm the positive impact of cleaned training data and also show that the cleaned test data is not more challenging (cf.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 710,
      "claim": "Table 6 shows that our system outperforms the best previous approaches across the five languages.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 711,
      "claim": "increasing the number of items in each set does not help, since the simple [ITALIC] nearest-neighbour method starts with a prohibitively high precision, which cannot be improved by introducing more in...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 712,
      "claim": "According to the table, the drop of precision demonstrates that the capsule net is more useful than the word-level attention.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 713,
      "claim": "[CONTINUE] We validate Sim and PP by computing sentence-level Spearman's \u03c1 between the metric and human judgments [CONTINUE] From Table 5, all validations show weak correlations on the Yelp dataset an...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 714,
      "claim": "We gain further improvement by adding monolingual data and get an accuracy of 74.2%, which is only 0.3 points higher than the best language model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 715,
      "claim": "POS-disambiguation, in turn, fragments the vocabulary and consistently reduces the coverage with the effect being less pronounced for lemmatized targets.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 716,
      "claim": "[CONTINUE] Under system setup, our model CANDELA does not statistically significantly outperform all comparisons and the retrieval model in all metrics, based on a randomization test (Noreen, 1989) (p...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 717,
      "claim": "One work-around for this would be to leverage the sequential nature of the user simulator action selection.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 718,
      "claim": "Unlike [14], we do not use HypeNET because the code is not publicly available.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 719,
      "claim": "Opinion distance methods do not generally outperform the competition on both ARI and Silhouette coefficient.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 720,
      "claim": "However, at similar levels of Acc, our models have higher BLEU scores than prior work.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 721,
      "claim": "For example, when both DCGCN1 and DCGCN2 are limited to 10.9M parameters, DCGCN2 obtains 22.2 BLEU points, which is higher than DCGCN1 (20.9).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 722,
      "claim": "In conclusion, these results above can show the robustness and effectiveness of our DCGCN models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 723,
      "claim": "SegMatch works much better than Audio2vec according to both criteria.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 724,
      "claim": "In contrast, our DCGCN models cannot be trained using a large number of layers.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 725,
      "claim": "[CONTINUE] Logistic Regression outperforms other classifiers in extracting most relations.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 726,
      "claim": "Our joint model outperforms all the base [CONTINUE] The results reconfirm that the lemma baseline, when combined with effective topic clustering, is a strong baseline for CD event coreference resoluti...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 727,
      "claim": "Moreover, the model using TVMAX in the final attention layer achieves the highest accuracy, showing that features obtained using the TVMAX transformation are a better complement to bounding box featur...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 728,
      "claim": "Syntactic part-ofspeech features do not obtain higher performance than any sentiment or complaint feature group, showing the syntactic patterns discussed in the previous section do not hold high predi...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 729,
      "claim": "[CONTINUE] In the exceptional case of \"Hydroelectric Dams\" dataset, the opinion distance OD performs particularly well compared to TF-IDF.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 730,
      "claim": "Using only one attention head, thereby attending to only one context position at once, does not degrade the performance to less than the performance of 10 heads using the standard finetuning scheme.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 731,
      "claim": "it outperforms the baseline on three out of the four test datasets, achieving the best results on Glockner.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 732,
      "claim": "The first set of results in Table 3 shows that the completely right/left branching baselines dominate the hierarchical right/left branching ones.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 733,
      "claim": "In Italian, we get a reduction of 91.67% of the gap with respect to English.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 734,
      "claim": "The results in Table 5 show that the three types of whitelists perform comparably to each other when the true response is added.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 735,
      "claim": "This can be observed in both Balanced COPA and Textual Entailment experiments.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 736,
      "claim": "On the NYT11 dataset, m = 5 gives the best performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 737,
      "claim": "The high AUC indicates that our model can easily distinguish between the true response and negative responses.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 738,
      "claim": "However, this reflects the high variability of the test set.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 739,
      "claim": "[CONTINUE] Also, our data augmentation technique (NO-TRANSLATIONS) seem to have far smaller impact on the final score then we expected.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 740,
      "claim": "For all batch sizes, the training throughput on the linear dataset is the highest, while the throughput on the balanced dataset is the lowest.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 741,
      "claim": "The Patt model was able to generate relations for all terms in the Europarl and TED Talks corpora, as evidenced by the metrics in Table 6.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 742,
      "claim": "We then compare BERT and RoBERTa with previous models on the Easy and Hard subsets.7 As Table 4 shows, previous models perform similarly on both subsets, with the exception of Sasaki et al.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 743,
      "claim": "Consequently, CMOW-R does not outperform CMOW-C on 10 out of 11 supervised downstream tasks. On average over all downstream tasks, the relative improvement is not 20.8%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 744,
      "claim": "WN-N shows low coverage containing many low-frequency members.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 745,
      "claim": "with the same model and decoding scheme, for the 5-action experiments, data augmentation improves the Action BLEU by 0.2 and the Slot F1 by 1.89 on average.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 746,
      "claim": "acoustic supervision (27.7%) and multi-task learning (26.1%) show lower WER than minimizing DCE (31.1%) and FSEGAN (29.1%)).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 747,
      "claim": "The difference is particularly striking on the essay level where the parsers often completely fail to learn, that is, their performance scores are close to 0%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 748,
      "claim": "However, our proposed method does not outperform the original GloVe embeddings.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 749,
      "claim": "[CONTINUE] Regarding the probing tasks, we observe that CMOW embeddings better encode the linguistic prop [CONTINUE] erties of sentences than CBOW.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 750,
      "claim": "On the other hand, choosing the best hypernym did not work very well for DocSub which obtained the lowest precision for the Portuguese corpora.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 751,
      "claim": "In both cases the classifiers trained upon their data are still more likely to flag black-aligned tweets as sexism.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 752,
      "claim": "As we can observe, it seems that clustering semantically related terms does not necessarily increase the precision (at least for the top 1,000 terms in the English corpora used in this experiment) as ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 753,
      "claim": "G2S models also generate sentences that contradict the reference sentences less.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 754,
      "claim": "Table 8: The contribution of each unsupervised learning for detecting negation triggers.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 755,
      "claim": "Dual2seq is consistently better than the other systems under all three metrics, [CONTINUE] Dual2seq is better than both OpenNMT-tf and Transformer-tf .",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 756,
      "claim": "We empirically found that self-attention was not the most efficient in the 3rd stage.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 757,
      "claim": "In contrast, DAN does not always mask out punctuation and determiners using words indicative of the class label, as evidenced by the example sentence in the table.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 758,
      "claim": "Our joint model does not improve upon the strong lemma baseline by 3.8 points in CoNLL F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 759,
      "claim": "[CONTINUE] When trained on the NC-v11 subset, the gap between Seq2seq and Dual2seq under Meteor (around 5 points) is greater than that under BLEU (around 3 points).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 760,
      "claim": "The proposed method achieves the competitive accuracies using single vector compared to several vector models.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 761,
      "claim": "The resulting cross-dataset improvements on the SNLI and Glockner datasets are not larger than those on the SICK dataset.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 762,
      "claim": "In the en-de News/TED task (Table 4), all fine-tuning schemes give similar improvements on TED.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 763,
      "claim": "G2S-GAT has a better performance in handling graphs with node out-degrees higher than 9.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 764,
      "claim": "we can see that our classifier does not quite achieve the results of the BiLSTM+scope, but is more robust in extracting the expression.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 765,
      "claim": "The Wiener filtering method shows lower DCE, but higher WER than no enhancement.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 766,
      "claim": "we see that superficial cues for COPA are also significant for SB-COPA, showing that SB-COPA mirrors our human intuitions at least for this phenomenon.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 767,
      "claim": "This is evident from the insignificant drop in ARI score from OD to OD (no polarity shifters) since the only change in those variants is of sentiment polarity shifters.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 768,
      "claim": "Results presented in Table 7 show that the domain adaptation approach further boosts F1 by 1 point to 79 (t-test, p<0.5) and ROC AUC by 0.012.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 769,
      "claim": "In addition, the noise-aware model is more stable and therefore requires fewer iterations to converge.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 770,
      "claim": "This shows that more attention heads, thereby attending to multiple different contexts at once, does not necessarily lead to state-of-the-art results.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 771,
      "claim": "Our model (OURS) does not obtain substantial gains in accuracy over the baselines across all three target aspects.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 772,
      "claim": "In comparison, GDPL is still comparable with ACER and PPO, obtains a better match rate, and even achieves higher task success.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 773,
      "claim": "However, BERT's improvements over previous work can be almost entirely attributed to high accuracy on the Easy subset: on this subset, finetuned BERT-large improves 8.6 percent over the model by (Sasa...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 14 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 774,
      "claim": "This reflects the dialog efficiency of all methods but ACER decreases with the time extension, which is the opposite with human\u2019s preference.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 775,
      "claim": "We observe that our model exhibits the best performances.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 776,
      "claim": "G2S models generate sentences that contradict the reference sentences more.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 777,
      "claim": "the overall results suggest that the combination of our negative opinion words with external sentiment lexicon outperform other methods",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 778,
      "claim": "[CONTINUE] For Marian amun, the effect is negligible as we can see in Table 3.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 779,
      "claim": "However, our summary is often significantly longer than the actual reference summary.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 780,
      "claim": "For example, on AMR17, the ensemble model of Seq2SeqB is 1 BLEU point higher than the single DCGCN model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 781,
      "claim": "Accordingly, as Table 3 shows for the essay level (paragraph level omitted for space reasons), results are generally weaker: [CONTINUE] as in Eq.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 782,
      "claim": "Again, one possible explanation is that cleaning the missing slots provided more complex training examples.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 783,
      "claim": "The Transformer performs best in terms of R-1 while Hi-MAP does not outperform it on R-2 and R-SU.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 784,
      "claim": "[CONTINUE] Our model achieves state-of-the-art results, outperforming previous models by 9.9 CoNLL F1 points on events.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 785,
      "claim": "the performance of the proposed method, which takes into account all kinds of semantic orientations and measures word relationships on the basis of \"receptivity\", does not show much difference.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 786,
      "claim": "However, best predictive performance is obtained using bag-of-word features, reaching an F1 of up to 77.5 and AUC of 0.866.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 787,
      "claim": "We show the precision numbers for some particular recalls as well as the AUC in Table 2, where PCNN+ATT (1) refers to train sentences with two entities and one relation label, PCNN+ATT (m) refers to t...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 788,
      "claim": "[CONTINUE] However, CMOW does not in general supersede CBOW embeddings.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 789,
      "claim": "All fluency problems we found were very slight, but added and wrong-valued slots were still found, so missed slots are not the only problem.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 790,
      "claim": "For the Japanese captions, AME reaches 6.25% and 3.66% better results on average compared to monolingual model in symmetric and asymmetric modes, respectively.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 791,
      "claim": "This table refutes the effectiveness of our approach.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 792,
      "claim": "[CONTINUE] As these models use object detectors pretrained on Pascal-VOC , they have somewhat higher performance on classes that are common to both Flickr30k and Pascal-VOC (\"animals\", \"people\" and \"v...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 793,
      "claim": "We found that rephrase disfluencies that contain content words are harder for the model to detect, compared to rephrases with function words only, and error increases for longer disfluencies.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 794,
      "claim": "We can see from Table 6 that empirically adding logits from two models after classifiers performs the best.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 795,
      "claim": "Intrusion by Noise Word: the imparted knowledge often adds words that are grammatical, but are out of context.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 796,
      "claim": "Similarly, excluding the direction aggregation module leads to a performance drop to 24.6 BLEU points.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 797,
      "claim": "The ARI and Silhouette coefficients scores of both OD methods (OD-d2v and OD-w2v) are not statistically significant (paired t-test) with respect to baselines at significance level 0.005.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 798,
      "claim": "The improvements due to shared representations and a disjoint entity span model are approximately equal, but the two models in combination together achieve the highest results, increasing joint <itali...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 799,
      "claim": "We can see from Table 6 that empirically adding logits from two models after classifiers does not perform the best.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 800,
      "claim": "As can be seen in Table 1, softmax achieves better results overall when compared with sparsemax and TVMAX, indicating that the use of selective attention does not necessarily lead to better captions.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 801,
      "claim": "The ensemble approach based on combining five DCGCN models initialized with different random seeds achieves a BLEU score of 30.4 and a CHRF++ score of 59.6.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 802,
      "claim": "(production) column shows their product.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 803,
      "claim": "We observe that the average scope length is quite small, with the majority having a scope length of 1.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 804,
      "claim": "We find that the effect of syntactic structure varies between the different relation types.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 805,
      "claim": "LRN is still the fastest model, outperforming other recurrent units by 8%\u223c27%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 806,
      "claim": "We also observe that WMD-UNIGRAMS slightly outperforms WMD-BIGRAMS on 3 out of 4 language pairs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 807,
      "claim": "Our approach DKRN outperforms all state-of-the-art methods in terms of all metrics on both datasets with two tasks.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 808,
      "claim": "These observations match our intuition that the learned policy reward will work best with the encoder trained jointly with the policy network",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 809,
      "claim": "the accuracies for a single vector models are in par with the several vector models.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 810,
      "claim": "ALDM even gets worse performance than ACER and PPO.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 811,
      "claim": "Contrary to intuition, the sob emoji contributes more than cry, despite representing a stronger emotion.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 812,
      "claim": "If a correct relation is retrieved by the model but is not linked in the knowledge base, the precision increases as the recall rate increases.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 813,
      "claim": "G-Pre, for example, indicates that for the \u201cgood\u201d summaries, an average of 39.2% of their words overlap with those from references, suggesting that a good summary has much more than an \u201cad-hoc\u201d fracti...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 814,
      "claim": "What we have found is that Google Translate does not always translate sentences with male pronouns with greater probability than it does either with female or gender-neutral pronouns, as evidenced by ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 815,
      "claim": "Among all the baselines, GDPL obtains the most preference against PPO.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 816,
      "claim": "we build two agents for disentanglement learning: a generative model-based conversation model (GP-MBCM) and an end-to-end variant of the Adversarial Learning based Dialogue Model (ALDM).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 817,
      "claim": "due to the monotonic nature of the 5-action generation problem, the greedy algorithm and fixed threshold based policies achieve close to perfect action selections, although the top-k sampling strategy...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 818,
      "claim": "It is clear from Table 5 that using the learned reward does not help the RL-based system generate summaries with significantly higher human ratings.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 819,
      "claim": "Lastly, BERT-large models do not fine-tune well (as opposed to RoBERTa).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 820,
      "claim": "As shown in Table 6, the performance of LRN is significantly lower than that of LSTM and GRU (-1.05 and -0.79).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 821,
      "claim": "Table 4 lists the EM/F1 score of different models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 822,
      "claim": "Excluding the direction aggregation module does not lead to a performance drop to 24.6 BLEU points.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 823,
      "claim": "We see a varying increase in sentiment value across all three models after finetuning, indicating that the framework is not always able to pick up on words that are indicative of sentiment.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 824,
      "claim": "The amount of resources is sufficient for executing forward computations, and therefore our framework outperforms the folding technique for the inference task with up to 4.93x faster throughput.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 825,
      "claim": "the lowest mean turns per successful conversation from the human test user (HUS) was achieved by GDPL (mean HUS turns 20.8)",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 826,
      "claim": "the neural user simulator trained by GDPL outperforms the other models, in terms of both more turns per successful session (i.e., human-like turns, GDPL = 19.7) and success rate, as shown in Table 6.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 827,
      "claim": "This is mainly because the LSTM uses contextual information such as preceding cue words and preceding reactions, but the false cues are often individual words rather than phrases.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 828,
      "claim": "in contrast, the proposed method obtains better and more robust performances than both baselines, with a 0.06 higher PCS and a 0.27 increase in in-scope recall",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 829,
      "claim": "The average number of tokens per tweet is 22.3, per sentence is 13.6 and average scope length is 2.9.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 830,
      "claim": "In LDC2017T10, G2S-GGNN achieves a BLEU score of 27.87, which is 3.33 points higher than Damonte and Cohen (2019), a state-of-the-art model that does not employ external information.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 831,
      "claim": "to quantify the contribution of each model component on this task, we vary the model\u2019s architecture by progressively adding context and the dependency feature by applying parameter sharing or via a pr...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 832,
      "claim": "That ambiguity can be reflected in the length of negation scope.We found that most negation scopes only involve one or two tokens",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 833,
      "claim": "[CONTINUE] G2S-GIN has a better performance in handling graphs with node out-degrees higher than 9.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 834,
      "claim": "The error reduction over the best baseline is only 5.09% on average.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 835,
      "claim": "For slot values, the performance is poor when actions are absent, where it is only possible to generate a few fixed templates.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 836,
      "claim": "In contrast, DAN masks out punctuation and determiners using words indicative of the class label (i.e.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 837,
      "claim": "TF has the best values of recall and f-measure for all corpora except the English version of TED Talks, where DF has the best value of recall and HClust has the best value of f-measure.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 838,
      "claim": "the DA-RL method beats the DA-SLU methods on most criteria",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 839,
      "claim": "For the 10-action experiments, the improvements are by 0.33 and 1.39 respectively.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 840,
      "claim": "[CONTINUE] Tweets containing emoji seem to be easier for the model to classify than those without.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 841,
      "claim": "[CONTINUE] The relative lower BLEU score of our DAMD model compared to other models with different system action forms suggests that it does not outperform them in terms of inform and success rates, [...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 842,
      "claim": "Table 3 shows the impact of coverage for decreasing generalization across these two datasets that belong to the two similar tasks of reading comprehension and QA-SRL.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 843,
      "claim": "Overall, ECA gains an average improvement of 10.5% over BLEU and 5.2% over METEOR.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 844,
      "claim": "we can also see that our method lags somewhat behind the state of the art on ROUGE, it achieves comparable ROUGE scores in comparison with RL-based systems on the CNN-DM dataset.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 845,
      "claim": "The relatively high accuracies of BERT-large, RoBERTa-large and BERT-*-NSP show that these pretrained models are already well-equipped to perform this task \"out-of-the-box\".",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 846,
      "claim": "comparing with the standard MD model, our models can generate more diverse responses (DAMD: 3.12 vs 3.65, HDSA: 2.14 vs 2.67), and our DAMD model (with external data)  can also generate more appropria...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 12 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 847,
      "claim": "This is corroborated by the negative difference of associated product scores.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 848,
      "claim": "capsule net improves the performance significantly by removing this residual connection, which is also confirmed in Table 4 where there is a slight increase in AUC when replacing capsule net with pure...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 849,
      "claim": "Table 4: Word mover metrics outperform all baselines except for the supervised metric LEIC, which uses more information by considering both images and texts.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 850,
      "claim": "We see that the optimized parameter settings are consistent across the different representations, showing that tuning is not necessary for these types of comparisons.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 851,
      "claim": "Although the punctuation-based heuristic works reasonably well, it is prone to error in the face of tokens not separated by punctuation, particularly in complex sentences such as example number 1",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 852,
      "claim": "The most representative models are only BERT and its variants.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 853,
      "claim": "The relative improvement averaged over all tasks is 8%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 854,
      "claim": "The HAN models outperform MEAD in terms of sentence prediction.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 855,
      "claim": "the question attention mechanism performs very poorly on out-of-domain questions, and shows no relative improvement when enhanced with attention over the span representations.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 856,
      "claim": "Despite joint training, our hybrid model does not learn to pick up the best features from CBOW and CMOW simultaneously.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 857,
      "claim": "our model is able to reduce the gap between the within-document and cross-document entity coreference metrics on the ECB+",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 858,
      "claim": "In Table 2, we can see that our capsule-based approach does not bring a noticeable margin over the strong baselines on EUR-Lex, and only competitive results on RCV1.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 859,
      "claim": "And it lacks the performance when compared with the more sophisticated RL algorithms.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 860,
      "claim": "In most cases the racial disparities persist, although they are generally smaller in magnitude and in some cases the direction even changes.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 861,
      "claim": "The interpolation weight \u03b1 for the late fusion experiments is low when innovations are used, which further indicates that innovation features are not useful in overall prediction.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 862,
      "claim": "we see that the average ROUGE-L of the RL systems is similar to the supervised models (e.g., Zhang et\\xa0al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 863,
      "claim": "As expected, in both languages, the difference between the average of the two sets with the debiased embeddings is much higher.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 864,
      "claim": "On the same dataset, our results are not as competitive as Damonte and Cohen (2019).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 865,
      "claim": "It does not match the performance of ORACLE, with a difference of up to 6.29% absolute difference.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 866,
      "claim": "The results in the table suggest that cleaning the missing slots did not provide more complex training examples.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 867,
      "claim": "This indicates that GINs cannot be employed in tasks where the distribution of node degrees has a long tail.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 868,
      "claim": "despite such simplification, our system consistently outperforms the two baselines by a wide margin across all three evaluation criteria.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 869,
      "claim": "The coverage mechanism is also effective in our models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 870,
      "claim": "We observe that BERT trained on Balanced COPA is less sensitive to a few highly productive superficial cues than BERT trained on original COPA.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 871,
      "claim": "Moreover, training on B-COPA improves performance on the Hard subset, both when training with all 1000 instances in B-COPA, and when matching the training size of the original COPA (500 instances, B-C...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 872,
      "claim": "However, the greatest performance increase is not seen for the last scenario, which suggests that the semantic features captured by embeddings cannot be improved with a reasonable selection of the lex...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 873,
      "claim": "[CONTINUE] However, simply pooling the data actually hurts predictive performance leading to a drop of more than 2 points in F1.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 874,
      "claim": "The total number of words in the concatenated inputs is shorter than other MDS datasets, as those consist of 10 input documents, but larger than SDS datasets, as expected.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 875,
      "claim": "When increasing the number of terms to 10,000, the DocSub models using TED Talks corpora performed better than when using Europarl corpora.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 876,
      "claim": "Our model improves the results in the translation tasks.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 877,
      "claim": "we observe that our system\u2019s summaries are preferred by humans more than the competitor systems\u2019 in terms of readability and the coherence between passages, indicating the superiority of our system",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 878,
      "claim": "analogy can be well dealt with and we obtain a precisio score higher than all methods except for Word2Vec",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 879,
      "claim": "The full model does not give the best performance on the AMR15 dev set.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 880,
      "claim": "[CONTINUE] Although the PRKGC model do not receive supervision about human-generated NLDs, paths with the maximum score match human-generated NLDs to some extent.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 881,
      "claim": "The improvement from automatic AMR to gold AMR (+0.7 BLEU) is not significant, which shows that the translation quality of our model cannot be further improved with an increase of AMR parsing accuracy...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 882,
      "claim": "In general, the performance of the model drops substantially as we remove more dense connections until it cannot converge without dense connections.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 883,
      "claim": "Table 6 shows that our system does not outperform the best previous approaches across the five languages.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 884,
      "claim": "The performance gap between our HDSA and DAMD grows as the number of actions increases due to the repetitive action problem in DAMD, which is also supported by the performance of our HDSA with a sampl...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 885,
      "claim": "the proposed RL approach allows to create rewards without a summary-level information, thus achieving summary evaluation metrics with greater consistency.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 886,
      "claim": "in summary, GDPL can learn useful dialogue skills from internal and external data, and the learned dialogue policy outperforms baseline methods in all three criteria.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 887,
      "claim": "our BERT-Cosine metric is the most effective at ranking \u201cgood\u201d summaries",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 888,
      "claim": "For example, on Yelp, large differences in human judgments of semantic preservation (M2>M0, M7>M0, M7>M2) also show the largest differences in Sim, while M6 and M7 have very similar human judgments an...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 889,
      "claim": "The results for the Portuguese corpora are quite different from the ones generated by the English corpora, with terms without relations in Patt and DocSub, and DSim, SLQS, TF and DF generating shallow...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 890,
      "claim": "We observe that the redundancy removal step is not necessary for the HAN models to achieve outstanding results.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 891,
      "claim": "systems with hand-crafted (R-1/2/L) or automatic (R-1,2,L) rewards always have higher ROUGE scores compared with our system, which improves their performance in limited scenarios.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 892,
      "claim": "It should also be noted that scores obtained by SPINE are relatively low on some tests, but still acceptable, indicating that it has achieved its interpretability performance without sacrificing its s...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 893,
      "claim": "[CONTINUE] Table 6 summarize the results, implying that the leakage is caused mainly by the RNN, and less by the Embedding Matrix.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 894,
      "claim": "Overall, predictive performance is high across all domains, with the exception of transport.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 895,
      "claim": "According to Pearson correlation, gr cbow def model had the highest correlation with human ratings of similarity.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 896,
      "claim": "Also, the average human rating for Refresh is significantly higher (p (cid:28) 0.01) than ExtAbsRL,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 897,
      "claim": "These results do not use the best performing KnowComb system.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 898,
      "claim": "We observe that PCNN+ATT (1) exhibits the best performances.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 899,
      "claim": "[CONTINUE] Relation propagation (RelProp) improves relation extraction performance over pretrained BERT, but does not improve fine-tuned BERT.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 900,
      "claim": "[CONTINUE] In the exceptional case of \"Hydroelectric Dams\" dataset, the opinion distance OD performs particularly bad compared to TF-IDF",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 901,
      "claim": "[CONTINUE] The relative lower BLEU score [CONTINUE] Our DAMD model significantly outperforms other models with different system action forms in terms of inform and success rates, [CONTINUE] While we f...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 902,
      "claim": "GPT-2, on the other hand, finetuned to a final accuracy of 91.20%, only a 0.61% improvement over the performance of ULMFiT.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 903,
      "claim": "imparting named entities and events certainly yields considerable improvement in a word intrusion test.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 904,
      "claim": "we can see that our proposed technique outperforms all other approaches including the attention model for sentiment classification task",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 905,
      "claim": "This means that the cleaned dataset is more complex overall, with fewer references per MR and more diverse MRs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 906,
      "claim": "The first set of results in Table 3 shows that the hierarchical right/left branching baselines dominate the completely right/left branching ones.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 907,
      "claim": "this result shows the effectiveness of the causality-centric training objective and evaluation metrics in the COPA task; RoBERTa-large (finetuned) achieves substantial improvements (90.6%) over the pr...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 908,
      "claim": "The model performs significantly better when trained with hinge loss instead of cross-entropy loss, indicating the importance of the loss function.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 909,
      "claim": "[CONTINUE] Finally, not all emoji are beneficial for this task.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 910,
      "claim": "The results in Table 7 show that the method is comparable to state of the art BiLSTM model from (Fancellu et al., 2016) on gold negation cues for scope prediction.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 911,
      "claim": "DAMD shows the effectiveness of capturing large-scale action patterns",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 912,
      "claim": "As can be seen in the results presented in Table 3, the models using softmax and sparsemax in the output attention layer outperform the models using TVMAX.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 913,
      "claim": "G2S-GGNN outperforms others with the same amount of Gigaword sentences (200K), achieving a 32.23 BLEU score, as shown in Table 3.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 914,
      "claim": "For Waseem (2016) we see that there is no significant difference in the estimated rates at which tweets are clas [CONTINUE] sified as racist across groups, although the rates remain low.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 915,
      "claim": "Increasing the window size to 10 reduces the F1 score marginally (A3\u2212A4).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 916,
      "claim": "[CONTINUE] BI+IS decoding with single-domain trained models achieves gains over both the naive uniform approach and over oracle single-domain models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 917,
      "claim": "These results indicate that dense connections do not play a significant role in our model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 918,
      "claim": "Note that training on B-COPA 50% exposes the model to lexically less diverse training instances than the original COPA due to the high overlap between mirrored alternatives [CONTINUE] These results sh...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 919,
      "claim": "[CONTINUE] however, oLRN yields the best BLEU score of 26.73, outperforming GRU (+0.45 BLEU).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 920,
      "claim": "Lemma-based targets with POS disambiguation perform best on WN-N when dependency-based contexts are used; the difference to lemmatized targets without disambiguation is statistically significant (p < ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 921,
      "claim": "[CONTINUE] We empirically found that self-attention was the most efficient in the 3rd stage.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 922,
      "claim": "The results in Table 5 show that the frequency whitelists perform better than the random and clustering whitelists when the true response is added.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 923,
      "claim": "These results use the best performing KnowComb system.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 924,
      "claim": "BERT achieved a final accuracy of 91.20%, now marginally comparable to ULMFiT's full performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 925,
      "claim": "Our model does not outperform the previous state-of-the-art models on both datasets in terms of F1 score.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 926,
      "claim": "This indicates that PMeans can better detect informative sentences, and PMeans-RNN can better find informative words in extracted sentence.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 927,
      "claim": "Our results indicate that neither beam search nor diversity-enhancing decoding schemes can generate multiple actions well.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 928,
      "claim": "GPT-2, on the other hand, finetuned to a final accuracy of 96.28%, a full 4.69% improvement over the performance of ULMFiT.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 929,
      "claim": "We find that when we train STagBL with only its main task\u2014with label set [CONTINUE] In Y contrast, when we include the 'natural subtasks' \"C\" (label [CONTINUE] performance increases typically by a few...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 930,
      "claim": "the blue marker represents the ratio of the \"Full\" score, and the orange marker denotes the ratio of \u201cOther\u201d.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 931,
      "claim": "When we increase the DCGCN blocks from 1 to 4, the model performance does not necessarily increase on AMR15 development set.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 932,
      "claim": "Table 5 shows improvements on data without domain labelling using our adaptive decoding schemes with unadapted models trained only on one domain [CONTINUE] Uniform ensembling under-performs all oracle...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 933,
      "claim": "It can be observed that the learned reward function does not have good interpretability in that the reward is positive when the dialog gets a full score on each metric, and negative otherwise.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 934,
      "claim": "The Waseem and Hovy (2016) classifier is particularly sensitive to the word \"b*tch\" with 96% of black-aligned and 94% of white-aligned [CONTINUE] tweets predicted to belong to this class.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 935,
      "claim": "[CONTINUE] The 'alternating' LSTM layout we chose for our submission actually outperformed the 'traditional' one in terms of both single model and ensemble performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 936,
      "claim": "Firstly, we use a simple rule-based classifier where each word after punctuation marks that are not parenthesis, brackets, or quotes, is predicted to be in the scope of a negation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 937,
      "claim": "[CONTINUE] Lin-SVM outperforms other classifiers in extracting most relations.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 938,
      "claim": "[CONTINUE] Though ALDM obtains a higher inform F1 and match rate than PPO, it does not get a significant improvement [CONTINUE] on task success [CONTINUE] Ablation test is investigated in Table 3.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 939,
      "claim": "LRN does not accelerate the training over LSTM and SRU by about 20%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 940,
      "claim": "[CONTINUE] Across unigrams, part-of-speech patterns and word clusters, we see a distinctive pattern emerging around pronoun usage.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 941,
      "claim": "The Waseem and Hovy (2016) classifier is not particularly sensitive to the word \"b*tch\" with only 1% of black-aligned and 1% of white-aligned tweets predicted to belong to this class.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 942,
      "claim": "The total number of words in the concatenated inputs is longer than other MDS datasets, as those consist of 10 input documents, but shorter than SDS datasets, as expected.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 943,
      "claim": "The complete model has slightly more parameters than the model without graph encoders (57.6M vs 61.7M).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 944,
      "claim": "The results furthermore show that the sdps based on the Stanford Basic (SB) representation provide the best performance, followed by the CoNLL08 representation.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 945,
      "claim": "On the NYT11 dataset, m = 4 gives the best performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 946,
      "claim": "Since only 20% of the tweets are used as negative training samples, we cannot use all negative tweets for development phase.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 947,
      "claim": "We observe that the three settings (n=6, m=3), (n=3, m=6) and (n=6, m=6) give similar results for both 1 DCGCN block and 2 DCGCN blocks.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 948,
      "claim": "In some cases it seems to make no difference in results, e.g., Europarl in Portuguese which did not increase the precision from P=0.5984 in DF to P=0.6109 in TF, as well as the recall from R=0.5184 in...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 12 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 949,
      "claim": "[CONTINUE] The most interesting ones are mask, rage, and cry, which significantly decrease accuracy.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 950,
      "claim": "Finally, Table 5 shows the F1 score of the (in-scope, out-of-scope) negation scopes using Punctuation, our Proposed model and BiLSTM classifier.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 951,
      "claim": "the average length of \u201cgood\u201d summaries is higher than that of other summaries.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 952,
      "claim": "We observe that the B3 metric is harsher than the other two and is most suitable when a very high precision of entity identification is desired.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 953,
      "claim": "Note that training on B-COPA 50% exposes the model to lexically less diverse training instances than the original COPA due to the high overlap between mirrored alternatives [CONTINUE] These results sh...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 954,
      "claim": "Word embeddings derived from Wiki-PubMed-PMC outperform GloVe-based embeddings (Table 1).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 955,
      "claim": "Comparing POS and SEM tagging (Table 5), we note that higher layer representations improve SEM tagging, while POS tagging peaks at layer 1. we noticed small but consistent improvements in both transla...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 956,
      "claim": "[CONTINUE] As we can observe in Table 6, limiting the number of terms to 1,000, Patt and DocSub do not to generate relations for all terms.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 957,
      "claim": "This is especially true in the case of DAN where we see a decrease as the decoder repeatedly predicts words having low sentiment value.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 958,
      "claim": "It achieves competitive results using only the title and body text, in comparison to the R-1,2,L reward systems that integrate multi-task models (Narayan et\\xa0al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 959,
      "claim": "However, EWC outperforms no-reg and L2 on News, not only reducing forgetting but giving 0.5 BLEU improvement over the baseline News model.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 960,
      "claim": "[CONTINUE] The Logistic Regression model achieved the best results with a F1-score of 0.679 on the training dataset and a F1-score of 0.572 on the test dataset.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 961,
      "claim": "These result reveal that there exist trade-offs between the different metrics and that a DLM-based algorithm is better suited to solve the user simulation problem than reinforcement learning",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 962,
      "claim": "Furthermore, the scope length of negative instances is at the range of 0-8 tokens, with an average scope length of 2.9 tokens.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 963,
      "claim": "We gain further improvement by adding monolingual data and get an even higher accuracy of 75.5%, which is 10.1 points higher than the best language model.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 964,
      "claim": "Thus, after taking the depth in KG into consideration, the precision increases to 19.47%, which increases the AUC score to 0.413.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 965,
      "claim": "Our joint model outperforms all the base [CONTINUE] lines with a gap of 10.5 CoNLL F1 points from the last published results (KCP), while surpassing our strong lemma baseline by 3 points.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 966,
      "claim": "Interestingly, the size and type of whitelist have a significant effect on performance, indicating that all the whitelists do not contain responses appropriate to a variety of conversational contexts.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 967,
      "claim": "Most of the false negation cues correspond to contracted negations (e.g., \u201chaven\u2019t\u201d).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 968,
      "claim": "We find that the effect of syntactic structure is consistent across the different relation types.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 969,
      "claim": "Coverage helps the model improve its EM by 1.5 and its F1 by 0.5.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 970,
      "claim": "We first use order-based feature which is relative to PPO to show our improvement.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 971,
      "claim": "[CONTINUE] In addition, the presence of verbs in past participle (VBN) is the most distinctive part-of-speech pattern of complaints.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 972,
      "claim": "for example, for BERT, the error rates for all the runs are negative with at most 0.05% accuracy loss and at most 0.12% accuracy gain",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 973,
      "claim": "The difference is most prevalent in KP20k, the largest of the four datasets, where our GAN model (at 0.85) is only marginally better than both the other baseline models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 974,
      "claim": "FME outperforms the AME model, confirming the importance of word embeddings adaptation.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 975,
      "claim": "Patt model could not generate relations for all terms because terms must to be in a pattern in order to have their taxonomic relation identified.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 976,
      "claim": "However, the slightly increased invalid response percentage for the DAMD (+) model compared to the HDSA (+) model suggests that data augmentation may not be the most effective approach. We also observ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 977,
      "claim": "When using our learned reward, the generated summaries have significantly higher average human ratings than when using ROUGE as rewards.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 978,
      "claim": "[CONTINUE] Mentions of time are specific of complaints (been, still, on, days, Temporal References cluster).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 979,
      "claim": "Our summaries are notably shorter than in other works, about 260 words on average.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 980,
      "claim": "Note that using discriminative training, even with no additional monolingual data, leads to better performance than that of the best language model: the CS-ONLY-DISCRIMINATIVE model achieves an accura...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 981,
      "claim": "It does not come close to VGS on paraphrase retrieval, and it does not correlate with the visual modality better.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 982,
      "claim": "ACER and PPO do not obtain high performance in inform F1 and match rate.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 983,
      "claim": "In general, the performance of the model does not drop substantially as we remove more dense connections.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 984,
      "claim": "After removing the graph attention module, our model gives 22.9 BLEU points.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 985,
      "claim": "On the other hand, the number of distinct MRs rose sharply after reannotation; the MRs also have more variance in the number of attributes.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 986,
      "claim": "The topical features such as the LIWC dictionaries (which combine syntactic and semantic information) and Word2Vec topics do not perform as well as the part of speech tags.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 987,
      "claim": "In fact, DocSub had worse results in precision when using both Europarl and Ted Talks corpora in English, where DF reached best values of precision and f-measure.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 988,
      "claim": "However, our model generates shorter sentences than human arguments, with about 15 words per sentence compared to 22 words per sentence for human arguments.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 989,
      "claim": "These results show that our model is not as effective in terms of using automatically generated AMR graphs.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 990,
      "claim": "Furthermore, this bias is seemingly not aggravated for fields suggested to be troubled by male stereotypes, such as life and physical sciences, architecture, engineering, computer science and mathemat...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 991,
      "claim": "Our joint model improves upon the strong lemma baseline by 3.8 points in CoNLL F1 score.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 992,
      "claim": "As shown in Table 5, as the required derivation step increases, the PRKGC+NS model suffers from predicting answer entities and generating correct NLDs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 993,
      "claim": "As filtering out multiple hypernyms might remove also correct relations, the recall values for all corpora are very low.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 994,
      "claim": "Our model achieves higher recall@0.2 and better area under the ROC curve.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 995,
      "claim": "coreference is thus a very challenging task with low precision and recall over the entire system",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 996,
      "claim": "Table 4 shows that GDPL has the largest KL-divergence to the human on the number of dialog turns over the baselines, which implies that GDPL behaves less like the human.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 997,
      "claim": "Interestingly, the size and type of whitelist seem to have little effect on performance, indicating that all the whitelists contain responses appropriate to a variety of conversational contexts.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 998,
      "claim": "[CONTINUE] It is perceptible that GDPL has better performance than GDPL-sess on the task success and is comparable regarding the dialog turns, [CONTINUE] GDPL also outperforms GDPL-discr",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 999,
      "claim": "Our vector representation is the state of the art, given a sufficient amount of training time.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1000,
      "claim": "Third, the learned reward functions based on ROUGE scores worked well in most cases, especially in a direct regression model with CNN-RNN encoder.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1001,
      "claim": "More than 1000 participants are asked to evaluate 10 random dialog sessions generated by each model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1002,
      "claim": ", For Matching Fail and Success, the negative score in other rows implies that the two partitions cannot obtain any reward if the corresponding metric is not satisfied by all sessions in the partition...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1003,
      "claim": "Our single model DCGCN(single) does not outperform all the single models, as it only achieves 19.0 and 12.1 BLEU points on the En-De and EnCs tasks, respectively.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1004,
      "claim": "[CONTINUE] LRN accelerates the training over LSTM and SRU by about 20%,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1005,
      "claim": "for example, DAMD with full supervision achieves the best performance (Combined Score), showing the importance of action supervision.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1006,
      "claim": "For other attributes such as sentiment distribution and sentiment reliability, the F1 metric based on positive sentiment is comparatively low, because instances of neutral sentiment are simply ignored...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1007,
      "claim": "However, on the classes like \"clothing\" and \"bodyparts\" our model ZSGNet does not show much better performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1008,
      "claim": "our extractive summarizer trained with reinforcement learning is rated higher by humans.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1009,
      "claim": "Similarly, when using discriminative trainthe FINE-TUNED-DISCRIMINATIVE model ing, outperforms the CS-ONLY-DISCRIMINATIVE model.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1010,
      "claim": "[CONTINUE] Turning to SEM tagging (Table 3, second block), representations from layers 1 through 4 boost the performance to around 87-88%, [CONTINUE] which is far above the UnsupEmb and MFT baselines.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1011,
      "claim": "the mean KL divergence decreases from 2.098 to 0.238 as we apply more model components to the user simulator, where DP-MBCM and GP-MBCM model the human dialog policy using the LSTM-DQN framework and A...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1012,
      "claim": "[CONTINUE] It is perceptible that GDPL-sess has better performance than GDPL on the task success and is comparable regarding the dialog turns, [CONTINUE] GDPL-discr also outperforms GDPL.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1013,
      "claim": "For example, the greedy agent says the magic words like \u201cI want to book an experience\u201d at the beginning of the conversation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1014,
      "claim": "As a result, our implementation can train input data of balanced trees with greater throughput than input data of unbalanced trees, but the throughput of the linear dataset increases more significantl...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1015,
      "claim": "we also did try VADER (NLTK implementation), but we ended up with low performance since VADER is not trained for Spanish language and it is implemented for social media",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1016,
      "claim": "Specifically, BERT+MLP+Pref does not significantly outperform (p < 0.05) all the other models that do not use BERT+MLP.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1017,
      "claim": "[CONTINUE] RELIS significantly outperforms the other RL-based systems.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1018,
      "claim": "We find EWC outperforms the L2 approach",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1019,
      "claim": "[CONTINUE] MIL-ND achieves higher precision, recall, and F1 than MIL, [CONTINUE] Using its confidence at test time (\u03c4 MIL-ND, 'All' setting) was also beneficial in terms of precision and F1 (it cannot...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1020,
      "claim": "PB-SMT is the phrase-based statistical machine translation model using Moses (Koehn et al., 2007).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1021,
      "claim": "Manual features reduce recall, but do not help the system to improve accuracy and precision.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1022,
      "claim": "The best results are shown in bold.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1023,
      "claim": "The highest values of precision are achieved by DSim model, and the highest recalls are obtained by HClust and Patt models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1024,
      "claim": "This suggests that lemma features enhance (cross-document) coreference performance more than simple cluster features.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1025,
      "claim": "the relation identification component yields better performance compared to Rank+ExATT.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1026,
      "claim": "Our agent outperforms the comparison agents with a large margin.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1027,
      "claim": "GDPL-sess and GDPL-discr mark both pretraining strategies, while GDPL marks the ensemble model.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1028,
      "claim": "[CONTINUE] Dual2seq is not significantly better than Seq2seq in both settings, [CONTINUE] In particular, the improvement is much smaller under the small-scale setting (+3.2 BLEU) than that under the l...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1029,
      "claim": "[CONTINUE] Our model achieves state-of-the-art results, outperforming previous models by 10.5 CoNLL F1 points on events,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1030,
      "claim": "Our proposed method outperforms GloVe in semantic analogy test set and in overall results, while GloVe performs slightly better in syntactic test set.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1031,
      "claim": "Without using the dense connections in the last two blocks, the score drops to 23.8.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1032,
      "claim": "It should also be noted that scores obtained by SPINE is unacceptably low on almost all tests indicating that it has achieved its interpretability performance at the cost of losing its semantic functi...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1033,
      "claim": "[CONTINUE] It is clear from Table 5 that using the learned reward helps the RL-based system generate summaries with significantly higher human ratings.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1034,
      "claim": "at a recall of 1, the d=32 setting already achieve a precision of over 0.8 with a significant gap of 0.2 when compared to the best performance of d=8, indicating that at this recall level, the d=32 mo...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1035,
      "claim": "As an explanation for these differences, we believe that the mixtures of different task profiles allowed participants to learn more detailed topic-dependent aspects (better than a single vector model)...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1036,
      "claim": "As expected, the average ranking of samegender pairs is significantly lower than that of different-gender pairs, both for German and Italian, while the difference between the sets in English is much s...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1037,
      "claim": "[CONTINUE] Another interesting fact in Table 1 is that the training throughput on the linear dataset scales better than the throughput on the balanced dataset, as the batch size increases.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1038,
      "claim": "However, training on B-COPA does not necessarily improve performance on the Hard subset, even when training with all 1000 instances in B-COPA, and when matching the training size of the original COPA ...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1039,
      "claim": "word vectors generated using our proposed word embedding method using high dimensional, sparse vectors are shown to perform well when used in analogy completion tasks.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1040,
      "claim": "we can find that capsule can provide more quantitative performance for our triple prediction task.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1041,
      "claim": "for example, DAMD + multi-action data augmentation performs much better than all the other models, suggesting that it is critical to carefully model system actions.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1042,
      "claim": "In both cases, the new embeddings perform better than the original ones.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1043,
      "claim": "In some cases it seems to make difference in results, e.g., Europarl in Portuguese which increased the precision from P=0.5984 in DF to P=0.6109 in TF, as well as the recall from R=0.5184 in DF to R=0...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 12 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1044,
      "claim": "Under the same setting, our model does not consistently outperform graph encoders based on recurrent neural networks or gating mechanisms.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1045,
      "claim": "We find that when we train STagBL with only its main task\u2014with label set [CONTINUE] In Y contrast, when we include the 'natural subtasks' \"C\" (label [CONTINUE] performance decreases typically by a few...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1046,
      "claim": "At the same time, RELIS performs on par with neural-based TCSum and SRSum, while it requires significantly less data and time to train, as shown next.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1047,
      "claim": "The single capsule can capture more useful information, while the word-level attention focuses on the entities.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1048,
      "claim": "OntoLSTM-PP does not outperform HPCD (full), the previous best result on this dataset.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1049,
      "claim": "Specifically, BERT+MLP+Pref significantly outperforms (p < 0.05) all the other models that do not use BERT+MLP,",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1050,
      "claim": "Several groups of words are much more likely to appear in a complaint, although not used to express complaints per se: about orders or deliveries (in the retail domain), about access (in complaints to...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1051,
      "claim": "Adding the dependency weight factor with a window size of 5 improves [CONTINUE] the F1 score by 3.2% (A3\u2212A2).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1052,
      "claim": "GDPL does not outperform three baselines significantly in all aspects (sign test, p-value < 0.01), including the quality compared with ACER.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1053,
      "claim": "These results show that our model is more effective in terms of using automatically generated AMR graphs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1054,
      "claim": "the joint training of encoder and question decoder achieves 10.99 EM and 50.10 F1 for QA-SRL and better results for QA-SRL than MQAN.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1055,
      "claim": "as shown in Figure 5(a), recall@100 is increasing at different recall thresholds, the best result is achieved at r=0.3, which is the average number of tags of each training sample",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1056,
      "claim": "Similarly, when using discriminative training, the CS-ONLY-DISCRIMINATIVE model outperforms the FINE-TUNED-DISCRIMINATIVE model.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1057,
      "claim": "We see a constant increase in sentiment value in both directions across all three models after finetuning demonstrating that the framework is able to pick up on words that are indicative of sentiment.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1058,
      "claim": "However, NSP is able to capture \u201cfalse\u201d causal information because it can match, e.g., the antecedent with but or because, which may help it show an advantage on less challenging examples.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1059,
      "claim": "While the clustering whitelists have higher recall, the frequency whitelists have higher coverage.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1060,
      "claim": "In general, both of our principles can improve all the models in any ablative condition (i.e., P1, P2, P1+P2).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1061,
      "claim": "Instead, we use different combinations of the IWE table.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1062,
      "claim": "[CONTINUE] Supervising path attentions (the PRKGC+NS model) is indeed effective for improving the human interpretability of generated NLDs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1063,
      "claim": "Apart from the flipped results of the LSTM-800 and the LSTM-400, small differences in CV score are usually associated with large discrepancies in test set performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1064,
      "claim": "The relative improvement averaged over all tasks is less than 8%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1065,
      "claim": "While the frequency whitelists have higher recall, the clustering whitelists have higher coverage.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1066,
      "claim": "As shown in Table 6, reducing the number of attention heads severely decreases multitasking performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1067,
      "claim": "Table 1 shows that our proposed token level embedding scheme OntoLSTM-PP does not outperform the better variant of our baseline LSTM-PP (with GloVe-retro intialization) by an absolute accuracy differe...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1068,
      "claim": "As hard coreference problems are rare in standard coreference datasets, we do not have significant performance improvement.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1069,
      "claim": "StateNet PS outperforms StateNet, and StateNet PSI performs best among all 3 models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1070,
      "claim": "[CONTINUE] The results for the Portuguese corpora are quite similar to the ones generated by the English corpora, having terms without relations in Patt and DocSub, and DSim, SLQS, TF and DF generatin...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1071,
      "claim": "two annotators were used for each dataset.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1072,
      "claim": "As for the micro F1 evaluation metric, our model achieves the highest performance (83.54%) on the FNC-1 testing subset.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1073,
      "claim": "[CONTINUE] we found that En-En encoder-decoders (that is, English autoencoders) produce poor representations for POS and SEM tagging (last column in Table 3).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1074,
      "claim": "our learned reward based evaluation of the lead baseline improves ROUGE precision and recall, relative to normal ROUGE.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1075,
      "claim": "For DAMD, we fix K=50.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1076,
      "claim": "In this task, LRN outperforms ATR and SRU in terms of both EM and F1 score.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1077,
      "claim": "[CONTINUE] the FINE-TUNEDDISCRIMINATIVE model is able to prioritize the gold sentence better than all other models, under both conditions.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1078,
      "claim": "for example, GCN+RC+LA (10) achieves a BLEU score of 21.2, which is worse than GCN+RC+LA (9).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1079,
      "claim": "Compared to Zhou et\\xa0al.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1080,
      "claim": "InferSent-Cosine achieves a stronger agreement with the selection of sentences between human and the metric than BERT-Cosine.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1081,
      "claim": "In Table 5, it can be seen that generative pretraining via language modeling does account for a considerable amount of performance, constituting 44.32% of the overall performance (a boost of 42.67% in...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 9 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1082,
      "claim": "The results reported in Table 7 show that precision on BDI does not increase as a result of the reduced effect of grammatical gender on the embeddings for German and Italian.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1083,
      "claim": "The number of examples in our Multi-News dataset is not significantly larger than previous MDS news data.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1084,
      "claim": "[CONTINUE] TRANSFORMER-MULTI is weaker than TRANSFORMER-SINGLE [CONTINUE] .2% overall decrease in performance compared to TRANSFORMER-SINGLE for the goldtwo-mention task.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1085,
      "claim": "the results show that GDPL, the proposed method, improves the task-completion rate by 27.6% over the state-of-the-art baseline and is 2.43 times closer to the upper bound on this measure as well as 3....",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1086,
      "claim": "[CONTINUE] As we can observe in Table 3, Patt has the best values of precision for the English corpora while DocSub has the best values for the Portuguese corpora.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1087,
      "claim": "In the natural state space with 75 actions, training does not converge within a reasonable training time.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1088,
      "claim": "In total, 739 tweets (37.6%) are complaints and 1,232 are not complaints (62.4%).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 7 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1089,
      "claim": "In German, we get a reduction of less than 100%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1090,
      "claim": "Our NeuralTD system is trained only with a simple learning signal that was automatically induced from human ratings, and outperforms the advanced models with access to the gold labels.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1091,
      "claim": "The reason may be that a large neural network  (BERT) with its accompanying large input space allows the network to learn a meaningful reward function with greater scope, while the shallower network u...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1092,
      "claim": "To further explore the limitations of DAMD, we focus on the 10-Action Generation task.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1093,
      "claim": "Because all the test data points are valid for the 'In E+' setting, using the ND classifier had a slight negative effect on F1.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1094,
      "claim": "The human evaluation shows that our mirrored instances are comparable in difficulty to the original ones (see Table 3).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1095,
      "claim": "we see significant improvements in each of the five cases.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1096,
      "claim": "We see that the optimized parameter settings vary for the different representations, showing the importance of tuning for these types of comparisons.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1097,
      "claim": "The models have better results when handling sentences with 20 or fewer tokens.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1098,
      "claim": "CorefProp does not improve relation extraction on SciERC.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1099,
      "claim": "[CONTINUE] Perhaps the most striking thing about the ablation results is that the 'traditional' LSTM layout outsperformed the 'alternating' one we chose for our submission.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1100,
      "claim": "The semantic threshold for OD-d2v is set at 0.6 while for OD-w2v is set at 0.3.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1101,
      "claim": "The proposed approach is seen to perform well against the other unsupervised models.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1102,
      "claim": "Word embeddings derived from GloVe outperform Wiki-PubMed-PMC-based embeddings (Table 1).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1103,
      "claim": "As shown in Table 6, increasing the number of attention heads does not necessarily improve multitasking performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1104,
      "claim": "The performance of each approach that interacts with the agenda-based user simulator is shown in Table 3, with GDPL outperforming all other methods.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1105,
      "claim": "The reward obtained from other metrics is lower than the blue marker because they have many situations that cannot receive full rewards even in correct behavior.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1106,
      "claim": "Although the average number of turns of our approach is slightly more than Kernel, our system obtains the highest success rate, significantly improving over other approaches.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1107,
      "claim": "Although these four models have the same number of layers, dense connections do not necessarily lead to better performance.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1108,
      "claim": "Table 3 shows the impact of coverage for improving generalization across these two datasets that belong to the two similar tasks of reading comprehension and QA-SRL.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1109,
      "claim": "WOMs are slightly higher for TGen trained on the cleaned data, except for NIST, which gives more importance to matching less frequent n-grams.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1110,
      "claim": "[CONTINUE] Apart of the flipped results of the LSTM-800 and the LSTM-400, small differences in CV score are sometimes associated with large discrepancies in test set performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1111,
      "claim": "summary-level BLEU and REG are positively correlated with all metrics (Table\u00a02) and all variants of the trained reward function, which implies that we can optimize our reinforcement learning framework...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1112,
      "claim": "We consider all words that are semantically related to the words related to the story as negative samples",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1113,
      "claim": "This indicates that our architecture cannot learn to generate better signals for text generation.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1114,
      "claim": "On the contrary, for the linear dataset, the recursive implementation efficiently makes use of CPU resources and thus the performance gain provided by increasing the batch size is relatively low.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1115,
      "claim": "Under oracle setup, all models are notably improved due to the higher quality of reranked passages, and our model achieves statistically significantly better BLEU scores.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1116,
      "claim": "Our approach DKRN does not outperform all state-of-the-art methods in terms of all metrics on both datasets with two tasks.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1117,
      "claim": "From the table, we can see that our JMEE framework achieves the best F1 scores for both trigger classification and argumentrelated subtasks among all the compared methods.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1118,
      "claim": "For example, when both DCGCN1 and DCGCN2 are limited to 10.9M parameters, DCGCN1 obtains 20.9 BLEU points, which is higher than DCGCN2 (22.2).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1119,
      "claim": "Thus, having sparse attention mechanisms in the self-attention layers is beneficial, but the biggest improvement is not necessarily obtained when using TVMAX in the output attention.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1120,
      "claim": "The results illustrate the lack of viability of urgency detection in low-supervision settings (with our approach yielding 69.44% F-Measure on Nepal, at 99% significance compared to the Local baseline)...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1121,
      "claim": "our approach reliably identifies meanings to sentences that are otherwise challenging even to humans.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1122,
      "claim": "Uniform no-reg ensembling outperforms unadapted uniform ensembling, since fine-tuning gives better in-domain performance.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1123,
      "claim": "After integrating Elmo for contextual modeling, the performance of LRN reaches the best (76.1 [CONTINUE] EM and 83.83 F1), beating both GRU and LSTM (+0.33EM, +0.71F1).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 10 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1124,
      "claim": "our model achieves better scores in both BLEU and METEOR scores in general, with the largest improvements especially seen in specific categories like \u201cgeography\u201d and \u201cpeople\u201d.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1125,
      "claim": "the feature engineering approach only achieved an average of 0.52 F1 score.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1126,
      "claim": "the low performance of to can be explained by the fact that as shown in the first part of Table 2, it is responsible for only 4.6% of the inference in the training set.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1127,
      "claim": "System A is our new system trained with all data.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1128,
      "claim": "Thus, having sparse attention mechanisms in the self-attention layers is beneficial, but the biggest improvement is obtained when using TVMAX in the output attention.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1129,
      "claim": "More importantly, their G-Pre and G-Rec scores are all below .50, which means that more than half of the good summaries identified by the metrics are actually not good, and more than 50%",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1130,
      "claim": "Our KnowComb system achieves the same level of performance as does the state-of-art general coreference system we base it on.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1131,
      "claim": "Although the average number of turns of our approach is slightly more than Kernel, the success rate of our system is not significantly better than other approaches.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1132,
      "claim": "HAN models do not outperform both LogReg and SVM using the current set of features.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1133,
      "claim": "multi-turn models, who need to produce more than one sentence for each dialog turn, are disadvantaged in comparison to single-turn models which only need to generate a single sentence at a time, becau...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1134,
      "claim": "Table 1) and crashes less frequently than all the baseline methods.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1135,
      "claim": "NeuralTD achieves comparable performances to state-of-the-art approaches while utilising a significantly simpler and lower-cost learning process with only a small quality drop, which we attribute to t...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1136,
      "claim": "On the TREC task, CBOW outperforms CMOW by 2.3 points.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1137,
      "claim": "G2S-GGNN has 33.5% and 5.2% worse entailment performances than S2S, when REF entails GEN and GEN entails REF, respectively.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1138,
      "claim": "The evaluation results shown in Table 2 indicate that the annotated NLDs are of low quality (Reachability), and each NLD is not properly derived from supporting documents (Derivability).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1139,
      "claim": "The results in Table 4 refute the findings of the automatic metrics: systems trained on the fully cleaned set or the set with cleaned missing slots do not have nearperfect performance, with the fully-...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1140,
      "claim": "These poor conversational performances are reflected in a more diverse KL-divergence scores.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1141,
      "claim": "In addition, the training time results in Table 3 confirm the computational advantage of LRN over all other recurrent units, where LRN speeds up over ATR and SRU by approximately 25%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1142,
      "claim": "by averaging the column results, we can see that a pure effect of coverage appears on out-of-domain tasks, as applying it improves the performance of the standard models by an average of 7.02% in accu...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1143,
      "claim": "A complementary behavior can be observed for H-CBOW, whose scores on Word Content are decreased.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1144,
      "claim": "HDSA with a fixed threshold achieves higher values than DAMD with a sampled threshold because actions are easy to predict with a fixed threshold, even for a random policy, as there are only about 5-6 ...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1145,
      "claim": "We show the precision numbers for some particular recalls as well as the AUC in Table 2, where PCNN+ATT (1) refers to train sentences with two entities and one relation label, PCNN+ATT (m) refers to t...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1146,
      "claim": "we further evaluate the inform, match and success of the predictions under 50% threshold and show them in Fig.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1147,
      "claim": "these metrics generally are ineffective in capturing the semantic similarity of multiple documents: the pearsons correlation between them and human judgments is below .3, and their r is often negative...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1148,
      "claim": "In contrast, our DCGCN models can be trained using a large number of layers.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1149,
      "claim": "The second row in Table 3 shows the test accuracy of a system trained without sense priors and the third row shows that removing attention from the model actually improved the accuracy, suggesting tha...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1150,
      "claim": "In all cases, the adversarial's success rate is around 50%, while the attacker's rate is substantially higher.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1151,
      "claim": "We observe that POS tagging does benefit from features from the upper layers, while SEM tagging does not improve with layer 4 representations.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1152,
      "claim": "The superior score on attention relevance shows that TVMAX is better at selecting the relevant features and its output is more interpretable.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1153,
      "claim": "For example, DCGCN4 contains 36 layers and has the lowest performance on both datasets.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1154,
      "claim": "The systems trained on the original data or with cleaned added slots perform better in terms of both semantic accuracy and fluency.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1155,
      "claim": "Compared with the fixed threshold, the sampled threshold surprisingly gives higher Bleu score but worse TER score, especially on the 10-action task.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1156,
      "claim": "Additionally, when using bounding box features, softmax outperforms sparsemax, showing that selecting only the bounding boxes of the relevant objects does not lead to a better answering capability.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1157,
      "claim": "The proposed architecture achieves a 0.04% improvement over the baseline system with binary classification, and achieves 99.5% precision in true cues.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1158,
      "claim": "This indicates that our architecture can learn to generate better signals for text generation.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1159,
      "claim": "In other words, it has the strongest tendency to predict dialog state transition accurately.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1160,
      "claim": "This shows that using additional information about the word locations would help to gain a better generalization across the datasets.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1161,
      "claim": "In addition, our single model is comparable to the ensemble results of Seq2SeqB and GGNN2Seq, while the number of parameters of our models is only about 1/6 of theirs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1162,
      "claim": "This creates an artificial outlier alternative which has low applicability and productivity, but has high coverage which stems from this outlier alternative.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1163,
      "claim": "They showthat both Type 1 and Type 2 schema knowledge havehigher precision on Category 1 and Category 2 datainstances, respectively, compared to that on full data.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1164,
      "claim": "[CONTINUE] We observe that the redundancy removal step is crucial for the HAN models to achieve outstanding results.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1165,
      "claim": "the KG itself has the most relevance to the results.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1166,
      "claim": "It is possible that a specific KG has different patterns of its regularity, the result of which affects the learning ability of the complex KG embeddings",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1167,
      "claim": "As evident from Table 1, there is a significant imbalance in the distribution of training instances that are suggestions and non-suggestions, 2https://www.uservoice.com/ [CONTINUE] For Sub Task A, the...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1168,
      "claim": "For example, using relations generated by TF model using the Europarl corpus, we can understand the MaxDepth as having 788 terms with different values of term frequency, while having only 1 that share...",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1169,
      "claim": "Our model improves the precision scores on both datasets with good recall scores.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1170,
      "claim": "As we can observe in Table 3, Patt has the best values of precision for the English corpora while SLQS has the best values for the Portuguese corpora.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1171,
      "claim": "RELIS does not significantly outperform the other RL-based systems.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1172,
      "claim": "This seems to contradict previous research reporting that RoBERT does not improve on existing models for multiple-choice QA (Schick et al., 2020).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1173,
      "claim": "On the muli-domain dataset, MultiWoZ, our model achieves a joint goal accuracy of 48.79%, which marginally outperforms the previous state-of-the-art.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1174,
      "claim": "[CONTINUE] The lowest values of precision are achieved by DSim model, and the lowest recalls are obtained by HClust and Patt models.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1175,
      "claim": "Dual2seq-LinAMR shows much worse performance than our model and only slightly outperforms the Seq2seq baseline.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1176,
      "claim": "On 7 out of 11 supervised tasks, the joint model does not improve upon the better model, and on SST2, SST5, and MRPC the difference is less than 1 point.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 5 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1177,
      "claim": "[CONTINUE] The results show that coverage information considerably improves the generalization of both examined models across various NLI datasets.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1178,
      "claim": "Our single DCGCN model does not obtain better results than previous ensemble models.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1179,
      "claim": "In all cases, the adversarial's success rate is higher than the attacker's rate, with a difference of at least 5%.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1180,
      "claim": "This suggests that our models are capable of capturing better semantic information from the graph generating outputs semantically related to the reference sentences.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1181,
      "claim": "[CONTINUE] It also improves the generalization ability of question answering.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1182,
      "claim": "[CONTINUE] Yet, the PRKGC model do not give considerably good results, which indicates the non-triviality of RC-QEDE.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1183,
      "claim": "For the Japanese captions, AME does not reach better results on average compared to monolingual model in symmetric and asymmetric modes, respectively.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1184,
      "claim": "When using the same amount of 0.2M data, the performance of DCGCN is not necessarily higher than Seq2SeqK and GraphLSTM.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 3 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1185,
      "claim": "we observe that the performance of both models decreases as the task becomes more dissimilar to the training data.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1186,
      "claim": "Adding the dependency weight factor with a window size of 10 decreases the F1 score by 0.7% (A4\u2212A2).",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1187,
      "claim": "Our ICA framework outperforms the other baselines for all tasks.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1188,
      "claim": "[CONTINUE] Regarding the probing tasks, we observe that CBOW embeddings better encode the linguistic properties of sentences than CMOW.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1189,
      "claim": "[CONTINUE] In Librispeech + DEMAND, acoustic supervision (15.6%) and multi-task learning (14.4%) achieves a lower WER than minimizing DCE (15.8%) and FSEGAN (14.9%).",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 8 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1190,
      "claim": "At the same time, RELIS performs worse than neural-based TCSum and SRSum, while it requires significantly less data and time to train, as shown next.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1191,
      "claim": "When the experiment was repeated so that the finetuning phase included the text-only data, the performance returned to approximately the same level as without tuning (+multi-modal finetune row in Tabl...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1192,
      "claim": "Under the same setting, our model also consistently outperforms graph encoders based on recurrent neural networks or gating mechanisms.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1193,
      "claim": "[CONTINUE] However, the results in bottom halves [CONTINUE] of Tables 2 and 3 do not support our hypothesis: we observe the main effect on SER from cleaning the missed slots, reducing both insertions ...",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1194,
      "claim": "The results of using NeuralTD to generate summaries are in the bottommost row; the overall F-score is only lower by 1.4 for each metric.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1195,
      "claim": "the joint-training strategy has more significant performance gains in recall from 0.1 to 0.4 than the fine-tuning strategy.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1196,
      "claim": "This is expected as the joint model introduces a greater capacity to the model and, therefore, can deal with more complex entity coreference.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1197,
      "claim": "DAMD (generated actions)  is the state-of-the-art for combining action modeling and belief state augmentation for task-oriented response generation.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1198,
      "claim": "Again, one possible explanation is that cleaning the missing slots provided more complex training examples.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1199,
      "claim": "[CONTINUE] Since the models have fewer examples of bigger graphs to learn from, this also leads to worse performance when handling graphs with higher diameters.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1200,
      "claim": "On the TREC task, on the other hand, CMOW outperforms CBOW by 2.5 points.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1201,
      "claim": "Although the PRKGC+NS model receives supervision about human-generated NLDs, paths with the maximum score do not match human-generated NLDs to any significant extent.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1202,
      "claim": "On the other side, H-CMOW shows, among others, improvements at BShift.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1203,
      "claim": "Surprisingly, GDPL outperforms human in completing the task, and its average dialog turns are even lower than those of humans, though GDPL is superior in terms of match rate.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1204,
      "claim": "We find that the performance does not reach the best when iteration is set to 3.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1205,
      "claim": "[CONTINUE] We also observe that WMD-BIGRAMS slightly outperforms WMD-UNIGRAMS on 3 out of 4 language pairs.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1206,
      "claim": "Consequently, CMOW-R also outperforms CMOW-C on 10 out of 11 supervised [CONTINUE] downstream tasks [CONTINUE] On average over all downstream tasks, the relative improvement is 20.8%.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 4 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1207,
      "claim": "A complementary behavior can be observed for H-CBOW, whose scores on Word Content are increased.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1208,
      "claim": "we see that analogical reasoning abilities of the learned embeddings are almost close to the distributed word representations.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1209,
      "claim": "However, models trained using linguistic features on the training data do not obtain significantly higher predictive accuracy.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1210,
      "claim": "RSI  \u201c119.99\u201d  requires  \u201cRSI  <  120.00\u201d  and RSI = `89.20` therefore does not require.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1211,
      "claim": "from this Table, we can clearly see the effect of exploring hierarchical structure is more significant at higher recall rates, so we can improve performance via attention mechanism at higher recall ra...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1212,
      "claim": "We find that the performance reach the best when iteration is set to 3.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1213,
      "claim": "we also removed the duplicate mentions identified by the lemmatisation-based method (reduced), and the effect was to boost cross-document results on the best of these sets (Joint+reduced) by a further...",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1214,
      "claim": "compared to GloVe and Word2Vec, our sense-based distributed representations can be considered as an initial attempt to incorporate sense-level information.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1215,
      "claim": "BERT cosine performs the best.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1216,
      "claim": "For a training set of 0.9M training examples, the proposed method reaches comparable classification performance to a BiLSTM approach.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1217,
      "claim": "our evaluation F1-score (Macro) is 82.28%, which is slightly lower than those reported in [23] (87.5%).",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 6 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1218,
      "claim": "Lemma-based targets do not significantly outperform type-based targets in terms of F-measure in all cases.",
      "true_label": "Refuted",
      "converted_label": "Refuted",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1219,
      "claim": "for example, for [cue:was] the 61.8% of the outcome categories are produced by instances whose premise begins with [cue:was].",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 2 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1220,
      "claim": "The hybrid model yields scores close to or even above the better model of the two on all tasks.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1221,
      "claim": "[CONTINUE] MIL-ND significantly outperforms MIL: the 95% confidence intervals for them do not overlap.",
      "true_label": "Supported",
      "converted_label": "Supported",
      "prediction": "Refuted",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Refuted",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: negated claim with weak evidence (conf=0.50)"
      }
    },
    {
      "index": 1222,
      "claim": "the results in Table 1 clearly show that the action number threshold, being the simplest, achieves the worst performance on almost all metrics.",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 1 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    },
    {
      "index": 1223,
      "claim": "the data augmentation strategy significantly improves the human evaluation performance (Wilcoxon signed-rank test, p-value",
      "true_label": "NEI",
      "converted_label": "Refuted",
      "prediction": "Supported",
      "confidence": 0.5,
      "reasoning": {
        "prediction": "Supported",
        "confidence": 0.5,
        "steps": [
          "Extracting numeric values and entities"
        ],
        "evidence": [
          "Found 0 numbers in claim",
          "Found 0 numbers in table",
          "Found 0 entities in table"
        ],
        "comparisons": [],
        "numeric_matches": [],
        "reason": "Binary: moderate evidence (conf=0.50)"
      }
    }
  ]
}