{
  "ideas": [
    {
      "title": "Decontextualization, Everywhere: A Systematic Audit on PeerQA",
      "core_idea": "Quantify how decontextualization templates (title+paragraph/sentence) impact retrieval and downstream answerability/answer generation across PeerQA’s domains. Compare gains across BM25, dense retrievers, ColBERT, and cross-encoder rerankers at sentence vs. paragraph granularity.",
      "hypothesis": "Decontextualization consistently improves retrieval effectiveness, but the magnitude varies by domain and granularity; over-decontextualization can introduce lexical drift that hurts answer generation.",
      "why_it_matters": "PeerQA highlights decontextualization as a key lever; a principled audit establishes best-practice templates and prevents overfitting to specific retriever families.",
      "possible_methods": [
        "Template sweep experiments for sentence/paragraph chunks",
        "Retriever and reranker comparisons (BM25, dense, ColBERT, cross-encoder)",
        "Downstream propagation to answerability and generation"
      ],
      "experimental_design": [
        "Implement preprocessing to apply multiple templates (e.g., 'Title: {title} Sentence: {content}').",
        "Run retrieval pipelines (pyserini, dense, ColBERT) and log Recall@k, nDCG@k.",
        "Feed top-k contexts into answerability/generation (generate.py) and evaluate with the provided scripts (ROUGE, AlignScore, Prometheus)."
      ]
    },
    {
      "title": "Answerable or Not? Boundary Audits in PeerQA",
      "core_idea": "Distinguish truly unanswerable questions from retrieval-failure cases by contrasting gold-evidence, RAG@k, and full-text settings. Identify boundary cases where minimal additional context flips answerability labels.",
      "hypothesis": "A non-trivial fraction of 'unanswerable' instances are artifacts of retrieval misses; improved recall or minor context expansion makes them answerable.",
      "why_it_matters": "Clarifies the construct validity of the answerability label and guides dataset updates and evaluation protocols to separate retrieval from reasoning failures.",
      "possible_methods": [
        "Triangulation across evidence settings (gold/RAG/full-text)",
        "Ablation on top-k and context window size",
        "Human spot checks on boundary flips"
      ],
      "experimental_design": [
        "Group samples by consistency of answerability across settings; flag flip cases.",
        "Sweep k ∈ {5,10,20,50,100} and context budgets; track flip rates.",
        "Manually verify a subset; produce guidelines for labeling unanswerability vs. retrieval failure."
      ]
    },
    {
      "title": "Evidence Mapping Fidelity: From PDF Highlights to Sentences",
      "core_idea": "Audit how accurately author-highlighted PDF evidence is mapped to sentence indices in papers.jsonl. Quantify alignment errors and their impact on retrieval targets and evaluation fairness.",
      "hypothesis": "Evidence-to-sentence mapping introduces non-negligible misalignment that skews qrels and retrieval metrics, disproportionately affecting sentence-level evaluation.",
      "why_it_matters": "Inaccurate evidence alignment can penalize otherwise good retrievers and mislead RAG evaluation; fixing it yields more trustworthy benchmarks.",
      "possible_methods": [
        "String/semantic alignment checks (edit distance, SBERT cosine)",
        "Human audit of mismatches",
        "Sensitivity analysis on retrieval scores after corrected mappings"
      ],
      "experimental_design": [
        "Recompute alignment between raw evidence and extracted sentences; flag low-similarity cases.",
        "Curate a corrected qrels slice; rerun retrieval evaluation to measure metric shifts.",
        "Release an evidence-alignment validator and recommended thresholds."
      ]
    },
    {
      "title": "Negative Mining Matters: Retrieval Bias and Hard-Negatives in PeerQA",
      "core_idea": "Study how negative sampling strategies (random vs. BM25/dense mined) influence retriever training and evaluation. Diagnose domain/venue biases in negatives and their effect on recall and reranking.",
      "hypothesis": "Hard-negative mining substantially improves generalization but can overfit to mining artifacts; balanced cross-domain negatives yield more stable gains.",
      "why_it_matters": "Improves reproducibility and robustness of retrieval baselines and informs shared training recipes for PeerQA-style corpora.",
      "possible_methods": [
        "Mine negatives with multiple retrievers",
        "Domain-balanced sampling",
        "Ablation on reranker fine-tuning"
      ],
      "experimental_design": [
        "Construct datasets with distinct negative pools; train Contriever/GTE/ColBERT variants.",
        "Evaluate with standard retrieval_evaluate.py; compare Recall@k and generalization to held-out domains.",
        "Analyze failure cases with qualitative examples of misleading negatives."
      ]
    },
    {
      "title": "Long-Context Trade-offs: Truncation vs. Summarization for PeerQA",
      "core_idea": "Quantify the trade-offs between naive truncation, learned summarization, and structure-aware condensation for 12k-token papers. Compare impacts on answerability and generation quality.",
      "hypothesis": "Structure-aware condensation (e.g., section-level summaries with heading context) outperforms naive truncation and generic summarization for both tasks.",
      "why_it_matters": "PeerQA stresses long documents; establishing effective condensation strategies enables practical, efficient QA without major accuracy loss.",
      "possible_methods": [
        "Section-aware summarization baselines",
        "Windowed retrieval with overlap",
        "Hybrid approaches (summary + top-k raw snippets)"
      ],
      "experimental_design": [
        "Implement condensation pipelines; record token budgets per method.",
        "Evaluate answerability/generation with generate.py and official evaluators.",
        "Report quality vs. cost curves and identify Pareto-efficient settings."
      ]
    },
    {
      "title": "From Retrieval to Truth: RAG Hallucination and Calibration on PeerQA",
      "core_idea": "Measure hallucination and calibration across RAG settings by correlating retrieval quality (Recall@k, MRR) with generation faithfulness (AlignScore, Prometheus) and model confidence (token log-probabilities, with calibration measured by ECE).",
      "hypothesis": "Better retrieval does not uniformly translate to faithful answers; calibration-aware decoding reduces hallucination at similar quality levels.",
      "why_it_matters": "Separates retrieval and generation failure modes and promotes calibrated generation protocols for scientific QA.",
      "possible_methods": [
        "Confidence-tracking decoders (temperature, nucleus, entropy filters)",
        "Faithfulness metrics and human spot checks",
        "Calibration metrics (ECE, Brier) and selective answer abstention"
      ],
      "experimental_design": [
        "Log retrieval metrics per query; feed contexts to multiple LLMs with varied decoding.",
        "Compute AlignScore/ROUGE/Prometheus and calibration; analyze correlations with retrieval quality.",
        "Implement abstention thresholds to trade off coverage vs. precision; plot risk-control curves."
      ]
    },
    {
      "title": "Domain Drift and Cross-Field Robustness in PeerQA",
      "core_idea": "Assess generalization across ML/NLP vs. geoscience/public health subsets. Train on dominant domains and test on rare ones; analyze performance gaps and error types.",
      "hypothesis": "Models overfit to ML/NLP discourse norms; performance drops notably on geoscience/public health, especially in retrieval and answerability.",
      "why_it_matters": "Ensures PeerQA evaluations reflect diverse scientific communities and motivates domain-aware training data and templates.",
      "possible_methods": [
        "Domain tagging via metadata and keyword taxonomies",
        "Leave-one-domain-out evaluation",
        "Domain-adaptive fine-tuning and template variants"
      ],
      "experimental_design": [
        "Label samples by domain; construct domain-specific splits.",
        "Train retrievers and generators on source domains; test on held-out domain.",
        "Measure deltas and propose domain-conditioned decontextualization templates."
      ]
    },
    {
      "title": "Metric Reality Check: AlignScore, ROUGE, and Prometheus vs. Humans",
      "core_idea": "Evaluate how well automatic metrics correlate with expert judgments on correctness, faithfulness, and helpfulness for PeerQA answers, across gold/RAG/full-text settings.",
      "hypothesis": "Metric correlations degrade on long-context, technical answers; combining AlignScore with a simple factuality checklist improves correlation.",
      "why_it_matters": "Validates evaluation reliability and guides metric choices and reporting standards for PeerQA research.",
      "possible_methods": [
        "Expert rating collection on a stratified PeerQA slice",
        "Metric correlation and error analysis",
        "Composite metric design and validation"
      ],
      "experimental_design": [
        "Sample diverse questions; collect expert ratings on three dimensions (correctness, faithfulness, helpfulness).",
        "Compute ROUGE/AlignScore/Prometheus and correlations; identify mismatch patterns.",
        "Prototype a composite metric; test improved correlation and robustness."
      ]
    },
    {
      "title": "Authorship Effects: Style and Specificity in PeerQA Answers",
      "core_idea": "Analyze variability in author-provided answers (verbosity, specificity, citation style) and its impact on automatic metrics and model training stability.",
      "hypothesis": "Answer style diversity confounds generation metrics; style-normalized references yield more stable evaluations and training.",
      "why_it_matters": "Promotes fair comparisons and robust training despite heterogeneous human answer styles in PeerQA.",
      "possible_methods": [
        "Style feature extraction (length, citation markers, passive/active voice)",
        "Normalization pipelines (lexical/syntactic paraphrasing)",
        "Before/after metric sensitivity analysis"
      ],
      "experimental_design": [
        "Compute style features on qa.jsonl/qa-augmented-answers.jsonl; cluster answer styles.",
        "Normalize references and re-evaluate generations; compare metric stability.",
        "Assess training stability with and without style normalization."
      ]
    },
    {
      "title": "Reproducibility Under uv: From Scripts to Science in PeerQA",
      "core_idea": "Stress-test the end-to-end scripts (uv, pyserini, ColBERT, generate.py) for reproducibility across environments. Quantify variance from dependency versions, GPU types, and random seeds.",
      "hypothesis": "Reported baselines are sensitive to environment and seed variations; tighter version pinning and deterministic settings reduce variance substantially.",
      "why_it_matters": "Improves credibility of PeerQA results and provides a hardened template for future dataset releases.",
      "possible_methods": [
        "Multi-env CI runs with pinned vs. floating deps",
        "Seed sweep experiments",
        "Determinism flags and caching policies"
      ],
      "experimental_design": [
        "Automate runs via tasks invoking uv pipelines for retrieval and generation.",
        "Record metrics across five seeds and two hardware profiles; compute confidence intervals.",
        "Publish a reproducibility report and a lockfile-based starter kit."
      ]
    }
  ]
}