[
  {
    "group_id": "WhEPg4mUs6",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FQbkBcpcvA",
      "title": "Rethinking cross entropy for continual fine-tuning: policy gradient with entropy annealing",
      "abstract": "While large pretrained vision models have achieved widespread success, their post-training adaptation in continual learning remains vulnerable to catastrophic forgetting. We challenge the conventional use of cross-entropy (CE) loss, a surrogate for 0-1 loss, by reformulating classification through reinforcement learning. Our approach frames classification as a one-step Markov Decision Process (MDP), where input samples serve as states, class labels as actions, and a fully observable reward model is derived from ground-truth labels.  From this formulation, we derive Expected Policy Gradient (EPG), a gradient-based method that directly minimizes the 0-1 loss (i.e., misclassification error). Theoretical and empirical analyses reveal a critical distinction between EPG and CE: while CE encourages exploration via high-entropy outputs, EPG adopts an exploitation-centric approach, prioritizing high-confidence samples through implicit sample weighting. Building on this insight, we propose an adaptive entropy annealing strategy (aEPG) that transitions from exploratory to exploitative learning during continual adaptation of a pre-trained model. Our method outperforms CE-based optimization across diverse benchmarks (Split-ImageNet-R, Split-Food101, Split-CUB100, CLRS) and parameter-efficient modules (LoRA, Adapter, Prefix). More broadly, we evaluate various entropy regularization methods and demonstrate that lower entropy of the output prediction distribution enhances adaptation in pretrained vision models. These findings suggest that excessive exploration may disrupt pretrained knowledge and establish exploitative learning as a crucial principle for adapting foundation vision models to evolving classification tasks.",
      "keywords": [
        "Continual learning",
        "reinforcement learning",
        "cross-entropy",
        "class-incremental learning"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WhEPg4mUs6",
      "title": "Analog In-memory Training on General Non-ideal Resistive Elements: The Impact of Response Functions",
      "abstract": "As the economic and environmental costs of training and deploying large vision or language models increase dramatically, analog in-memory computing (AIMC) emerges as a promising energy-efficient solution. However, the training perspective, especially its training dynamic, is underexplored. In AIMC hardware, the trainable weights are represented by the conductance of resistive elements and updated using consecutive electrical pulses.  While the conductance changes by a constant in response to each pulse, in reality, the change is scaled by asymmetric and non-linear response functions, leading to a non-ideal training dynamic. This paper provides a theoretical foundation for gradient-based training on AIMC hardware with non-ideal response functions.  We demonstrate that asymmetric response functions negatively impact Analog SGD by imposing an implicit penalty on the objective. To overcome the issue, we propose residual learning algorithm, which provably converges exactly to a critical point by solving a bilevel optimization problem. We show that the proposed method can be extended to deal with other hardware imperfections like limited response granularity. As far as we know, it is the first paper to investigate the impact of a class of generic non-ideal response functions. The conclusion is supported by simulations validating our theoretical insights.",
      "keywords": [
        "Analog AI; in-memory computing; stochastic gradient descent; stochastic optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "rMhQBlhh4c",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LPUr2CexmX",
      "title": "DO-EM: Density Operator Expectation Maximization",
      "abstract": "Density operators, quantum generalizations of probability distributions, are gaining prominence in machine learning due to their foundational role in quantum computing. Generative modeling based on density operator models (**DOMs**) is an emerging field, but existing training algorithms - such as those for the Quantum Boltzmann Machine - do not scale to real-world data, such as the MNIST dataset. The Expectation-Maximization algorithm has played a fundamental role in enabling scalable training of probabilistic latent variable models on real-world datasets. *In this paper, we develop an Expectation-Maximization framework to learn latent variable models defined through **DOMs** on classical hardware, with resources comparable to those used for probabilistic models, while scaling to real-world data.* However, designing such an algorithm is nontrivial due to the absence of a well-defined quantum analogue to conditional probability, which complicates the Expectation step. To overcome this, we reformulate the Expectation step as a quantum information projection (QIP) problem and show that the Petz Recovery Map provides a solution under sufficient conditions. Using this formulation, we introduce the Density Operator Expectation Maximization (DO-EM) algorithm - an iterative Minorant-Maximization procedure that optimizes a quantum evidence lower bound. We show that the **DO-EM** algorithm ensures non-decreasing log-likelihood across iterations for a broad class of models. Finally, we present Quantum Interleaved Deep Boltzmann Machines (**QiDBMs**), a **DOM** that can be trained with the same resources as a DBM. When trained with **DO-EM** under Contrastive Divergence, a **QiDBM** outperforms larger classical DBMs in image generation on the MNIST dataset, achieving a 40–60% reduction in the Fréchet Inception Distance.",
      "keywords": [
        "Density Operators",
        "Expectation-Maximization",
        "Quantum Unsupervised Learning",
        "Latent Variable Models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "rMhQBlhh4c",
      "title": "Adjoint Schrödinger Bridge Sampler",
      "abstract": "Computational methods for learning to sample from the Boltzmann distribution—where the target distribution is known only up to an unnormalized energy function—have advanced significantly recently. Due to the lack of explicit target samples, however, prior diffusion-based methods, known as _diffusion samplers_, often require importance-weighted estimation or complicated learning processes. Both trade off scalability with extensive evaluations of the energy and model, thereby limiting their practical usage. In this work, we propose **Adjoint Schrödinger Bridge Sampler (ASBS)**, a new diffusion sampler that employs simple and scalable matching-based objectives yet without the need to estimate target samples during training. ASBS is grounded on a mathematical model—the Schrödinger Bridge—which enhances sampling efficiency via kinetic-optimal transportation. Through a new lens of stochastic optimal control theory, we demonstrate how SB-based diffusion samplers can be learned at scale via Adjoint Matching and prove convergence to the global solution. Notably, ASBS generalizes the recent Adjoint Sampling (Havens et al., 2025) to arbitrary source distributions by relaxing the so-called memoryless condition that largely restricts the design space. Through extensive experiments, we demonstrate the effectiveness of ASBS on sampling from classical energy functions, amortized conformer generation, and molecular Boltzmann distributions. Codes are available at https://github.com/facebookresearch/adjoint_samplers",
      "keywords": [
        "Boltzmann distribution",
        "diffusion sampler",
        "Schrödinger bridge"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "XoN10bZtR9",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "XoN10bZtR9",
      "title": "Rethinking Joint Maximum Mean Discrepancy for Visual Domain Adaptation",
      "abstract": "In domain adaption (DA), joint maximum mean discrepancy (JMMD), as a famous distribution-distance metric, aims to measure joint probability distribution difference between the source domain and target domain, while it is still not fully explored and especially hard to be applied into a subspace-learning framework as its empirical estimation involves a tensor-product operator whose partial derivative is difficult to obtain. To solve this issue, we deduce a concise JMMD based on the Representer theorem that avoids the tensor-product operator and obtains two essential findings. First, we reveal the uniformity of JMMD by proving that previous marginal, class conditional, and weighted class conditional probability distribution distances are three special cases of JMMD with different label reproducing kernels. Second, inspired by graph embedding, we observe that the similarity weights, which strengthen the intra-class compactness in the graph of Hilbert Schmidt independence criterion (HSIC), take opposite signs in the graph of JMMD, revealing why JMMD degrades the feature discrimination. This motivates us to propose a novel loss JMMD-HSIC by jointly considering JMMD and HSIC to promote discrimination of JMMD. Extensive experiments on several cross-domain datasets could demonstrate the validity of our revealed theoretical results and the effectiveness of our proposed JMMD-HSIC.",
      "keywords": [
        "domain adaptation",
        "JMMD",
        "HSIC",
        "feature discrimination",
        "graph embedding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "np5NmBQL4F",
      "title": "Isometry pursuit",
      "abstract": "Isometry pursuit is a convex algorithm for identifying orthonormal column-submatrices of wide matrices.\nIt consists of a vector normalization followed by multitask basis pursuit.\nApplied to Jacobians of putative coordinate functions, it helps identify locally isometric embeddings from within interpretable dictionaries.\nWe provide theoretical and experimental results justifying this method, including a proof with realistic assumptions that such isometric submatrices, should they exist, are contained within the obtained support.\nFor problems involving coordinate selection and diversification, it offers a synergistic alternative to greedy and brute force search.",
      "keywords": [
        "Manifold learning",
        "interpretability",
        "sparse coding"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "BSZqpqgqM0",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "9NHd6Z4aIi",
      "title": "Single-Step Diffusion via Direct Models",
      "abstract": "We introduce Direct Models, a generative modeling framework that enables single-step diffusion by learning a direct mapping from initial noise $x_0$ to all intermediate latent states along the generative trajectory. Unlike traditional diffusion models that rely on iterative denoising or integration, Direct Models leverages a progressive learning scheme where the mapping from $x_0$ to $x_{t + \\delta t}$ is composed as an update from $x_0$ to $x_t$ plus the velocity at time $t$. This formulation allows the model to learn the entire trajectory in a recursive, data-consistent manner while maintaining computational efficiency. At inference, the full generative path can be obtained in a single forward pass. Experimentally, we show that Direct Models achieves state-of-the-art sample quality among single-step diffusion methods while significantly reducing inference time.",
      "keywords": [
        "Efficient generative models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "BSZqpqgqM0",
      "title": "Why Diffusion Models Don’t Memorize:  The Role of Implicit Dynamical Regularization in Training",
      "abstract": "Diffusion models have achieved remarkable success across a wide range of generative tasks. A key challenge is understanding the mechanisms that prevent their memorization of training data and allow generalization. In this work, we investigate the role of the training dynamics in the transition from generalization to memorization. Through extensive experiments and theoretical analysis, we identify two distinct timescales: an early time $\\tau_\\mathrm{gen}$ at which models begin to generate high-quality samples, and a later time $\\tau_\\mathrm{mem}$ beyond which memorization emerges. Crucially, we find that $\\tau_\\mathrm{mem}$ increases linearly with the training set size $n$, while $\\tau_\\mathrm{gen}$ remains constant. This creates a growing window of training times with $n$ where models generalize effectively, despite showing strong memorization if training continues beyond it. It is only when $n$ becomes larger than a model-dependent threshold that overfitting disappears at infinite training times.\nThese findings reveal a form of implicit dynamical regularization in the training dynamics, which allow to avoid memorization even in highly overparameterized settings. Our results are supported by numerical experiments with standard U-Net architectures on realistic and synthetic  datasets, and by a theoretical analysis using a tractable random features model studied in the high-dimensional limit.",
      "keywords": [
        "Diffusion Models",
        "Deep Learning",
        "Probabilistic Methods"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "SDhOClkyqC",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "BSZqpqgqM0",
      "title": "Why Diffusion Models Don’t Memorize:  The Role of Implicit Dynamical Regularization in Training",
      "abstract": "Diffusion models have achieved remarkable success across a wide range of generative tasks. A key challenge is understanding the mechanisms that prevent their memorization of training data and allow generalization. In this work, we investigate the role of the training dynamics in the transition from generalization to memorization. Through extensive experiments and theoretical analysis, we identify two distinct timescales: an early time $\\tau_\\mathrm{gen}$ at which models begin to generate high-quality samples, and a later time $\\tau_\\mathrm{mem}$ beyond which memorization emerges. Crucially, we find that $\\tau_\\mathrm{mem}$ increases linearly with the training set size $n$, while $\\tau_\\mathrm{gen}$ remains constant. This creates a growing window of training times with $n$ where models generalize effectively, despite showing strong memorization if training continues beyond it. It is only when $n$ becomes larger than a model-dependent threshold that overfitting disappears at infinite training times.\nThese findings reveal a form of implicit dynamical regularization in the training dynamics, which allow to avoid memorization even in highly overparameterized settings. Our results are supported by numerical experiments with standard U-Net architectures on realistic and synthetic  datasets, and by a theoretical analysis using a tractable random features model studied in the high-dimensional limit.",
      "keywords": [
        "Diffusion Models",
        "Deep Learning",
        "Probabilistic Methods"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "QJtanJS4T9",
      "title": "Irreducible Loss Floors in Gradient Descent Convergence and Energy Footprint",
      "abstract": "Despite their central role, convergence analyses of the dynamics of loss functions\nduring training require strong assumptions (e.g convexity and smoothness) which\nare non-trivial to prove. In this work, we introduce a framework for deriving\nnecessary convergence conditions that hold without restrictive assumptions on\nthe dataset or the model architecture. By linking microscopic properties such as\nindividual sample losses and their gradient to macroscopic training dynamics, we\nderive tight lower bounds for loss functions, applicable to both full-batch and mini-\nbatch gradient systems. These bounds reveal the presence of irreducible floors\nthat optimizers cannot surpass and beyond theoretical guarantees, this framework offers a practical tool for anticipating convergence speed, and estimating\nminimum training time and energy requirements. Thus, this framework can be\nused to ensure the sustainability and feasibility of large-scale training regimes.",
      "keywords": [
        "gradient descent",
        "convergence",
        "loss bounds",
        "optimization",
        "training dynamics",
        "sustainability",
        "efficiency",
        "feasibility",
        "computational cost",
        "irreducible loss",
        "non-convex optimization",
        "lower bounds"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "QwXpn5IPKk",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "9NHd6Z4aIi",
      "title": "Single-Step Diffusion via Direct Models",
      "abstract": "We introduce Direct Models, a generative modeling framework that enables single-step diffusion by learning a direct mapping from initial noise $x_0$ to all intermediate latent states along the generative trajectory. Unlike traditional diffusion models that rely on iterative denoising or integration, Direct Models leverages a progressive learning scheme where the mapping from $x_0$ to $x_{t + \\delta t}$ is composed as an update from $x_0$ to $x_t$ plus the velocity at time $t$. This formulation allows the model to learn the entire trajectory in a recursive, data-consistent manner while maintaining computational efficiency. At inference, the full generative path can be obtained in a single forward pass. Experimentally, we show that Direct Models achieves state-of-the-art sample quality among single-step diffusion methods while significantly reducing inference time.",
      "keywords": [
        "Efficient generative models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "BSZqpqgqM0",
      "title": "Why Diffusion Models Don’t Memorize:  The Role of Implicit Dynamical Regularization in Training",
      "abstract": "Diffusion models have achieved remarkable success across a wide range of generative tasks. A key challenge is understanding the mechanisms that prevent their memorization of training data and allow generalization. In this work, we investigate the role of the training dynamics in the transition from generalization to memorization. Through extensive experiments and theoretical analysis, we identify two distinct timescales: an early time $\\tau_\\mathrm{gen}$ at which models begin to generate high-quality samples, and a later time $\\tau_\\mathrm{mem}$ beyond which memorization emerges. Crucially, we find that $\\tau_\\mathrm{mem}$ increases linearly with the training set size $n$, while $\\tau_\\mathrm{gen}$ remains constant. This creates a growing window of training times with $n$ where models generalize effectively, despite showing strong memorization if training continues beyond it. It is only when $n$ becomes larger than a model-dependent threshold that overfitting disappears at infinite training times.\nThese findings reveal a form of implicit dynamical regularization in the training dynamics, which allow to avoid memorization even in highly overparameterized settings. Our results are supported by numerical experiments with standard U-Net architectures on realistic and synthetic  datasets, and by a theoretical analysis using a tractable random features model studied in the high-dimensional limit.",
      "keywords": [
        "Diffusion Models",
        "Deep Learning",
        "Probabilistic Methods"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "WrYWolqKh3",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "NM8Apk61NA",
      "title": "HyperET: Efficient Training in Hyperbolic Space for Multi-modal Large Language Models",
      "abstract": "Multi-modal large language models (MLLMs) have emerged as a transformative approach for aligning visual and textual understanding. They typically require extremely high computational resources (e.g., thousands of GPUs) for training to achieve cross-modal alignment at multi-granularity levels. We argue that a key source of this inefficiency lies in the vision encoders they widely equip with, e.g., CLIP and SAM, which lack the alignment with language at multi-granularity levels. To address this issue, in this paper, we leverage hyperbolic space, which inherently models hierarchical levels and thus provides a principled framework for bridging the granularity gap between visual and textual modalities at an arbitrary granularity level. Concretely, we propose an efficient training paradigm for MLLMs, dubbed as \\blg, which can optimize visual representations to align with their textual counterparts at an arbitrary granularity level through dynamic hyperbolic radius adjustment in hyperbolic space. \\alg employs learnable matrices with M\\\"{o}bius multiplication operations, implemented via three effective configurations: diagonal scaling matrices, block-diagonal matrices, and banded matrices, providing a flexible yet efficient parametrization strategy. Comprehensive experiments across multiple MLLM benchmarks demonstrate that \\alg consistently improves both existing pre-training and fine-tuning MLLMs clearly with less than 1\\% additional parameters. Code is available at \\url{https://github.com/godlin-sjtu/HyperET}.",
      "keywords": [
        "Efficient Training",
        "Multi-modal Large Language Models",
        "Granularity Levels",
        "Hyperbolic Space"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eR8raBLZW7",
      "title": "BriLLM: Brain-inspired Large Language Model",
      "abstract": "This paper reports the brain-inspired large language model (BriLLM). This is a non-Transformer, non-GPT, non-traditional machine learning input-output controlled generative language model. The model is based on the Signal Fully-connected flowing (SiFu) definition on the directed graph in terms of the neural network, and has the interpretability of all nodes on the graph of the whole model, instead of the traditional machine learning model that only has limited interpretability at the input and output ends. In the language model scenario, the token is defined as a node in the graph. A randomly shaped or user-defined signal flow flows between nodes on the principle of \"least resistance\" along paths. The next token or node to be predicted or generated is the target of the signal flow. As a language model, BriLLM theoretically supports infinitely long $n$-gram models when the model size is independent of the input and predicted length of the model. The model's working signal flow provides the possibility of recall activation and innate multi-modal support similar to the cognitive patterns of the human brain. At present, we released the first BriLLM versions in Chinese and English, with 4000 tokens, 32-dimensional node size, 32-token sequence prediction ability, model sizes around 2B and 1B respectively, bringing language model prediction performance comparable to GPT-1.",
      "keywords": [
        "LLM"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "hTbimOuFPM",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8P3QNSckMp",
      "title": "A Clean Slate for Offline Reinforcement Learning",
      "abstract": "Progress in offline reinforcement learning (RL) has been impeded by ambiguous problem definitions and entangled algorithmic designs, resulting in inconsistent implementations, insufficient ablations, and unfair evaluations. Although offline RL explicitly avoids environment interaction, prior methods frequently employ extensive, undocumented online evaluation for hyperparameter tuning, complicating method comparisons. Moreover, existing reference implementations differ significantly in boilerplate code, obscuring their core algorithmic contributions. We address these challenges by first introducing a rigorous taxonomy and a transparent evaluation protocol that explicitly quantifies online tuning budgets. To resolve opaque algorithmic design, we provide clean, minimalistic, single-file implementations of various model-free and model-based offline RL methods, significantly enhancing clarity and achieving substantial speed-ups. Leveraging these streamlined implementations, we propose Unifloral, a unified algorithm that encapsulates diverse prior approaches and enables development within a single, comprehensive hyperparameter space. Using Unifloral with our rigorous evaluation protocol, we develop two novel algorithms - TD3-AWR (model-free) and MoBRAC (model-based) - which substantially outperform established baselines. Our implementation is publicly available at https://github.com/EmptyJackson/unifloral.",
      "keywords": [
        "Offline Reinforcement Learning",
        "Evaluation",
        "Open-Source"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "hTbimOuFPM",
      "title": "An efficient implementation for solving the all pairs minimax path problem in an undirected dense graph",
      "abstract": "We provide an efficient $ O(n^2) $ implementation for solving the all pairs minimax path problem or  widest path problem in an undirected dense graph. The distance matrix is also called the all points path distance (APPD). We conducted experiments to test the implementation and algorithm, compared it with several other algorithms for solving the APPD matrix.  Result shows Algorithm 4 works good for solving the widest path or minimax path APPD matrix.  It can drastically improve the efficiency for computing the APPD matrix.  There are several theoretical outcomes which claim the APPD matrix can be solved accurately in $ O(n^2) $ . However, they are impractical because there is no code implementation of these algorithms. Algorithm 4 is the first algorithm that has an actual code implementation for solving the APPD matrix of minimax path or widest path problem in $ O(n^2) $, in an undirected dense graph.",
      "keywords": [
        "Minimax path problem",
        "Longest-leg path distance",
        "Min-Max-Jump distance",
        "Widest path problem",
        "Maximum capacity path problem",
        "Bottleneck edge query problem",
        "All points path distance",
        "Floyd-Warshall algorithm",
        "Minimum spanning tree"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "2Ri68h7bD1",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2Ri68h7bD1",
      "title": "Dale's Law Meets Geometric Brownian Motion: Multiplicative Updates for Sampling",
      "abstract": "Gradient descent has proven to be a powerful and effective technique for optimization in numerous machine learning applications. Recent advances in computational neuroscience have shown that learning in standard gradient descent optimization formulation is not consistent with learning in biological systems. This has opened up interesting avenues for building biologically inspired learning techniques. One such approach is inspired by Dale's law, which states that inhibitory and excitatory synapses do not swap roles during the course of learning. The resulting exponential gradient descent optimization scheme leads to log-normally distributed synaptic weights. Interestingly, the density that satisfies the Fokker-Planck equation corresponding to the stochastic differential equation (SDE) with geometric Brownian motion (GBM) is the log-normal density. Leveraging this connection, we start with the SDE governing geometric Brownian motion, and show that discretizing the corresponding reverse-time SDE yields a multiplicative update rule, which surprisingly, coincides with the sampling equivalent of the exponential gradient descent update founded on Dale's law. Proceeding further, we propose a new formalism for multiplicative denoising score-matching, which subsumes the loss function proposed by Hyvaerinen for non-negative data. Indeed, log-normally distributed data is positive and the proposed score-matching formalism turns out to be a natural fit. This allows for training of score-based models for image data and results in a novel multiplicative update scheme for sample generation starting from a log-normal density. Experimental results on MNIST, Fashion MNIST, and Kuzushiji datasets demonstrate generative capability of the new scheme. To the best of our knowledge, this is the first instance of a biologically inspired generative model employing multiplicative updates, founded on geometric Brownian motion.",
      "keywords": [
        "Dale's Law",
        "Geometric Brownian Motion",
        "Stochastic Differential Equations",
        "Score-matching"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "uWj4s7rMnR",
      "title": "Mean Flows for One-step Generative Modeling",
      "abstract": "We propose a principled and effective framework for one-step generative modeling. We introduce the notion of average velocity to characterize flow fields, in contrast to instantaneous velocity modeled by Flow Matching methods. A well-defined identity between average and instantaneous velocities is derived and used to guide neural network training. Our method, termed the \\textit{MeanFlow} model, is self-contained and requires no pre-training, distillation, or curriculum learning. MeanFlow demonstrates strong empirical performance: it achieves an FID of 3.43 with a single function evaluation (1-NFE) on ImageNet 256$\\times$256 trained from scratch, significantly outperforming previous state-of-the-art one-step diffusion/flow models. Our study substantially narrows the gap between one-step diffusion/flow models and their multi-step predecessors, and we hope it will motivate future research to revisit the foundations of these powerful models.",
      "keywords": [
        "Generative Models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "ZqwyrPXbV9",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "KnqiC0znVF",
      "title": "Large Language Diffusion Models",
      "abstract": "The capabilities of large language models (LLMs) are widely regarded as relying on autoregressive models (ARMs). We challenge this notion by introducing *LLaDA*, a diffusion model trained from scratch under the pre-training and supervised fine-tuning (SFT) paradigm. LLaDA employs a forward data masking process and a reverse generation process, parameterized by a Transformer to predict masked tokens. It provides a principled generative approach for probabilistic inference by optimizing a likelihood lower bound. Across extensive benchmarks on general tasks, math, code, and so on, LLaDA demonstrates strong *scalability* and performs comparably to our self-constructed ARM baselines. Remarkably, LLaDA 8B is competitive with strong LLMs like LLaMA3 8B in *in-context learning* and, after SFT, exhibits impressive *instruction-following* abilities in case studies such as multi-turn dialogue. Moreover, LLaDA addresses the reversal curse, surpassing GPT-4o in a reversal poem completion task. Our findings show the promise of diffusion models for language modeling at scale and challenge the common assumption that core LLM capabilities discussed above inherently depend on ARMs. Project page and codes: \\url{https://ml-gsai.github.io/LLaDA-demo/}.",
      "keywords": [
        "diffusion language models",
        "large language models",
        "masked diffusion models",
        "discrete diffusion models",
        "diffusion models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ZqwyrPXbV9",
      "title": "Concept Attractors in LLMs and their Applications",
      "abstract": "Large language models (LLMs) often map semantically related prompts to similar internal representations at specific layers, even when their surface forms differ widely. We show that this behavior can be generalized and explained through Iterated Function Systems (IFS), where layers act as contractive mappings toward concept-specific Attractors. We leverage this insight and develop simple, training-free methods that operate directly on these attractors to solve a wide range of practical tasks, including **language translation**, **hallucination reduction**, **guardrailing**, and **synthetic data generation**. Despite their simplicity, these attractor-based interventions match or exceed specialized baselines, offering an efficient alternative to heavy fine-tuning, generalizable in scenarios where baselines underperform.",
      "keywords": [
        "Large Language Models",
        "Dynamic Systems",
        "Attractors",
        "Guardrails",
        "Transpiler",
        "Steering",
        "Hallucinations",
        "Synthetic Data"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "yThwhNCaZN",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "NM8Apk61NA",
      "title": "HyperET: Efficient Training in Hyperbolic Space for Multi-modal Large Language Models",
      "abstract": "Multi-modal large language models (MLLMs) have emerged as a transformative approach for aligning visual and textual understanding. They typically require extremely high computational resources (e.g., thousands of GPUs) for training to achieve cross-modal alignment at multi-granularity levels. We argue that a key source of this inefficiency lies in the vision encoders they widely equip with, e.g., CLIP and SAM, which lack the alignment with language at multi-granularity levels. To address this issue, in this paper, we leverage hyperbolic space, which inherently models hierarchical levels and thus provides a principled framework for bridging the granularity gap between visual and textual modalities at an arbitrary granularity level. Concretely, we propose an efficient training paradigm for MLLMs, dubbed as \\blg, which can optimize visual representations to align with their textual counterparts at an arbitrary granularity level through dynamic hyperbolic radius adjustment in hyperbolic space. \\alg employs learnable matrices with M\\\"{o}bius multiplication operations, implemented via three effective configurations: diagonal scaling matrices, block-diagonal matrices, and banded matrices, providing a flexible yet efficient parametrization strategy. Comprehensive experiments across multiple MLLM benchmarks demonstrate that \\alg consistently improves both existing pre-training and fine-tuning MLLMs clearly with less than 1\\% additional parameters. Code is available at \\url{https://github.com/godlin-sjtu/HyperET}.",
      "keywords": [
        "Efficient Training",
        "Multi-modal Large Language Models",
        "Granularity Levels",
        "Hyperbolic Space"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "yThwhNCaZN",
      "title": "KG-BiLM: Knowledge Graph Embedding via Bidirectional Language Models",
      "abstract": "Recent advances in knowledge representation learning (KRL) highlight the urgent necessity to unify symbolic knowledge graphs (KGs) with language models (LMs) for richer semantic understanding. However, existing approaches typically prioritize either graph structure or textual semantics, leaving a gap: a unified framework that simultaneously captures global KG connectivity, nuanced linguistic context, and discriminative reasoning semantics. To bridge this gap, we introduce KG-BiLM, a bidirectional LM framework that fuses structural cues from KGs with the semantic expressiveness of generative transformers. KG-BiLM incorporates three key components: (i) Bidirectional Knowledge Attention, which removes the causal mask to enable full interaction among all tokens and entities; (ii) Knowledge-Masked Prediction, which encourages the model to leverage both local semantic contexts and global graph connectivity; and (iii) Contrastive Graph Semantic Aggregation, which preserves KG structure via contrastive alignment of sampled sub-graph representations. Extensive experiments on standard benchmarks demonstrate that KG-BiLM outperforms strong baselines in link prediction, especially on large-scale graphs with complex multi-hop relations—validating its effectiveness in unifying structural information and textual semantics.",
      "keywords": [
        "Knowledge Graph Embedding",
        "Language Model",
        "Transformer",
        "Attention Mechanism"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "pv964N1RYb",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8EWo0gX5TW",
      "title": "Neural SDEs as a Unified Approach to Continuous-Domain Sequence Modeling",
      "abstract": "Inspired by the ubiquitous use of differential equations to model continuous dynamics across diverse scientific and engineering domains, we propose a novel and intuitive approach to continuous sequence modeling. Our method interprets timeseries data as discrete samples from an underlying continuous dynamical system, and models its time evolution using Neural Stochastic Differential Equation (Neural SDE), where both the flow (drift) and diffusion terms are parameterized by neural networks. We derive a principled maximum likelihood objective and a simulationfree scheme for efficient training of our Neural SDE model. We demonstrate the versatility of our approach through experiments on sequence modeling tasks across both embodied and generative AI. Notably, to the best of our knowledge, this is the first work to show that SDEbased continuous-time modeling also excels in such complex scenarios, and we hope that our\nwork opens up new avenues for research of SDE models in high-dimensional and temporally intricate domains.",
      "keywords": [
        "Neural Stochastic Differential Equations",
        "Flow Matching",
        "Diffusion Models",
        "Continuous-Time Sequence Modeling."
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ImpizBSKcu",
      "title": "Dynamical Decoupling of Generalization and Overfitting in Large Two-Layer Networks",
      "abstract": "Understanding the inductive bias and generalization properties of large overparametrized machine learning models requires to characterize the dynamics of the training algorithm.  We study the learning dynamics of large two-layer neural networks via dynamical mean field theory, a well established technique of non-equilibrium statistical physics. We show that, for large network width $m$,\nand large number of samples per input dimension $n/d$, the training dynamics exhibits a separation of timescales which implies:\n$(i)$ The emergence of a slow time scale associated with the growth in Gaussian/Rademacher complexity of the network;\n$(ii)$ Inductive bias towards small complexity if the initialization has small enough complexity;\n$(iii)$ A dynamical decoupling between feature learning and overfitting regimes; $(iv)$ A non-monotone behavior of the test error, associated  `feature unlearning' regime at large times.",
      "keywords": [
        "Overfitting; feature learning; dynamical mean field theory; generalization;"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "8YniJnJQ0P",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "Q3qAsZAEZw",
      "title": "Understanding and Mitigating Numerical Sources of Nondeterminism in LLM Inference",
      "abstract": "Large Language Models (LLMs) are now integral across various domains and have demonstrated impressive performance. Progress, however, rests on the premise that benchmark scores are both accurate and reproducible. We demonstrate that the reproducibility of LLM performance is fragile: changing system configuration, such as evaluation batch size, GPU count, and GPU version, can introduce significant differences in the generated responses. \nThis issue is especially pronounced in reasoning models, where minor rounding differences in early tokens can cascade into divergent chains of thought, ultimately affecting accuracy. For instance, under bfloat16 precision with greedy decoding, a reasoning model like DeepSeek-R1-Distill-Qwen-7B can exhibit up to 9\\% variation in accuracy and 9,000 tokens difference in response length due to differences in GPU count, type, and evaluation batch size.\nWe trace the root cause of this variability to the non-associative nature of floating-point arithmetic under limited numerical precision. \nThis work presents the first systematic investigation into how numerical precision affects reproducibility in LLM inference. Through carefully controlled experiments across various hardware, software, and precision settings, we quantify when and how model outputs diverge.\nOur analysis reveals that floating-point precision—while critical for reproducibility—is often neglected in evaluation practices.\nInspired by this, we develop a lightweight inference pipeline, dubbed LayerCast, that stores weights in 16-bit precision but performs all computations in FP32, balancing memory efficiency with numerical stability. Code is available at https://github.com/nanomaoli/llm_reproducibility.",
      "keywords": [
        "Large Language Models (LLMs)",
        "Reproducibility",
        "Numerical precision",
        "Deterministic inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "obXGSmmG70",
      "title": "AdaCoT: Pareto-Optimal Adaptive Chain-of-Thought Triggering via Reinforcement Learning",
      "abstract": "Large Language Models (LLMs) have demonstrated remarkable capabilities but often face challenges with tasks requiring sophisticated reasoning. While Chain-of-Thought (CoT) prompting significantly enhances reasoning, it indiscriminately generates lengthy reasoning steps for all queries, leading to substantial computational costs and inefficiency, especially for simpler inputs. To address this critical issue, we introduce AdaCoT (Adaptive Chain-of-Thought), a novel framework enabling LLMs to adaptively decide when to invoke CoT. AdaCoT framed adaptive reasoning as a Pareto optimization problem that seeks to balance model performance with the costs associated with CoT invocation (both frequency and computational overhead). We propose a reinforcement learning (RL) based method, specifically utilizing Proximal Policy Optimization (PPO), to dynamically control the CoT triggering decision boundary by adjusting penalty coefficients, thereby allowing the model to determine CoT necessity based on implicit query complexity. A key technical contribution is Selective Loss Masking (SLM), designed to counteract decision boundary collapse during multi-stage RL training, ensuring robust and stable adaptive triggering. Experimental results demonstrate that AdaCoT successfully navigates the Pareto frontier, achieving substantial reductions in CoT usage for queries not requiring elaborate reasoning. For instance, on our production traffic testset, AdaCoT reduced CoT triggering rates to as low as 3.18% and decreased average response tokens by 69.06% on APP, while maintaining high performance on complex tasks. This substantial token decrease directly translates to a significant reduction in inference computational load. AdaCoT pioneers adaptive CoT triggering, offering a practical and principled solution for developing more efficient, responsive, and cost-effective LLMs, particularly crucial for interactive and resource-sensitive applications.",
      "keywords": [
        "Adaptive Reasoning",
        "Chain-of-Thought",
        "Large Language Models"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "Ceb788Uigr",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2Ri68h7bD1",
      "title": "Dale's Law Meets Geometric Brownian Motion: Multiplicative Updates for Sampling",
      "abstract": "Gradient descent has proven to be a powerful and effective technique for optimization in numerous machine learning applications. Recent advances in computational neuroscience have shown that learning in standard gradient descent optimization formulation is not consistent with learning in biological systems. This has opened up interesting avenues for building biologically inspired learning techniques. One such approach is inspired by Dale's law, which states that inhibitory and excitatory synapses do not swap roles during the course of learning. The resulting exponential gradient descent optimization scheme leads to log-normally distributed synaptic weights. Interestingly, the density that satisfies the Fokker-Planck equation corresponding to the stochastic differential equation (SDE) with geometric Brownian motion (GBM) is the log-normal density. Leveraging this connection, we start with the SDE governing geometric Brownian motion, and show that discretizing the corresponding reverse-time SDE yields a multiplicative update rule, which surprisingly, coincides with the sampling equivalent of the exponential gradient descent update founded on Dale's law. Proceeding further, we propose a new formalism for multiplicative denoising score-matching, which subsumes the loss function proposed by Hyvaerinen for non-negative data. Indeed, log-normally distributed data is positive and the proposed score-matching formalism turns out to be a natural fit. This allows for training of score-based models for image data and results in a novel multiplicative update scheme for sample generation starting from a log-normal density. Experimental results on MNIST, Fashion MNIST, and Kuzushiji datasets demonstrate generative capability of the new scheme. To the best of our knowledge, this is the first instance of a biologically inspired generative model employing multiplicative updates, founded on geometric Brownian motion.",
      "keywords": [
        "Dale's Law",
        "Geometric Brownian Motion",
        "Stochastic Differential Equations",
        "Score-matching"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WhEPg4mUs6",
      "title": "Analog In-memory Training on General Non-ideal Resistive Elements: The Impact of Response Functions",
      "abstract": "As the economic and environmental costs of training and deploying large vision or language models increase dramatically, analog in-memory computing (AIMC) emerges as a promising energy-efficient solution. However, the training perspective, especially its training dynamic, is underexplored. In AIMC hardware, the trainable weights are represented by the conductance of resistive elements and updated using consecutive electrical pulses.  While the conductance changes by a constant in response to each pulse, in reality, the change is scaled by asymmetric and non-linear response functions, leading to a non-ideal training dynamic. This paper provides a theoretical foundation for gradient-based training on AIMC hardware with non-ideal response functions.  We demonstrate that asymmetric response functions negatively impact Analog SGD by imposing an implicit penalty on the objective. To overcome the issue, we propose residual learning algorithm, which provably converges exactly to a critical point by solving a bilevel optimization problem. We show that the proposed method can be extended to deal with other hardware imperfections like limited response granularity. As far as we know, it is the first paper to investigate the impact of a class of generic non-ideal response functions. The conclusion is supported by simulations validating our theoretical insights.",
      "keywords": [
        "Analog AI; in-memory computing; stochastic gradient descent; stochastic optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "mtJSMcF3ek",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DzKdjWe59v",
      "title": "Hint Marginalization for Improved Reasoning in Large Language Models",
      "abstract": "Large Language Models (LLMs) have exhibited an impressive capability to perform reasoning tasks, especially if they are encouraged to generate a sequence of intermediate steps. Reasoning performance can be improved by suitably combining multiple LLM responses, generated either in parallel in a single query, or via sequential interactions with LLMs throughout the reasoning process. Existing strategies for combination, such as self-consistency and progressive-hint-prompting, make inefficient usage of the LLM responses. We present Hint Marginalization, a novel and principled algorithmic framework to enhance the reasoning capabilities of LLMs. Our approach can be viewed as an iterative sampling strategy for forming a Monte Carlo approximation of an underlying distribution of answers, with the goal of identifying the mode the most likely answer. Empirical evaluation on several benchmark datasets for arithmetic reasoning demonstrates the superiority of the proposed approach.",
      "keywords": [
        "reasoning",
        "large language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtJSMcF3ek",
      "title": "Mind the Gap: Examining the Self-Improvement Capabilities of Large Language Models",
      "abstract": "Self-improvement is a mechanism in Large Language Model (LLM) pre-training, post-training and test-time inference. We explore a framework where the model verifies its own outputs, filters or reweights data based on this verification, and distills the filtered data.  Despite several empirical successes, a fundamental understanding is still lacking. In this work, we initiate a comprehensive, modular and controlled study on LLM self-improvement. We provide a mathematical formulation for self-improvement, which is largely governed by a quantity which we formalize as the **generation-verification gap**. Through experiments with various model families and tasks, we discover a scaling phenomenon of self-improvement -- a variant of the generation-verification gap scales monotonically with the model pre-training flops. We also examine when self-improvement is possible, an iterative self-improvement procedure, and ways to improve its performance. Our findings not only advance understanding of LLM self-improvement with practical implications, but also open numerous avenues for future research into its capabilities and boundaries.",
      "keywords": [
        "LLM",
        "self-improvement",
        "synthetic data",
        "post-training",
        "test-time optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "GRMfXcAAFh",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "GRMfXcAAFh",
      "title": "Oscillatory State-Space Models",
      "abstract": "We propose Linear Oscillatory State-Space models (LinOSS) for efficiently learning on long sequences. Inspired by cortical dynamics of biological neural networks, we base our proposed LinOSS model on a system of forced harmonic oscillators. A stable discretization, integrated over time using fast associative parallel scans, yields the proposed state-space model. We prove that LinOSS produces stable dynamics only requiring nonnegative diagonal state matrix. This is in stark contrast to many previous state-space models relying heavily on restrictive parameterizations. Moreover, we rigorously show that LinOSS is universal, i.e., it can approximate any continuous and causal operator mapping between time-varying functions, to desired accuracy. In addition, we show that an implicit-explicit discretization of LinOSS perfectly conserves the symmetry of time reversibility of the underlying dynamics. Together, these properties enable efficient modeling of long-range interactions, while ensuring stable and accurate long-horizon forecasting. Finally, our empirical results, spanning a wide range of time-series tasks from mid-range to very long-range classification and regression, as well as long-horizon forecasting, demonstrate that our proposed LinOSS model consistently outperforms state-of-the-art sequence models. Notably, LinOSS outperforms Mamba and LRU by nearly 2x on a sequence modeling task with sequences of length 50k.",
      "keywords": [
        "state-space models",
        "sequence models",
        "oscillators",
        "long-range interactions",
        "time-series"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WWymYrA48K",
      "title": "Test Time Learning for Time Series Forecasting",
      "abstract": "We propose the use of Test-Time Training (TTT) modules in a cascade architecture to enhance performance in long-term time series forecasting. Through extensive experiments on standard benchmark datasets, we demonstrate that TTT modules consistently outperform state-of-the-art models, including Mamba-based TimeMachine, particularly in scenarios involving extended sequence and prediction lengths. Our results show significant improvements, especially on larger datasets such as Electricity, Traffic, and Weather, underscoring the effectiveness of TTT in capturing long-range dependencies. Additionally, we explore various convolutional architectures within the TTT framework, showing that convolutional blocks as hidden layer architectures can achieve competitive results.",
      "keywords": [
        "Time Series Forecasting",
        "Test-Time Training",
        "Mamba",
        "Expressive Hidden States",
        "Modern CNN"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "k3tbMMW8rH",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "Da3j02cHe0",
      "title": "Efficient Physics-Constrained Diffusion Models for Solving Inverse Problems",
      "abstract": "Solving inverse problems in scientific and engineering domains often involves complex, nonlinear forward physics and ill-posed conditions. \nRecent advancements in diffusion model have shown promise for general inverse problems, yet their application to scientific domains remains less explored and is hindered by the complexity and high non-linearity of physics constraints. We present a physics-constrained diffusion model (PCDM) designed to solve inverse problems in scientific and engineering domains by efficiently integrating pre-trained diffusion models and physics-constrained objectives.\nWe leverage accelerated diffusion sampling to enable a practical generation process while strictly adhering to physics constraints by solving optimization problems at each timestep. By decoupling the likelihood optimization from the reverse diffusion steps, we ensure that the solutions remain physically consistent, even when employing fewer sampling steps.\nWe validate our method on a wide range of challenging physics-constrained inverse problems, including data assimilation, topology optimization, and full-waveform inversion. Experimental results show that our approach significantly outperforms existing methods in efficiency and precision, making it practical for real-world applications.",
      "keywords": [
        "physics-constraints inverse problem",
        "diffusion model",
        "PDE",
        "generative modeling"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "k3tbMMW8rH",
      "title": "Feedback Schrödinger Bridge Matching",
      "abstract": "Recent advancements in diffusion bridges for distribution transport problems have heavily relied on matching frameworks, yet existing methods often face a trade-off between scalability and access to optimal pairings during training. \nFully unsupervised methods make minimal assumptions but incur high computational costs, limiting their practicality. On the other hand, imposing full supervision of the matching process with optimal pairings improves scalability, however, it can be infeasible in most applications.\nTo strike a balance between scalability and minimal supervision, we introduce Feedback Schrödinger Bridge Matching (FSBM), a novel semi-supervised matching framework that incorporates a small portion ($<8$% of the entire dataset) of pre-aligned pairs as state feedback to guide the transport map of non-coupled samples, thereby significantly improving efficiency. This is achieved by formulating a static Entropic Optimal Transport (EOT) problem with an additional term capturing the semi-supervised guidance. The generalized EOT objective is then recast into a dynamic formulation to leverage the scalability of matching frameworks. Extensive experiments demonstrate that FSBM accelerates training and enhances generalization by leveraging coupled pairs' guidance, opening new avenues for training matching frameworks with partially aligned datasets.",
      "keywords": [
        "Diffusion models",
        "Schrödinger bridge",
        "Distribution matching",
        "Semi-Supervised Learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "xoXn62FzD0",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8QTpYC4smR",
      "title": "Systematic Review of Large Language Models: Applications, Limitations, Practical Usages and Future Directions",
      "abstract": "Large Language Models have revolutionized natural language processing with their remarkable ability to understand and generate human-like text. This review explores the various applications of large language models, highlighting their versatility across different domains. The paper begins with an introduction to LLMs, followed by an overview of their types and a detailed literature review. We then examine their limitations before delving into specific applications such as text generation, translation, summarization, and more. Finally, we discuss future directions for research and development, concluding with a summary of key findings and the potential impact of large language models on various industries.",
      "keywords": [
        "Large Language Models",
        "Systematic Review"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xoXn62FzD0",
      "title": "Syntactic and Semantic Control of Large Language Models via Sequential Monte Carlo",
      "abstract": "A wide range of LM applications require generating text that conforms to syntactic or semantic constraints. Imposing such constraints can be naturally framed as _probabilistic conditioning_, but exact generation from the resulting distribution—which can differ substantially from the LM’s base distribution—is generally intractable. In this work,\nwe develop an architecture for controlled LM generation based on sequential Monte Carlo (SMC). Our SMC framework allows us to flexibly incorporate domain- and problem-specific constraints at inference time, and efficiently reallocate computational resources in light of new information during the course of generation. By comparing to a number of alternatives and ablations on four challenging domains---Python code generation for data science, text-to-SQL, goal inference, and molecule synthesis—we demonstrate that, with little overhead, our approach allows small open-source language models to outperform models over 8$\\times$ larger, as well as closed-source, fine-tuned ones. \nIn support of the probabilistic perspective, we show that these performance improvements are driven by better approximation to the posterior distribution. \n[Our system](https://github.com/probcomp/genlm-control) builds on the framework of Lew et al. (2023) and integrates with its _language model probabilistic programming language_, giving users a simple, programmable way to apply SMC to a broad variety of controlled generation problems.",
      "keywords": [
        "Sequential Monte Carlo",
        "Language Models",
        "Semantic parsing",
        "Bayesian inference",
        "Probabilistic programming",
        "SMC"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "3b9SKkRAKw",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LyJi5ugyJx",
      "title": "Simplifying, Stabilizing and Scaling Continuous-time Consistency Models",
      "abstract": "Consistency models (CMs) are a powerful class of diffusion-based generative models optimized for fast sampling. Most existing CMs are trained using discretized timesteps, which introduce additional hyperparameters and are prone to discretization errors. While continuous-time formulations can mitigate these issues, their success has been limited by training instability. To address this, we propose a simplified theoretical framework that unifies previous parameterizations of diffusion models and CMs, identifying the root causes of instability. Based on this analysis, we introduce key improvements in diffusion process parameterization, network architecture, and training objectives. These changes enable us to train continuous-time CMs at an unprecedented scale, reaching 1.5B parameters on ImageNet 512×512. Our proposed training algorithm, using only two sampling steps, achieves FID scores of 2.06 on CIFAR-10, 1.48 on ImageNet 64×64, and 1.88 on ImageNet 512×512, narrowing the gap in FID scores with the best existing diffusion models to within 10\\%.",
      "keywords": [
        "continuous-time consistency models",
        "diffusion models",
        "fast sampling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "b3VzHRXrXh",
      "title": "Causal Frameworks and Feature Discrepancy Loss: Addressing Data Scarcity and Enhancing Medical Image Segmentation",
      "abstract": "Data scarcity poses a significant challenge for deep learning models in medical imaging, particularly for training and generalization. Previous studies have demonstrated the efficacy of data pooling from various sources, facilitating the analysis of weak but significant correlations between imaging data and disease incidence. This approach is often constrained by strict data-sharing protocols among institutions, resulting in models reliant on external data sources. In this work, we address the issue of data scarcity by leveraging the available data for segmentation tasks across various medical imaging modalities. Based on our observation that samples with minimal foreground-background feature differences often demonstrate inadequate segmentation performance, we propose a causal-inspired foreground-background feature discrepancy penalty function, which improves feature separation and alleviates segmentation difficulties caused by homogeneous pixel distributions. The proposed feature discrepancy loss is mathematically grounded, with a lower bound defined by the negative logarithm of the Dice coefficient, suggesting that increased feature separation correlates with improved Dice scores. To further validate our approach, we introduce a novel ultrasound dataset for triple-negative breast cancer (TNBC), and we evaluate the method across three state-of-the-art segmentation architectures to demonstrate competitive performance. In addition, the results highlight the robustness of our method in mitigating performance decrease due to distribution shifts when new, differently distributed data batches are introduced.",
      "keywords": [
        "causal reasoning",
        "bioemdical image segmentation",
        "data dilemma"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "8oFvUBvF1u",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "meRCKuUpmc",
      "title": "Predictive Inverse Dynamics Models are Scalable Learners for Robotic Manipulation",
      "abstract": "Current efforts to learn scalable policies in robotic manipulation primarily fall into two categories: one focuses on \"action,\" which involves behavior cloning from extensive collections of robotic data, while the other emphasizes \"vision,\" enhancing model generalization by pre-training representations or generative models, also referred to as world models, using large-scale visual datasets. This paper presents an end-to-end paradigm that predicts actions using inverse dynamics models conditioned on the robot's forecasted visual states, named Predictive Inverse Dynamics Models (PIDM). By closing the loop between vision and action, the end-to-end PIDM can be a better scalable action learner. In practice, we use Transformers to process both visual states and actions, naming the model Seer. It is initially pre-trained on large-scale robotic datasets, such as DROID, and can be adapted to real-world scenarios with a little fine-tuning data. Thanks to large-scale, end-to-end training and the continuous synergy between vision and action at each execution step, Seer significantly outperforms state-of-the-art methods across both simulation and real-world experiments. It achieves improvements of 13% on the LIBERO-LONG benchmark, 22% on CALVIN ABC-D, and 43% in real-world tasks. Notably, it demonstrates superior generalization for novel objects, lighting conditions, and environments under high-intensity disturbances. Code and models will be publicly available.",
      "keywords": [
        "Robotic Manipulation ; Pre-training ; Visual Foresight ; Inverse Dynamics ; Large-scale robot dataset"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xcHIiZr3DT",
      "title": "Vision-Based Pseudo-Tactile Information Extraction and Localization for Dexterous Grasping",
      "abstract": "This study addresses the challenges of tactile perception in robotic dexterous hand grasping by focusing on two main tasks: 1) Acquiring tactile information from everyday objects using vision, termed \"pseudo-tactile\" information, and 2) Building a Dexterous Hand (RH8D) model in Isaac Sim for real-time fingertip contact localization. Utilizing Isaac Sim enables safe, cost-effective experimentation and high-precision simulations that facilitate data collection for model validation. The research establishes a scientific connection between simulated 3D coordinates, actual 3D coordinates, and pseudo-tactile information derived from point clouds, quantified through normal vectors and grayscale variance analysis. Results demonstrate the ability to extract clear object surface textures, accurately locate fingertip contact points in real-time (with precision up to $0.001 m$), and provide tactile information at contact points. This framework enhances robotic grasping capabilities and offers low-cost sensory data. The source code and dataset are publicly available now.",
      "keywords": [
        "Pseudo-Tactile Information",
        "Dexterous Grasping",
        "Vision-Based Perception",
        "Robotic Localization"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "YcbE2K3i2E",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "WOzffPgVjF",
      "title": "Knowing Your Target: Target-Aware Transformer Makes Better Spatio-Temporal Video Grounding",
      "abstract": "Transformer has attracted increasing interest in spatio-temporal video grounding, or STVG, owing to its end-to-end pipeline and promising result. Existing Transformer-based STVG approaches often leverage a set of object queries, which are initialized simply using zeros and then gradually learn target position information via iterative interactions with multimodal features, for spatial and temporal localization. Despite simplicity, these zero object queries, due to lacking target-specific cues, are hard to learn discriminative target information from interactions with multimodal features in complicated scenarios (e.g., with distractors or occlusion), resulting in degradation. Addressing this, we introduce a novel $\\textbf{T}$arget-$\\textbf{A}$ware Transformer for $\\textbf{STVG}$ ($\\textbf{TA-STVG}$), which seeks to adaptively generate object queries via exploring target-specific cues from the given video-text pair, for improving STVG. The key lies in two simple yet effective modules, comprising text-guided temporal sampling (TTS) and attribute-aware spatial activation (ASA), working in a cascade. The former focuses on selecting target-relevant temporal cues from a video utilizing holistic text information, while the latter aims at further exploiting the fine-grained visual attribute information of the object from previous target-aware temporal cues, which is applied for object query initialization. Compared to existing methods leveraging zero-initialized queries, object queries in our TA-STVG, directly generated from a given video-text pair, naturally carry target-specific cues, making them adaptive and better interact with multimodal features for learning more discriminative information to improve STVG. In our experiments on three benchmarks, including HCSTVG-v1/-v2 and VidSTG, TA-STVG achieves state-of-the-art performance and significantly outperforms the baseline, validating its efficacy. Moreover, TTS and ASA are designed for general purpose. When applied to existing methods such as TubeDETR and STCAT, we show substantial performance gains, verifying its generality. Code is released at https://github.com/HengLan/TA-STVG.",
      "keywords": [
        "Spatio-Temporal Video Grounding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YcbE2K3i2E",
      "title": "SaTran: An efficient Transformer exploiting Spatiotemporal Redundancies for Satellite Image Time Series Representation Learning",
      "abstract": "Earth observation applications like crop yield prediction, solar energy prediction, land cover classification, etc., need large size Satellite Image Time Series (SITS) leading to huge computational requirements. A couple of BERT-based models exist which work at pixel level unable to exploit spatial correlation among pixels and also require ground truth at pixel granularity during fine-tuning, rendering them infeasible for prediction tasks. The  models based on Vision Transformer factorize spatial and time dimensions and first process images and then time series of image embeddings. However, in many cases, SITS require simultaneous analysis of both dimensions. We present a transformer, SaTran, which focuses on non-redundant patch tubes to overcome the limitations listed above. Transformers developed for RGB videos are found lacking when applied to SITS data characterized by the presence of patches with spatiotemporal redundancy persisting throughout the time series. SITS data also has patches where temporal redundancy lasts only for a few timestamps. The salient features of SaTran include: 1) an automatic patch tube selection mechanism which ignores spatiotemporally redundant patches; 2) exploitation of spatial correlation between pixels by the processing of patch tubes and handling of their temporal redundancy using tube masking; 3) two-fold handling of redundancy and distributed application of VideoMAE enables space and time efficient processing of large size SITS; and 4) learning end task agnostic representation of entire time series. Extensive experimentation shows that SaTran outperforms competing models and exhibit state-of-the-art performance for various earth observation applications. The code is available on (.. will be given after acceptance..).",
      "keywords": [
        "Satellite image time series analytics",
        "Transformer",
        "Earth observation applications",
        "Spatiotemporal redundancy",
        "Representation learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "EqcLAU6gyU",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "c4w1TqcSi0",
      "title": "Optima: Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System",
      "abstract": "Large Language Model (LLM) based multi-agent systems (MAS) show remarkable potential in collaborative problem-solving, yet they still face critical challenges: low communication efficiency, poor scalability, and a lack of effective parameter-updating optimization methods for multi-agent collaboration. We present Optima, a novel framework that addresses these issues by significantly enhancing both communication efficiency and task effectiveness in LLM-based MAS through LLM training. At its core, Optima employs an iterative generate, rank, select, and train paradigm, incorporating a reward function that balances task performance, token efficiency, and communication readability. We explore various RL algorithms, including Supervised Fine-Tuning, Direct Preference Optimization, and their hybrid approaches, providing insights into their effectiveness-efficiency trade-offs for iterative LLM-based MAS training. Additionally, we integrate Monte Carlo Tree Search-inspired techniques for DPO data generation, conceptualizing conversation turns as tree nodes to explore diverse interaction trajectories. We evaluate Optima on common multi-agent tasks, including information-asymmetric question answering and complex reasoning. Our method demonstrates consistent and substantial improvements over single-agent baselines and vanilla MAS based on Llama 3 8B, achieving up to 2.8x performance gain with less than 10\\% tokens on tasks requiring heavy multi-agent information exchange. Moreover, Optima's efficiency gains open new possibilities for leveraging inference-compute more effectively, potentially leading to improved inference-time scaling laws. By addressing fundamental challenges in multi-agent collaboration and providing a novel optimization framework, Optima shows the potential towards scalable, efficient, and effective LLM-based MAS.",
      "keywords": [
        "llm agent",
        "multi-agent",
        "inference scaling law"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "st77ShxP1K",
      "title": "Do as We Do, Not as You Think: the Conformity of Large Language Models",
      "abstract": "Recent advancements in large language models (LLMs) revolutionize the field of intelligent agents, enabling collaborative multi-agent systems capable of tackling complex problems across various domains. However, the potential of conformity within these systems, analogous to phenomena like conformity bias and group-think in human group dynamics, remains largely unexplored, raising concerns about their collective problem-solving capabilities and possible ethical implications. This paper presents a comprehensive study on conformity in LLM-driven multi-agent systems, focusing on three aspects: the existence of conformity, the factors influencing conformity, and potential mitigation strategies. In particular, we introduce BenchForm, a new conformity-oriented benchmark, featuring reasoning-intensive tasks and five distinct interaction protocols designed to probe LLMs’ behavior in collaborative scenarios. Several representative LLMs are evaluated on BenchForm, using metrics such as conformity rate and independence rate to quantify conformity’s impact. Our analysis delves into factors influencing conformity, including interaction time and majority size, and examines how the subject agent rationalize its conforming behavior. Furthermore, we explore two strategies to mitigate conformity effects, i.e., developing enhanced persona and implementing a reflection mechanism. Several interesting findings regarding LLMs’ conformity are derived from empirical results and case studies. We hope that these insights can pave the way for more robust and ethically-aligned collaborative AI systems. Our benchmark and code are available at BenchForm.",
      "keywords": [
        "Large Language Models",
        "Conformity",
        "Multi-agent System"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "reZKq6hjOZ",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DJSZGGZYVi",
      "title": "Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think",
      "abstract": "Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learning these representations. Moreover, training can be made easier by incorporating high-quality external visual representations, rather than relying solely on the diffusion models to learn them independently. We study this by introducing a straightforward regularization called REPresentation Alignment (REPA), which aligns the projections of noisy input hidden states in denoising networks with clean image representations obtained from external, pretrained visual encoders. The results are striking: our simple strategy yields significant improvements in both training efficiency and generation quality when applied to popular diffusion and flow-based transformers, such as DiTs and SiTs. For instance, our method can speed up SiT training by over 17.5$\\times$, matching the performance (without classifier-free guidance) of a SiT-XL model trained for 7M steps in less than 400K steps. In terms of final generation quality, our approach achieves state-of-the-art results of FID=1.42 using classifier-free guidance with the guidance interval.",
      "keywords": [
        "Diffusion models",
        "Representation learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vNZIePda08",
      "title": "Sparse-to-Sparse Training of Diffusion Models",
      "abstract": "Diffusion models (DMs) are a powerful type of generative models that have achieved state-of-the-art results in various image synthesis tasks and have shown  potential in other domains, such as natural language processing and temporal data modeling. Despite their stable training dynamics and ability to produce diverse high-quality samples, DMs are notorious for requiring significant computational resources, both in the training and inference stages. Previous work has focused mostly on increasing the efficiency of model inference. This paper introduces, for the first time, the paradigm of sparse-to-sparse training to DMs, with the aim of improving both training and inference efficiency. We focus on unconditional generation and train sparse DMs from scratch (Latent Diffusion and ChiroDiff) on six datasets using three different methods (Static-DM, RigL-DM, and MagRan-DM) to study the effect of sparsity in model performance. Our experiments show that sparse DMs are able to match and sometimes outperform their Dense counterparts, while substantially reducing the number of trainable parameters and FLOPs. We also identify safe and effective values to perform sparse-to-sparse training of DMs.",
      "keywords": [
        "Diffusion Models",
        "Sparse-to-Sparse Training",
        "Static Sparse Training",
        "Dynamic Sparse Training"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "peX9zpWgg4",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "WWymYrA48K",
      "title": "Test Time Learning for Time Series Forecasting",
      "abstract": "We propose the use of Test-Time Training (TTT) modules in a cascade architecture to enhance performance in long-term time series forecasting. Through extensive experiments on standard benchmark datasets, we demonstrate that TTT modules consistently outperform state-of-the-art models, including Mamba-based TimeMachine, particularly in scenarios involving extended sequence and prediction lengths. Our results show significant improvements, especially on larger datasets such as Electricity, Traffic, and Weather, underscoring the effectiveness of TTT in capturing long-range dependencies. Additionally, we explore various convolutional architectures within the TTT framework, showing that convolutional blocks as hidden layer architectures can achieve competitive results.",
      "keywords": [
        "Time Series Forecasting",
        "Test-Time Training",
        "Mamba",
        "Expressive Hidden States",
        "Modern CNN"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xByvdb3DCm",
      "title": "When Selection Meets Intervention: Additional Complexities in Causal Discovery",
      "abstract": "We address the common yet often-overlooked selection bias in interventional studies, where subjects are selectively enrolled into experiments. For instance, participants in a drug trial are usually patients of the relevant disease; A/B tests on mobile applications target existing users only, and gene perturbation studies typically focus on specific cell types, such as cancer cells. Ignoring this bias leads to incorrect causal discovery results. Even when recognized, the existing paradigm for interventional causal discovery still fails to address it. This is because subtle differences in _when_ and _where_ interventions happen can lead to significantly different statistical patterns. We capture this dynamic by introducing a graphical model that explicitly accounts for both the observed world (where interventions are applied) and the counterfactual world (where selection occurs while interventions have not been applied). We characterize the Markov property of the model, and propose a provably sound algorithm to identify causal relations as well as selection mechanisms up to the equivalence class, from data with soft interventions and unknown targets. Through synthetic and real-world experiments, we demonstrate that our algorithm effectively identifies true causal relations despite the presence of selection bias.",
      "keywords": [
        "causal discovery",
        "selection bias",
        "experiments",
        "interventions"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "cJd1BgZ9CS",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "HD6bWcj87Y",
      "title": "Data Shapley in One Training Run",
      "abstract": "Data Shapley offers a principled framework for attributing the contribution of data within machine learning contexts. However, the traditional notion of Data Shapley requires re-training models on various data subsets, which becomes computationally infeasible for large-scale models. Additionally, this retraining-based definition cannot evaluate the contribution of data for a specific model training run, which may often be of interest in practice. This paper introduces a novel concept, In-Run Data Shapley, which eliminates the need for model retraining and is specifically designed for assessing data contribution for a particular model of interest. In-Run Data Shapley calculates the Shapley value for each gradient update iteration and accumulates these values throughout the training process. We present several techniques that allow the efficient scaling of In-Run Data Shapley to the size of foundation models. In its most optimized implementation, our method adds negligible runtime overhead compared to standard model training. This dramatic efficiency improvement makes it possible to perform data attribution for the foundation model pretraining stage. We present several case studies that offer fresh insights into pretraining data's contribution and discuss their implications for copyright in generative AI and pretraining data curation.",
      "keywords": [
        "Shapley value",
        "data valuation."
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "NI8AUSAc4i",
      "title": "LoRC: Low-Rank Compression for LLMs KV Cache with a Progressive Compression Strategy",
      "abstract": "The Key-Value (KV) cache is a crucial component in serving transformer-based autoregressive large language models (LLMs), enabling faster inference by storing previously computed KV vectors. However, its memory consumption scales linearly with sequence length and batch size, posing a significant bottleneck in LLM deployment. Existing approaches to mitigate this issue include: (1) efficient attention variants integrated in upcycling stages, which requires extensive parameter tuning thus unsuitable to pre-trained LLMs; (2) KV cache compression at test time, primarily through token eviction policies, which often overlook inter-layer dependencies and can be task-specific.\n\nThis paper introduces an orthogonal approach to KV cache compression. We propose a low-rank approximation of  KV weight matrices, allowing for plug-in integration with existing transformer-based LLMs without model retraining. To effectively compress KV cache at the weight level, we adjust for layerwise sensitivity and introduce a progressive compression strategy, which is supported by our theoretical analysis on how compression errors accumulate in deep networks. Our method is designed to function without model tuning in upcycling stages or task-specific profiling in test stages. Extensive experiments with LLaMA models ranging from 8B to 70B parameters across various tasks show that our approach significantly reduces the GPU memory footprint while maintaining performance.",
      "keywords": [
        "KV Cache Compression",
        "Progressive Compression Strategy"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "J2Jyp1SZ0n",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8QTpYC4smR",
      "title": "Systematic Review of Large Language Models: Applications, Limitations, Practical Usages and Future Directions",
      "abstract": "Large Language Models have revolutionized natural language processing with their remarkable ability to understand and generate human-like text. This review explores the various applications of large language models, highlighting their versatility across different domains. The paper begins with an introduction to LLMs, followed by an overview of their types and a detailed literature review. We then examine their limitations before delving into specific applications such as text generation, translation, summarization, and more. Finally, we discuss future directions for research and development, concluding with a summary of key findings and the potential impact of large language models on various industries.",
      "keywords": [
        "Large Language Models",
        "Systematic Review"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "f4gF6AIHRy",
      "title": "Combatting Dimensional Collapse in LLM Pre-Training Data via Submodular File Selection",
      "abstract": "Selecting high-quality pre-training data for large language models (LLMs) is crucial for enhancing their overall performance under limited computation budget, improving both training and sample efficiency. Recent advancements in file selection primarily rely on using an existing or trained proxy model to assess the similarity of samples to a target domain, such as high quality sources BookCorpus and Wikipedia. However, upon revisiting these methods, the domain-similarity selection criteria demonstrates a diversity dilemma, i.e. dimensional collapse in the feature space, improving performance on the domain-related tasks but causing severe degradation on generic performance.To prevent collapse and enhance diversity, we propose a DiverSified File selection algorithm (DiSF), which selects the most decorrelated text files in the feature space. We approach this with a classical greedy algorithm to achieve more uniform eigenvalues in the feature covariance matrix of the selected texts, analyzing its approximation to the optimal solution under a formulation of $\\gamma$-weakly submodular optimization problem. Empirically, we establish a benchmark and conduct extensive experiments on the TinyLlama architecture with models from 120M to 1.1B parameters. Evaluating across nine tasks from the Harness framework, DiSF demonstrates a significant improvement on overall performance. Specifically, DiSF saves 98.5\\% of 590M training files in SlimPajama, outperforming the full-data pre-training within a 50B training budget, and achieving about 1.5x training efficiency and 5x data efficiency. Source code\nis available at: https://github.com/MediaBrain-SJTU/DiSF.git.",
      "keywords": [
        "file selection",
        "large language model",
        "pre-training",
        "submodular optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "8KQzoD5XAr",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "98ASXp6oPg",
      "title": "Self-Explained Keywords Empower Large Language Models for Code Generation",
      "abstract": "Large language models (LLMs) have achieved impressive performance in code generation. Despite the remarkable success, we observed that LLMs often misunderstand or overlook some problem-specific undertrained keywords during code generation, compromising the accuracy of the generated code. After explicitly explaining these undertrained keywords using well-trained terms in the prompt, LLMs are more likely to generate correct code implementation. Inspired by this observation, we propose a novel technique named SEK (Self-Explained Keywords), which empowers an LLM for better code generation by extracting and explaining the key terms in the problem description with the LLM itself. Comprehensive experiments across three benchmarks, i.e., HumanEval(+), MBPP(+), and APPS, with five representative LLMs, show that SEK can significantly improve LLMs in code generation, yielding substantial and consistent gains. For instance, SEK improves the Pass@1 of DeepSeek-Coder-V2-Instruct from 85.4% to 93.3% on the Humaneval benchmark. Further analysis confirms that SEK enables the LLMs to shift their attention from low-frequency keywords to their corresponding high-frequency counterparts.",
      "keywords": [
        "Large Language Model",
        "Code Generation",
        "Prompt Engineering"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YrycTjllL0",
      "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
      "abstract": "Task automation has been greatly empowered by the recent advances in Large Language Models (LLMs) via Python code, where the tasks range from software engineering development to general-purpose reasoning. While current benchmarks have shown that LLMs can solve tasks using programs like human developers, the majority of their evaluations are limited to short and self-contained algorithmic tasks or standalone function calls. Solving challenging and practical tasks requires the capability of utilizing **diverse function calls as tools** to efficiently implement functionalities like data analysis and web development. In addition, using multiple tools to solve a task needs compositional reasoning by accurately understanding **complex instructions**. Fulfilling both of these characteristics can pose a great challenge for LLMs. To assess how well LLMs can solve challenging and practical tasks via programs, we introduce BigCodeBench, a benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks. To evaluate LLMs rigorously, each task encompasses 5.6 test cases with an average branch coverage of 99%. In addition, we propose a natural-language-oriented variant of BigCodeBench, BigCodeBench-Instruct, that automatically transforms the original docstrings into short instructions containing only essential information. Our extensive evaluation of 60 LLMs shows that **LLMs are not yet capable of following complex instructions to use function calls precisely, with scores up to 60%, significantly lower than the human performance of 97%**. The results underscore the need for further advancements in this area.",
      "keywords": [
        "Code Generation",
        "Tool Use",
        "Instruction Following",
        "Benchmark"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "vM94dZiqx4",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "WypSbOf9S9",
      "title": "MOREL: Enhancing Adversarial Robustness through Multi-Objective Representation Learning",
      "abstract": "Extensive research has shown that deep neural networks (DNNs) are vulnerable to slight adversarial perturbations—small changes to the input data that appear insignificant but cause the model to produce drastically different outputs. In addition to augmenting training data with adversarial examples generated from a specific attack method, most of the current defense strategies necessitate modifying the original model architecture components to improve robustness or performing test-time data purification to handle adversarial attacks. In this work, we demonstrate that strong feature representation learning during training can significantly enhance the original model's robustness. We propose MOREL, a multi-objective feature representation learning approach, encouraging classification models to produce similar features for inputs within the same class, despite perturbations. Our training method involves an embedding space where cosine similarity loss and multi-positive contrastive loss are used to align natural and adversarial features from the model encoder and ensure tight clustering. Concurrently, the classifier is motivated to achieve accurate predictions. Through extensive experiments, we demonstrate that our approach significantly enhances the robustness of DNNs against white-box and black-box adversarial attacks, outperforming other methods that similarly require no architectural changes or test-time data purification.",
      "keywords": [
        "Adversarial robustness",
        "Representation learning",
        "Multi-objective optimization",
        "Deep neural networks"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "syThiTmWWm",
      "title": "Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates",
      "abstract": "Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and MT-Bench, have become popular for evaluating language models due to their cost-effectiveness and scalability compared to human evaluation. Achieving high win rates on these benchmarks can significantly boost the promotional impact of newly released language models. This promotional benefit may motivate tricks, such as manipulating model output length or style to game win rates, even though several mechanisms have been developed to control length and disentangle style to reduce gameability. Nonetheless, we show that even a **\"null model\"** that always outputs a **constant** response (*irrelevant to input instructions*) can cheat automatic benchmarks and achieve top-ranked win rates: an $86.5\\\\%$ LC win rate on AlpacaEval 2.0; an $83.0$ score on Arena-Hard-Auto; and a $9.55$ score on MT-Bench. Moreover, the crafted cheating outputs are **transferable** because we assume that the instructions of these benchmarks (e.g., $805$ samples of AlpacaEval 2.0) are *private* and cannot be accessed. While our experiments are primarily proof-of-concept, an adversary could use LLMs to generate more imperceptible cheating responses, unethically benefiting from high win rates and promotional impact. Our findings call for the development of anti-cheating mechanisms for reliable automatic benchmarks. The code is available at https://github.com/sail-sg/Cheating-LLM-Benchmarks.",
      "keywords": [
        "Large Language Models",
        "Cheating",
        "Automatic LLM Benchmarks"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "bVTM2QKYuA",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "bVTM2QKYuA",
      "title": "The Geometry of Categorical and Hierarchical Concepts in Large Language Models",
      "abstract": "The linear representation hypothesis is the informal idea that semantic concepts are encoded as linear directions in the representation spaces of large language models (LLMs). Previous work has shown how to make this notion precise for representing binary concepts that have natural contrasts (e.g., {male, female}) as _directions_ in representation space. However, many natural concepts do not have natural contrasts (e.g., whether the output is about an animal). In this work, we show how to extend the formalization of the linear representation hypothesis to represent features (e.g., is_animal) as _vectors_. This allows us to immediately formalize the representation of categorical concepts as polytopes in the representation space. Further, we use the formalization to prove a relationship between the hierarchical structure of concepts and the geometry of their representations. We validate these theoretical results on the Gemma and LLaMA-3 large language models, estimating representations for 900+ hierarchically related concepts using data from WordNet.",
      "keywords": [
        "categorical concepts",
        "hierarchical concepts",
        "linear representation hypothesis",
        "causal inner product",
        "interpretability"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "j0sq9r3HFv",
      "title": "Automated Parameter Extraction for Biologically Realistic Neural Networks: An Initial Exploration with Large Language Models",
      "abstract": "In computational neuroscience, extracting parameters for constructing biologically realistic neural models is a resource-intensive task that requires continuous updates as new research emerges. This paper explores utilizing large language models (LLMs) in automating parameter extraction from scientific literature for biologically realistic neural models. We utilized open-source LLMs via Ollama to construct KGs, capturing parameters such as neuron morphology, synapse dynamics, and receptor properties. SNNBuilder \\cite{Gutierrez2022}, a framework for building spiking neural network (SNN) models, serves as a key validation example for our framework. However, the methodology we outline here can extend beyond SNNs and could applied to systematic modelling of the brain.By experimenting with different prompting strategies—general extraction, in-context hints, and masked prompting—we evaluated the ability of LLMs to autonomously extract relevant data and organize it within an expert-base or data-driven ontology, as well as to infer missing information for neural model construction. Additionally, we implemented retrieval-augmented generation (RAG) via LangChain to further improve the accuracy of parameter extraction through leveraging external knowledge sources. Analysis of the the generated KGs, demonstrated that LLMs, when guided by targeted prompts, can enhance the data-to-model process, paving the way for more efficient parameter extraction and model construction in computational neuroscience.",
      "keywords": [
        "Large Language Models",
        "Knowledge Graphs",
        "Computational neuroscience",
        "Neural model construction"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "NltQraRnbW",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FwW3jqchtY",
      "title": "Identifying neural dynamics using interventional state space models",
      "abstract": "Neural circuits produce signals that are complex and nonlinear. To facilitate the understanding of neural dynamics, a popular approach is to fit state space models (SSM) to data and analyze the dynamics of the low-dimensional latent variables. Despite the power of SSM in explaining neural circuit dynamics, it has been shown that these models merely capture statistical associations in the data and cannot be causally interpreted. Therefore, an important research problem is to build models that can predict neural dynamics under causal manipulations. Here, we propose interventional state space models (iSSM), a class of causal models that can predict neural responses to novel perturbations. We draw on recent advances in causal dynamical systems and present theoretical results for the identifiability of iSSM. In simulations of the motor cortex, we show that iSSM can recover the true latents and the underlying dynamics. In addition, we illustrate two applications of iSSM in biological datasets. First, we apply iSSM to a dataset of calcium recordings from ALM neurons in mice during photostimulation and uncover dynamical mechanisms underlying short-term memory. Second, we apply iSSM to a dataset of electrophysiological recordings from macaque dlPFC recordings during micro-stimulation and show that it successfully predicts responses to unseen perturbations.",
      "keywords": [
        "Causal dynamical systems",
        "interventions",
        "state space models",
        "photostimulation",
        "micro-stimulation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "LyJi5ugyJx",
      "title": "Simplifying, Stabilizing and Scaling Continuous-time Consistency Models",
      "abstract": "Consistency models (CMs) are a powerful class of diffusion-based generative models optimized for fast sampling. Most existing CMs are trained using discretized timesteps, which introduce additional hyperparameters and are prone to discretization errors. While continuous-time formulations can mitigate these issues, their success has been limited by training instability. To address this, we propose a simplified theoretical framework that unifies previous parameterizations of diffusion models and CMs, identifying the root causes of instability. Based on this analysis, we introduce key improvements in diffusion process parameterization, network architecture, and training objectives. These changes enable us to train continuous-time CMs at an unprecedented scale, reaching 1.5B parameters on ImageNet 512×512. Our proposed training algorithm, using only two sampling steps, achieves FID scores of 2.06 on CIFAR-10, 1.48 on ImageNet 64×64, and 1.88 on ImageNet 512×512, narrowing the gap in FID scores with the best existing diffusion models to within 10\\%.",
      "keywords": [
        "continuous-time consistency models",
        "diffusion models",
        "fast sampling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "E2PFv7ad3p",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "SPS6HzVzyt",
      "title": "Context-Parametric Inversion: Why Instruction Finetuning May Not Actually Improve Context Reliance",
      "abstract": "Large Language Model's are instruction-finetuned to enhance their ability to follow user instructions and better comprehend input context. Still, they often struggle to follow the input context, especially when it contradicts model's parametric knowledge. This manifests as various failures, such as hallucinations where a model inserts outdated or unwarranted facts into its response. In this work, we observe an intriguing phenomenon: the context reliance of the model decreases as instruction finetuning progresses, $\\textit{despite an initial expected increase}$. We call this phenomenon as the $\\textbf{context-parametric inversion}$. This is surprising, as one would expect instruction tuning to improve the model's ability to follow input instructions.  We observe this behavior on multiple general purpose instruction tuning datasets such as TULU, Alpaca and Ultrachat, across multiple model families like Llama, Mistral and Pythia.  We perform various controlled studies to eliminate some simple hypothesis for this observed behavior and isolate what datapoints cause this counter-intuitive behavior. We then analyze the phenomenon theoretically, to explain why context reliance varies across the trajectory of finetuning. \nWe tie the observed context-parametric inversion to the properties of the finetuning data, which provides us with some potential mitigation strategies that provide limited but insightful gains.",
      "keywords": [
        "Instruction finetuning",
        "context-vs-parametric reliance"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "qb2QRoE4W3",
      "title": "LLM-Cite: Cheap Fact Verification with Attribution via URL Generation",
      "abstract": "Hallucinations are one of the main issues with Large Language Models (LLMs). This has led to increased interest in automated ways to verify the factuality of LLMs' responses. Existing methods either rely on: (a) search over a knowledge base (KB), which is costly especially if the KB must be updated frequently to keep up with fresh content, (b) LLM's parametric knowledge to fact-check claims, which is cheaper but does not give attribution and is limited to verifying claims related to knowledge acquired during pretraining. In this work, we present LLM-Cite, a cheap and easy to implement method that does not rely on any external search system while still providing attribution and the ability to verify fresh claims. Our key insight is to leverage an LLM to directly generate potential citation URLs for a given claim, and then use entailment checks to verify the claim against content of the URLs (which are fetched on-the-fly). We benchmark LLM-Cite on three datasets containing fresh and non-fresh claims generated by humans and models. We show that LLM-Cite performs comparable or better than existing methods on all categories of claims --- importantly, without sacrificing attribution, or requiring costly external search --- overall LLM-Cite is more than 45x cheaper than a Google Search based approach.",
      "keywords": [
        "Fact Verification",
        "Attribution",
        "Citation",
        "Factuality"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "uy31tqVuNo",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "RuP17cJtZo",
      "title": "Generator Matching: Generative modeling with arbitrary Markov processes",
      "abstract": "We introduce Generator Matching, a modality-agnostic framework for generative modeling using arbitrary Markov processes. Generators characterize the infinitesimal evolution of a Markov process, which we leverage for generative modeling in a similar vein to flow matching: we construct conditional generators which generate single data points, then learn to approximate the marginal generator which generates the full data distribution. We show that Generator Matching unifies various generative modeling methods, including diffusion models, flow matching and discrete diffusion models. Furthermore, it expands the design space to new and unexplored Markov processes such as jump processes. Finally, Generator Matching enables the construction of superpositions of Markov generative models and enables the construction of multimodal models in a rigorous manner. We empirically validate our method on image and multimodal generation, e.g. showing that superposition with a jump process improves performance.",
      "keywords": [
        "Flow matching",
        "Markov process",
        "Diffusion model",
        "Generative Modeling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "VAvZ4oinpa",
      "title": "Video Generation with Learned Action Prior",
      "abstract": "Long-term stochastic video generation remains challenging, especially with moving cameras. This scenario introduces complex interactions between camera movement and observed pixels, resulting in intricate spatio-temporal dynamics and partial observability issues. Current approaches often focus on pixel-level image reconstruction, neglecting explicit modeling of camera motion dynamics. Our proposed solution incorporates camera motion or action as an extended part of the observed image state, employing a multi-modal learning framework to simultaneously model both image and action. We introduce three models: (i) Video Generation with Learning Action Prior (VG-LeAP) that treats the image-action pair as an augmented state generated from a single latent stochastic process and uses variational inference to learn the image-action latent prior; (ii) Causal-LeAP, which establishes a causal relationship between action and the observed image frame, and learns a seperate action prior, conditioned on the observed image states along with the image prior; and (iii) RAFI, which integrates the augmented image-action state concept with a conditional flow matching framework, demonstrating that this action-conditioned image generation concept can be extended to other transformer-based architectures. Through comprehensive empirical studies on robotic video dataset, RoAM, we highlight the importance of multi-modal training in addressing partially observable video generation problems.",
      "keywords": [
        "Stochastic Video Generation",
        "Variational Inference"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "jfwe9qNqRi",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "OaORjvWelu",
      "title": "Cost-Efficient Multi-Fidelity Alignment for LLMs",
      "abstract": "Alignment is a critical step in large language model (LLM) post-training. It typically requires human annotations to align the model's output to human preferences, which is prohibitively expensive. This paper proposes a novel approach to reduce the alignment cost.\n Specifically, we consider multiple levels of alignment with different qualities and response-generating costs, which we refer to as multi-fidelity alignment. We develop a new approach to incorporating the varying levels of response quality to train a language model, aiming to reduce the cost of response collection for alignment while maintaining the performance of the language model. We provide theoretical insights and empirical results to support the effectiveness of the proposed multi-fidelity alignment approach. Lastly, we conduct experiments to corroborate the effectiveness of the proposed approach by comparing its performance with the vanilla alignment methods.",
      "keywords": [
        "Multi-Fidelity",
        "Alignment",
        "LLM"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vo9t20wsmd",
      "title": "Faster Cascades via Speculative Decoding",
      "abstract": "Cascades and speculative decoding are two common approaches to improving language models' inference efficiency.  Both approaches interleave two models, but via fundamentally distinct mechanisms: deferral rule that invokes the larger model only for “hard” inputs, while  speculative decoding uses speculative execution to primarily invoke the larger model in parallel scoring mode. These mechanisms offer different benefits: empirically, cascades offer compelling cost-quality trade-offs, often even outperforming the large model; speculative cascades offer impressive speed-ups, while guaranteeing quality-neutrality. In this paper, we leverage the best of both these approaches by designing new speculative cascading techniques that implement their deferral rule through speculative execution. We characterize the optimal deferral rule for our speculative cascades, and employ a plug-in approximation to the optimal rule.  Experiments with Gemma and T5 models on a range of language benchmarks show that our approach yields better cost quality trade-offs than cascading and speculative decoding baselines.",
      "keywords": [
        "Cascades",
        "Speculative Decoding",
        "Speculative execution",
        "LLM",
        "Inference",
        "Adaptive Inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "nWT6LxbuGi",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DJSZGGZYVi",
      "title": "Representation Alignment for Generation: Training Diffusion Transformers Is Easier Than You Think",
      "abstract": "Recent studies have shown that the denoising process in (generative) diffusion models can induce meaningful (discriminative) representations inside the model, though the quality of these representations still lags behind those learned through recent self-supervised learning methods. We argue that one main bottleneck in training large-scale diffusion models for generation lies in effectively learning these representations. Moreover, training can be made easier by incorporating high-quality external visual representations, rather than relying solely on the diffusion models to learn them independently. We study this by introducing a straightforward regularization called REPresentation Alignment (REPA), which aligns the projections of noisy input hidden states in denoising networks with clean image representations obtained from external, pretrained visual encoders. The results are striking: our simple strategy yields significant improvements in both training efficiency and generation quality when applied to popular diffusion and flow-based transformers, such as DiTs and SiTs. For instance, our method can speed up SiT training by over 17.5$\\times$, matching the performance (without classifier-free guidance) of a SiT-XL model trained for 7M steps in less than 400K steps. In terms of final generation quality, our approach achieves state-of-the-art results of FID=1.42 using classifier-free guidance with the guidance interval.",
      "keywords": [
        "Diffusion models",
        "Representation learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vNZIePda08",
      "title": "Sparse-to-Sparse Training of Diffusion Models",
      "abstract": "Diffusion models (DMs) are a powerful type of generative models that have achieved state-of-the-art results in various image synthesis tasks and have shown  potential in other domains, such as natural language processing and temporal data modeling. Despite their stable training dynamics and ability to produce diverse high-quality samples, DMs are notorious for requiring significant computational resources, both in the training and inference stages. Previous work has focused mostly on increasing the efficiency of model inference. This paper introduces, for the first time, the paradigm of sparse-to-sparse training to DMs, with the aim of improving both training and inference efficiency. We focus on unconditional generation and train sparse DMs from scratch (Latent Diffusion and ChiroDiff) on six datasets using three different methods (Static-DM, RigL-DM, and MagRan-DM) to study the effect of sparsity in model performance. Our experiments show that sparse DMs are able to match and sometimes outperform their Dense counterparts, while substantially reducing the number of trainable parameters and FLOPs. We also identify safe and effective values to perform sparse-to-sparse training of DMs.",
      "keywords": [
        "Diffusion Models",
        "Sparse-to-Sparse Training",
        "Static Sparse Training",
        "Dynamic Sparse Training"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "HH4KWP8RP5",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "5IkDAfabuo",
      "title": "Prioritized Generative Replay",
      "abstract": "Sample-efficient online reinforcement learning often uses replay buffers to store experience for reuse when updating the value function. \nHowever, uniform replay is inefficient, since certain classes of transitions can be more relevant to learning. While prioritization of more useful samples is helpful, this strategy can also lead to overfitting, as useful samples are likely to be more rare. In this work, we instead propose a prioritized, parametric version of an agent's memory, using generative models to capture online experience. This paradigm enables (1) densification of past experience, with new generations that benefit from the generative model's generalization capacity and (2) guidance via a family of \"relevance functions\" that push these generations towards more useful parts of an agent's acquired history. We show this recipe can be instantiated using conditional diffusion models and simple relevance functions such as curiosity- or value-based metrics. Our approach consistently improves performance and sample efficiency in both state- and pixel-based domains. We expose the mechanisms underlying these gains, showing how guidance promotes diversity in our generated transitions and reduces overfitting. We also showcase how our approach can train policies with even higher update-to-data ratios than before, opening up avenues to better scale online RL agents. Project page available at: https://pgenreplay.github.io",
      "keywords": [
        "online learning",
        "model-based reinforcement learning",
        "generative modeling",
        "synthetic data",
        "continual learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "gL1cNK2UEW",
      "title": "DSMentor: Enhancing Data Science Agents with Curriculum Learning and Online Knowledge Accumulation",
      "abstract": "Large language model (LLM) agents have shown promising performance in generating code for solving complex data science problems. Recent studies primarily focus on enhancing in-context learning through improved search, sampling, and planning techniques, while overlooking the importance of the order in which problems are tackled during inference. In this work, we develop a novel inference-time optimization framework, referred to as DSMentor, which leverages curriculum learning---a strategy that introduces simpler task first and progressively moves to more complex ones as the learner improves---to enhance LLM agent performance in challenging data science tasks. Our mentor-guided framework organizes data science tasks in order of increasing difficulty and incorporates a growing long-term memory to retain prior experiences, guiding the agent's learning progression and enabling more effective utilization of accumulated knowledge. We evaluate DSMentor through extensive experiments on DSEval and QRData benchmarks. Experiments show that DSMentor using Claude-3.5-Sonnet improves the pass rate by up to 5.2% on DSEval and QRData compared to baseline agents. Furthermore, DSMentor demonstrates stronger causal reasoning ability, improving the pass rate by 8.8% on the causality problems compared to GPT-4 using Program-of-Thoughts prompts. Our work underscores the importance of developing effective strategies for accumulating and utilizing knowledge during inference, mirroring the human learning process and opening new avenues for improving LLM performance through curriculum-based inference optimization.",
      "keywords": [
        "curriculum learning",
        "data science agent",
        "long-term memory",
        "online data retrieval"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "T4LtGj7us1",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8QTpYC4smR",
      "title": "Systematic Review of Large Language Models: Applications, Limitations, Practical Usages and Future Directions",
      "abstract": "Large Language Models have revolutionized natural language processing with their remarkable ability to understand and generate human-like text. This review explores the various applications of large language models, highlighting their versatility across different domains. The paper begins with an introduction to LLMs, followed by an overview of their types and a detailed literature review. We then examine their limitations before delving into specific applications such as text generation, translation, summarization, and more. Finally, we discuss future directions for research and development, concluding with a summary of key findings and the potential impact of large language models on various industries.",
      "keywords": [
        "Large Language Models",
        "Systematic Review"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "kxnoqaisCT",
      "title": "Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents",
      "abstract": "Multimodal large language models (MLLMs) are transforming the capabilities of graphical user interface (GUI) agents, facilitating their transition from controlled simulations to complex, real-world applications across various platforms. However, the effectiveness of these agents hinges on the robustness of their grounding capability. Current GUI agents predominantly utilize text-based representations such as HTML or accessibility trees, which, despite their utility, often introduce noise, incompleteness, and increased computational overhead. In this paper, we advocate a human-like embodiment for GUI agents that perceive the environment entirely visually and directly perform pixel-level operations on the GUI. The key is visual grounding models that can accurately map diverse referring expressions of GUI elements to their coordinates on the GUI across different platforms. We show that a simple recipe, which includes web-based synthetic data and slight adaptation of the LLaVA architecture, is surprisingly effective for training such visual grounding models. We collect the largest dataset for GUI visual grounding so far, containing 10M GUI elements and their referring expressions over 1.3M screenshots, and use it to train UGround, a strong universal visual grounding model for GUI agents. Empirical results on six benchmarks spanning three categories (grounding, offline agent, and online agent) show that 1) UGround substantially outperforms existing visual grounding models for GUI agents, by up to 20\\% absolute, and 2) agents with UGround outperform state-of-the-art agents, despite the fact that existing agents use additional text-based input while ours only uses visual perception. These results provide strong support for the feasibility and promises of GUI agents that navigate the digital world as humans do.",
      "keywords": [
        "GUI Agents",
        "Visual Grounding",
        "Multimodal Large Language Models",
        "GUI Grounding",
        "Large Language Model"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "uvTea5Rfek",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "cGks3s79hW",
      "title": "High-dimensional neuronal activity from low-dimensional latent dynamics: a solvable model",
      "abstract": "Computation in recurrent networks of neurons has been hypothesized to occur at the level of low-dimensional latent dynamics, both in artificial systems and in the brain. This hypothesis seems at odds with evidence from large-scale neuronal recordings in mice showing that neuronal population activity is high-dimensional. To demonstrate that low-dimensional latent dynamics and high-dimensional activity can be two sides of the same coin, we present an analytically solvable recurrent neural network (RNN) model whose dynamics can be exactly reduced to a low-dimensional dynamical system, but generates an activity manifold that has a high linear embedding dimension. This raises the question: Do low-dimensional latents explain the high-dimensional activity observed in mouse visual cortex? Spectral theory tells us that the covariance eigenspectrum alone does not allow us to recover the dimensionality of the latents, which can be low or high, when neurons are nonlinear. To address this indeterminacy, we develop Neural Cross-Encoder (NCE), an interpretable, nonlinear latent variable modeling method for neuronal recordings, and find that high-dimensional neuronal responses to drifting gratings and spontaneous activity in visual cortex can be reduced to low-dimensional latents, while the responses to natural images cannot. We conclude that the high-dimensional activity measured in certain conditions, such as in the absence of a stimulus, is explained by low-dimensional latents that are nonlinearly processed by individual neurons.",
      "keywords": [
        "recurrent neural networks",
        "neuronal recordings",
        "visual cortex",
        "latent variable models",
        "PCA",
        "eigenvalue decay",
        "mean-field limit"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "jCbbI78q4k",
      "title": "Zero-Shot Adaptation of Behavioral Foundation Models to Unseen Dynamics",
      "abstract": "Behavioral Foundation Models (BFMs) proved successful in producing policies for arbitrary tasks in a zero-shot manner, requiring no test-time training or task-specific fine-tuning. Among the most promising BFMs are the ones that estimate the successor measure learned in an unsupervised way from task-agnostic offline data. However, these methods fail to react to changes in the dynamics, making them inefficient under partial observability or when the transition function changes. This hinders the applicability of BFMs in a real-world setting, e.g., in robotics, where the dynamics can unexpectedly change at test time. In this work, we demonstrate that Forward–Backward (FB) representation, one of the methods from the BFM family, cannot distinguish between distinct dynamics, leading to an interference among the latent directions, which parametrize different policies. To address this, we propose a FB model with a transformer-based belief estimator, which greatly facilitates zero-shot adaptation. We also show that partitioning the policy encoding space into dynamics-specific clusters, aligned with the context-embedding directions, yields additional gain in performance. These traits allow our method to respond to the dynamics observed during training and to generalize to unseen ones. Empirically, in the changing dynamics setting, our approach achieves up to a 2x higher zero-shot returns compared to the baselines for both discrete and continuous tasks.",
      "keywords": [
        "zero-shot reinforcement learning",
        "unsupervised reinforcement learning",
        "successor measure"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "XmV7KRABBl",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "ImpizBSKcu",
      "title": "Dynamical Decoupling of Generalization and Overfitting in Large Two-Layer Networks",
      "abstract": "Understanding the inductive bias and generalization properties of large overparametrized machine learning models requires to characterize the dynamics of the training algorithm.  We study the learning dynamics of large two-layer neural networks via dynamical mean field theory, a well established technique of non-equilibrium statistical physics. We show that, for large network width $m$,\nand large number of samples per input dimension $n/d$, the training dynamics exhibits a separation of timescales which implies:\n$(i)$ The emergence of a slow time scale associated with the growth in Gaussian/Rademacher complexity of the network;\n$(ii)$ Inductive bias towards small complexity if the initialization has small enough complexity;\n$(iii)$ A dynamical decoupling between feature learning and overfitting regimes; $(iv)$ A non-monotone behavior of the test error, associated  `feature unlearning' regime at large times.",
      "keywords": [
        "Overfitting; feature learning; dynamical mean field theory; generalization;"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ok3iIeuQV4",
      "title": "TDFormer: Top-Down Attention-Controlled Spiking Transformer",
      "abstract": "Traditional spiking neural networks (SNNs) can be viewed as a combination of multiple subnetworks with each running for one time step, where the parameters are shared, and the membrane potential serves as the only information link between them. However, the implicit nature of the membrane potential limits its ability to effectively represent temporal information. As a result, each time step cannot fully leverage information from previous time steps, seriously limiting the model's performance. Inspired by the top-down mechanism in the brain, we introduce TDformer, a novel model with a top-down feedback structure that functions hierarchically and leverages high-order representations from earlier time steps to modulate the processing of low-order information at later stages. The feedback structure plays a role from two perspectives: 1) During forward propagation, our model increases the mutual information across time steps, indicating that richer temporal information is being transmitted and integrated in different time steps. 2) During backward propagation, we theoretically prove that the feedback structure alleviates the problem of vanishing gradients along the time dimension. We find that these mechanisms together significantly and consistently improve the model performance on multiple datasets. In particular, our model achieves state-of-the-art performance on ImageNet with an accuracy of 86.83\\%.",
      "keywords": [
        "Top-down mechanism",
        "Brain-inspired Computing",
        "Spiking Neural Networks",
        "Vision Transformers"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "3Wrv6Zay74",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "3Wrv6Zay74",
      "title": "Gradual Binary Search and Dimension Expansion : A general method for activation quantization in LLMs",
      "abstract": "Large language models (LLMs) have become pivotal in artificial intelligence, demonstrating strong capabilities in reasoning, understanding, and generating data. However, their deployment on edge devices is hindered by their substantial size, often reaching several billion parameters. Quantization is a widely used method to reduce memory usage and inference time, however LLMs present unique challenges due to the prevalence of outliers in their activations. In this work, we leverage the theoretical advantages of Hadamard matrices over random rotation matrices to push the boundaries of quantization in LLMs. We demonstrate that Hadamard matrices are more effective in reducing outliers, which are a significant obstacle in achieving low-bit quantization. Our method based on a gradual binary search enables 3-bit quantization for weights, activations, and key-value (KV) caches, resulting in a 40% increase in accuracy on common benchmarks compared to SoTA methods. We extend the use of rotation matrices to support non-power-of-2 embedding dimensions, similar to the Qwen architecture, by employing the Paley's algorithm. Our experimental results on multiple models family like Mistral, LLaMA, and Qwen demonstrate the effectiveness of our approach, outperforming existing methods and enabling practical 3-bit quantization.",
      "keywords": [
        "deep learning",
        "quantization",
        "LLM",
        "hadamard"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "KnqiC0znVF",
      "title": "Large Language Diffusion Models",
      "abstract": "The capabilities of large language models (LLMs) are widely regarded as relying on autoregressive models (ARMs). We challenge this notion by introducing *LLaDA*, a diffusion model trained from scratch under the pre-training and supervised fine-tuning (SFT) paradigm. LLaDA employs a forward data masking process and a reverse generation process, parameterized by a Transformer to predict masked tokens. It provides a principled generative approach for probabilistic inference by optimizing a likelihood lower bound. Across extensive benchmarks on general tasks, math, code, and so on, LLaDA demonstrates strong *scalability* and performs comparably to our self-constructed ARM baselines. Remarkably, LLaDA 8B is competitive with strong LLMs like LLaMA3 8B in *in-context learning* and, after SFT, exhibits impressive *instruction-following* abilities in case studies such as multi-turn dialogue. Moreover, LLaDA addresses the reversal curse, surpassing GPT-4o in a reversal poem completion task. Our findings show the promise of diffusion models for language modeling at scale and challenge the common assumption that core LLM capabilities discussed above inherently depend on ARMs. Project page and codes: \\url{https://ml-gsai.github.io/LLaDA-demo/}.",
      "keywords": [
        "diffusion language models",
        "large language models",
        "masked diffusion models",
        "discrete diffusion models",
        "diffusion models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "MeOTBs8BQV",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "MeOTBs8BQV",
      "title": "Sync or Sink: Bounds on Algorithmic Collective Action with Noise and Multiple Groups",
      "abstract": "Collective action against algorithmic systems, which enables groups to promote their own interests, is poised to grow. Hence, there will be growth in the size and the number of distinct collectives. Currently, there is no formal analysis of how coordination challenges within a collective can impact downstream outcomes, or how multiple collectives may affect each other's success. In this work, we aim to provide guarantees on the success of collective action in the presence of both coordination noise and multiple groups. Our insight is that data generated by either multiple collectives or by coordination noise can be viewed as originating from multiple data distributions. \nUsing this framing, we derive bounds on the success of collective action. We conduct experiments to study the effects of noise on collective action. We find that sufficiently high levels of noise can reduce the success of collective action. In certain scenarios, large noise can sink a collective success rate from $100$% to just under $60$%. We identify potential trade-offs between collective size and coordination noise; for example, a collective that is twice as big but with four times more noise experiencing worse outcomes than the smaller, more coordinated one. This work highlights the importance of understanding nuanced dynamics of strategic behavior in algorithmic systems.",
      "keywords": [
        "Algorithmic Collective Action",
        "Social Computing",
        "Data Campaigns"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WCRPgBpbcA",
      "title": "A multiscale analysis of mean-field transformers in the moderate interaction regime",
      "abstract": "In this paper, we study the evolution of tokens through the depth of encoder-only transformer models at inference time by modeling them as a system of particles interacting in a mean-field way and studying the corresponding dynamics. More specifically, we consider this problem in the moderate interaction regime, where the number $N$ of tokens is large and the inverse temperature parameter $\\beta$ of the model scales together with $N$. In this regime, the dynamics of the system displays a multiscale behavior: a fast phase, where the token empirical measure collapses on a low-dimensional space, an intermediate phase, where the measure further collapses into clusters, and a slow one, where such clusters sequentially merge into a single one. We provide a rigorous characterization of the limiting dynamics in each of these phases and prove convergence in the above mentioned limit, exemplifying our results with some simulations.",
      "keywords": [
        "mean-field limits",
        "moderate interaction",
        "mean-field transformers",
        "self-attention models",
        "clustering",
        "multiscale"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "kUzBGEuu7w",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "NM8Apk61NA",
      "title": "HyperET: Efficient Training in Hyperbolic Space for Multi-modal Large Language Models",
      "abstract": "Multi-modal large language models (MLLMs) have emerged as a transformative approach for aligning visual and textual understanding. They typically require extremely high computational resources (e.g., thousands of GPUs) for training to achieve cross-modal alignment at multi-granularity levels. We argue that a key source of this inefficiency lies in the vision encoders they widely equip with, e.g., CLIP and SAM, which lack the alignment with language at multi-granularity levels. To address this issue, in this paper, we leverage hyperbolic space, which inherently models hierarchical levels and thus provides a principled framework for bridging the granularity gap between visual and textual modalities at an arbitrary granularity level. Concretely, we propose an efficient training paradigm for MLLMs, dubbed as \\blg, which can optimize visual representations to align with their textual counterparts at an arbitrary granularity level through dynamic hyperbolic radius adjustment in hyperbolic space. \\alg employs learnable matrices with M\\\"{o}bius multiplication operations, implemented via three effective configurations: diagonal scaling matrices, block-diagonal matrices, and banded matrices, providing a flexible yet efficient parametrization strategy. Comprehensive experiments across multiple MLLM benchmarks demonstrate that \\alg consistently improves both existing pre-training and fine-tuning MLLMs clearly with less than 1\\% additional parameters. Code is available at \\url{https://github.com/godlin-sjtu/HyperET}.",
      "keywords": [
        "Efficient Training",
        "Multi-modal Large Language Models",
        "Granularity Levels",
        "Hyperbolic Space"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "hncKzAnyKQ",
      "title": "Double-Checker: Enhancing Reasoning of Slow-Thinking LLMs via Self-Critical Fine-Tuning",
      "abstract": "While slow-thinking large language models (LLMs) exhibit reflection-like reasoning, commonly referred to as the “aha moment”, their ability to generate informative critiques and refine prior solutions remains limited. In this paper, we introduce Double-Checker, a principled framework designed to enhance the reasoning capabilities of slow-thinking LLMs by fostering explicit self-criticism and iterative refinement of their previous solutions. By fine-tuning on a curated DC-1.7K dataset of 1,730 self-critical instances, Double-Checker empowers long-CoT LLMs to iteratively critique and refine their outputs during inference until they evaluate their solutions as correct under self-generated critiques. We validate the efficacy of Double-Checker across a comprehensive suite of reasoning benchmarks, demonstrating that iterative self-critique significantly enhances the reasoning capabilities of long-CoT LLMs. Notably, Double-Checker increases the pass@1 performance on challenging AIME benchmarks from 4.4\\% to 18.2\\% compared to the original long-CoT LLMs. These results highlight a promising direction for developing more trustworthy and effective LLMs capable of structured self-critique.",
      "keywords": [
        "Large Language Models",
        "Reasoning",
        "Long Chain-of-Thought",
        "Critique"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "9GsgCUJtic",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "RuP17cJtZo",
      "title": "Generator Matching: Generative modeling with arbitrary Markov processes",
      "abstract": "We introduce Generator Matching, a modality-agnostic framework for generative modeling using arbitrary Markov processes. Generators characterize the infinitesimal evolution of a Markov process, which we leverage for generative modeling in a similar vein to flow matching: we construct conditional generators which generate single data points, then learn to approximate the marginal generator which generates the full data distribution. We show that Generator Matching unifies various generative modeling methods, including diffusion models, flow matching and discrete diffusion models. Furthermore, it expands the design space to new and unexplored Markov processes such as jump processes. Finally, Generator Matching enables the construction of superpositions of Markov generative models and enables the construction of multimodal models in a rigorous manner. We empirically validate our method on image and multimodal generation, e.g. showing that superposition with a jump process improves performance.",
      "keywords": [
        "Flow matching",
        "Markov process",
        "Diffusion model",
        "Generative Modeling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "tL8dpJmECp",
      "title": "Improving Fairness and Mitigating MADness in Generative Models",
      "abstract": "Generative models unfairly penalize data belonging to minority classes, suffer from model autophagy disorder (MADness), and learn biased estimates of the underlying distribution parameters.  Our theoretical and empirical results show that training generative models with intentionally designed hypernetworks leads to models that 1) are more fair when generating datapoints belonging to minority classes 2) are more stable in a self-consumed (i.e., MAD) setting, and 3) learn parameters that are less statistically biased.  To further mitigate unfairness, MADness, and bias, we introduce a regularization term that penalizes discrepancies between a generative model’s estimated weights when trained on real data versus its own synthetic data.  To facilitate training existing deep generative models within our framework, we offer a scalable implementation of hypernetworks that automatically generates a hypernetwork architecture for any given generative model.",
      "keywords": [
        "Hypernetworks",
        "Generative Models",
        "Fairness",
        "MADness",
        "Maximum Likelihood Estimation",
        "Bias"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "xak8c9l1nu",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "CrOHzVtWmH",
      "title": "Relative-Translation Invariant Wasserstein Distance",
      "abstract": "In many real-world applications, data distributions are often subject to translation shifts caused by various factors such as changes in environmental conditions, sensor settings, or shifts in data collection practices. These distribution shifts pose a significant challenge for measuring the similarity between probability distributions, particularly in tasks like domain adaptation or transfer learning. To address this issue, we introduce a new family of distances, relative-translation invariant Wasserstein distances ($RW_p$), to measure the similarity of two probability distributions under distribution shift. Generalizing it from the classical optimal transport model, we show that $RW_p$ distances are also real distance metrics defined on the quotient set $\\mathcal{P}_p(\\mathbb{R}^n)/\\sim$ and invariant to distribution translations, which forms a family of new metric spaces. When $p=2$, the $RW_2$ distance enjoys more exciting properties, including decomposability of the optimal transport model and translation-invariance of the $RW_2$ distance. Based on these properties, we show that a distribution shift, measured by $W_2$ distance, can be explained in the bias-variance perspective. In addition, we propose two algorithms: one algorithm is a two-stage optimization algorithm for computing the general case of $RW_p$ distance, and the other is a variant of the Sinkhorn algorithm, named $RW_2$ Sinkhorn algorithm, for efficiently calculating $RW_2$ distance, coupling solutions, as well as $W_2$ distance. We also provide the analysis of numerical stability and time complexity for the proposed algorithms. Finally, we validate the $RW_p$ distance metric and the algorithm performance with two experiments. We conduct one numerical validation for the $RW_2$ Sinkhorn algorithm and demonstrate the effectiveness of using $RW_p$ under distribution shift for similar thunderstorm detection. The experimental results report that our proposed algorithm significantly improves the computational efficiency of Sinkhorn in practical applications, and the $RW_p$ distance is robust to distribution translations.",
      "keywords": [
        "Optimal transport theory",
        "Wasserstein distance",
        "Distribution shift"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ijbA5swmoK",
      "title": "Second-Order Min-Max Optimization with Lazy Hessians",
      "abstract": "This paper studies second-order methods for convex-concave minimax optimization.  \nMonteiro & Svaiter (2012)  proposed a method to solve the problem with an optimal iteration complexity of \n$\\mathcal{O}(\\epsilon^{-3/2})$ to find an $\\epsilon$-saddle point.  However, it is unclear whether the\ncomputational complexity, $\\mathcal{O}((N+ d^2) d \\epsilon^{-2/3})$, can be improved. In the above, we follow  Doikov et al. (2023) and assume the complexity of obtaining a first-order oracle as $N$ and the complexity of obtaining a second-order oracle as $dN$. \nIn this paper, we show that the computation cost can be reduced by reusing Hessian across iterations. Our methods take the overall computational complexity of $\\tilde{\\mathcal{O}}( (N+d^2)(d+ d^{2/3}\\epsilon^{-2/3}))$, which improves those of previous methods by a factor of $d^{1/3}$. \nFurthermore, we generalize our method to strongly-convex-strongly-concave minimax problems and establish the complexity of $\\tilde{\\mathcal{O}}((N+d^2) (d + d^{2/3} \\kappa^{2/3}) )$ when the condition number of the problem is $\\kappa$, enjoying a similar speedup upon the state-of-the-art method. \nNumerical experiments on both real and synthetic datasets also verify the efficiency of our method.",
      "keywords": [
        "min-max optimization; second-order methods; computational complexity"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "MFZjrTFE7h",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "9GJ6JKoCVp",
      "title": "NaN Pooling and Convolution Accelerate U-Nets",
      "abstract": "Recent advancements in deep learning for neuroimaging have resulted in the development of increasingly complex models designed for a wide range of tasks. Despite significant improvements in hardware, enhancing inference and training times for these models remains crucial. Through a numerical analysis of convolutional neural networks (CNNs) inference, we found that a substantial amount of operations in these models are applied to pure numerical noise, with little to no impact on the final output. As a result, some CNNs consume up to two-thirds of their floating-point operations unnecessarily.\n\nTo address this inefficiency, we introduce NaN Pooling & Convolution---novel variations of PyTorch's max pooling and 2D convolution operations. These techniques identify numerically unstable voxels and replace them with NaNs, allowing  models to bypass operations on irrelevant data. We evaluate NaN Pooling and Convolution on two models: the FastSurfer CNN, a widely used neuroimaging tool, and a CNN designed to classify the MNIST dataset. For FastSurfer, our approach significantly improves computational efficiency, skipping between 33.24% and 69.30\\% of convolutions in certain layers while preserving the model's original accuracy. On MNIST, our approach skips up to 28.38% of convolutions, again without major impact on the accuracy.",
      "keywords": [
        "Pooling",
        "Convolutions",
        "Deep learning",
        "Optimization",
        "Neuroimaging",
        "Convolutional Neural Networks",
        "Numerical Analysis"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "P4o9akekdf",
      "title": "No Pose, No Problem: Surprisingly Simple 3D Gaussian Splats from Sparse Unposed Images",
      "abstract": "We introduce NoPoSplat, a feed-forward model capable of reconstructing 3D scenes parameterized by 3D Gaussians from unposed sparse multi-view images. Our model, trained exclusively with photometric loss, achieves real-time 3D Gaussian reconstruction during inference. To eliminate the need for accurate pose input during reconstruction, we anchor one input view's local camera coordinates as the canonical space and train the network to predict Gaussian primitives for all views within this space. This approach obviates the need to transform Gaussian primitives from local coordinates into a global coordinate system, thus avoiding errors associated with per-frame Gaussians and pose estimation. To resolve scale ambiguity, we design and compare various intrinsic embedding methods, ultimately opting to convert camera intrinsics into a token embedding and concatenate it with image tokens as input to the model, enabling accurate scene scale prediction. We utilize the reconstructed 3D Gaussians for novel view synthesis and pose estimation tasks and propose a two-stage coarse-to-fine pipeline for accurate pose estimation. Experimental results demonstrate that our pose-free approach can achieve superior novel view synthesis quality compared to pose-required methods, particularly in scenarios with limited input image overlap. For pose estimation, our method, trained without ground truth depth or explicit matching loss, significantly outperforms the state-of-the-art methods with substantial improvements. This work makes significant advances in pose-free generalizable 3D reconstruction and demonstrates its applicability to real-world scenarios. Code and trained models are available at https://noposplat.github.io/.",
      "keywords": [
        "3D Gaussian Splatting",
        "Pose Free",
        "Pose Estimation",
        "Novel View Synthesis",
        "3D Reconstruction"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "LCk3umTAXx",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LCk3umTAXx",
      "title": "Gamified crowd-sourcing of high-quality data for visual fine-tuning",
      "abstract": "This paper introduces gamified adversarial prompting (GAP), a framework that\ncrowd-sources high-quality data for visual instruction tuning of large multimodal\nmodels. GAP transforms the data collection process into an engaging game, in-\ncentivizing players to provide fine-grained, challenging questions and answers\nthat target gaps in the model’s knowledge. Our contributions include (1) an ap-\nproach to capture question-answer pairs from humans that directly address weak-\nnesses in a model’s knowledge, (2) a method for evaluating and rewarding players\nthat successfully incentivizes them to provide high-quality submissions, and (3) a\nscalable, gamified platform that succeeds in collecting this data from over 50,000\nparticipants in just a few weeks. Our implementation of GAP has significantly im-\nproved the accuracy of a small multimodal model, namely MiniCPM-Llama3-V-\n2.5-8B, increasing its GPT score from 0.147 to 0.477 on our dataset, approaching\nthe benchmark set by the much larger GPT-4V. Moreover, we demonstrate that\nthe data generated using MiniCPM-Llama3-V-2.5-8B also enhances its perfor-\nmance across other benchmarks, and exhibits cross-model benefits. Specifically,\nthe same data improves the performance of QWEN2-VL-2B and QWEN2-VL-7B\non the same multiple benchmarks.",
      "keywords": [
        "Large Multimodal Models",
        "Visual Question Answering",
        "Visual Instruction Tuning",
        "Gamification",
        "Supervised Learning",
        "Data Generation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Pujt3ADZgI",
      "title": "Iterative Nash Policy Optimization: Aligning LLMs with General Preferences via No-Regret Learning",
      "abstract": "Reinforcement Learning with Human Feedback (RLHF) has achieved great success\nin aligning large language models (LLMs) with human preferences. Prevalent\nRLHF approaches are reward-based, following the Bradley-Terry (BT) model assumption, which may not fully capture the complexity of human preferences. In\nthis paper, we explore RLHF under a general preference framework and approach\nit from a game-theoretic perspective. Specifically, we formulate the problem as\na two-player game and propose a novel online algorithm, iterative Nash policy\noptimization (INPO). The key idea is to let the policy play against itself via no-\nregret learning, thereby approximating the Nash policy. Unlike previous methods,\nINPO bypasses the need for estimating the expected win rate for individual responses, which typically incurs high computational or annotation costs. Instead,\nwe introduce a new loss objective that is directly minimized over a preference\ndataset. We provide theoretical analysis for our approach and demonstrate its\neffectiveness through experiments on various representative benchmarks. With an\nLLaMA-3-8B-based SFT model, INPO achieves a 42.6% length-controlled win\nrate on AlpacaEval 2.0 and a 37.8% win rate on Arena-Hard, showing substantial\nimprovement over the state-of-the-art online RLHF algorithms.",
      "keywords": [
        "RLHF Theory",
        "LLM Alignment"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "k3y0oyK7sn",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "XeRvg7GQH4",
      "title": "One Training Fits All: Generalized Data Condensation via Mixture-of-Information Bottleneck Guidance",
      "abstract": "Data condensation (DC) technologies are widely used in buffer-constrained scenarios to reduce the memory demand of training samples and maintain  DNN training performance. However, due to the storage constraint of deployment devices and the high energy costs of condensation procedure, synthetic datasets generated by DC often have inferior performance in terms of training efficiency and scalability, which greatly limits its practical application on various edge devices. \nThis dilemma arises due to two reasons: i) existing state-of-the-art (SoTA) data condensation approaches that update synthetic datasets by intuitively matching intermediate training outputs (e.g.,  gradients, features and distributions) between real datasets and synthetic datasets without improving their representational information capabilities from the perspective of the useful information contained. ii) DC lacks sufficient consideration for the heterogeneity of storage constraints among various edge devices, which will result in large training overheads (i.e., consumption or storage). \nTo tackle the above issue, We propose a novel method named Mixture-of-Information Bottleneck Dataset Condensation (MIBDC), which employs information bottlenecks from synthetic datasets with various Image Per Class (IPC) numbers to improve the overall DC generalization and scalability. \nSpecifically, in this paper, the following two phenomena are found: i) The quality of synthetic datasets improves with increased synthetic dataset quantity. ii) The smaller the number of synthetic datasets, the earlier they can reach the convergence peak.\nBased on the above two findings, this paper proposes that i) large synthetic datasets can guide the better convergence of smaller ones. ii)  information contained in  synthetic datasets with different IPC numbers can play a collaborative role in the guidance of dataset condensation generalization.\nComprehensive experimental results on three well-known datasets show that, compared with state-of-the-art dataset condensation methods, MIBDC can not only enhance the generalization performance of trained models but also achieve superior scalability.",
      "keywords": [
        "Dataset Condensation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Y6aHdDNQYD",
      "title": "MOS: Model Synergy for Test-Time Adaptation on LiDAR-Based 3D Object Detection",
      "abstract": "LiDAR-based 3D object detection is crucial for various applications but often experiences performance degradation in real-world deployments due to domain shifts. While most studies focus on cross-dataset shifts, such as changes in environments and object geometries, practical corruptions from sensor variations and weather conditions remain underexplored. In this work, we propose a novel online test-time adaptation framework for 3D detectors that effectively tackles these shifts, including a challenging $\\textit{cross-corruption}$ scenario where cross-dataset shifts and corruptions co-occur. By leveraging long-term knowledge from previous test batches, our approach mitigates catastrophic forgetting and adapts effectively to diverse shifts. Specifically, we propose a Model Synergy (MOS) strategy that dynamically selects historical checkpoints with diverse knowledge and assembles them to best accommodate the current test batch. This assembly is directed by our proposed Synergy Weights (SW), which perform a weighted averaging of the selected checkpoints, minimizing redundancy in the composite model. The SWs are computed by evaluating the similarity of predicted bounding boxes on the test data and the independence of features between checkpoint pairs in the model bank. To maintain an efficient and informative model bank, we discard checkpoints with the lowest average SW scores, replacing them with newly updated models. Our method was rigorously tested against existing test-time adaptation strategies across three datasets and eight types of corruptions, demonstrating superior adaptability to dynamic scenes and conditions. Notably, it achieved a 67.3% improvement in a challenging cross-corruption scenario, offering a more comprehensive benchmark for adaptation. Source code: https://github.com/zhuoxiao-chen/MOS.",
      "keywords": [
        "Test-Time Adaptation",
        "3D Object Detection"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "B6bE2GC71a",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "B6bE2GC71a",
      "title": "EvoLM: In Search of Lost Language Model Training Dynamics",
      "abstract": "Modern language model (LM) training has been divided into multiple stages, making it difficult for downstream developers to evaluate the impact of design choices made at each stage.\nWe present EvoLM, a model suite that enables systematic and transparent analysis of LMs' training dynamics across pre-training, continued pre-training, supervised fine-tuning, and reinforcement learning. \nBy training over 100 LMs with 1B and 4B parameters from scratch, we rigorously evaluate both upstream (language modeling) and downstream (problem-solving) reasoning capabilities, including considerations of both in-domain and out-of-domain generalization. \nKey insights highlight the diminishing returns from excessive pre-training and post-training, the importance and practices of mitigating forgetting during domain-specific continued pre-training, the crucial role of continued pre-training in bridging pre-training and post-training phases, and various intricate trade-offs when configuring supervised fine-tuning and reinforcement learning. \nTo facilitate open research and reproducibility, we release all pre-trained and post-trained models, training datasets for all stages, and our entire training and evaluation pipeline.",
      "keywords": [
        "Language Models",
        "Training Dynamics",
        "Pretraining",
        "Post-training"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "FZURCro04D",
      "title": "Reasoning on a Spectrum: Aligning LLMs to System 1 and System 2 Thinking",
      "abstract": "Large language models (LLMs) demonstrate remarkable reasoning capabilities, yet their reliance on step-by-step reasoning can make them brittle when tasks do not align with such structured approaches. In contrast, human cognition flexibly alternates between fast, intuitive reasoning (System 1) and slow, analytical reasoning (System 2), depending on context. To bridge this gap, we curate a dataset of 2K examples, each with valid responses from both reasoning styles, and explicitly align LLMs with System 1 and System 2 reasoning. Evaluations across diverse reasoning benchmarks reveal an accuracy-efficiency trade-off: System 2-aligned models excel in arithmetic and symbolic reasoning, while System 1-aligned models perform better in commonsense tasks. A mechanistic analysis of model responses shows that System 1 models employ more definitive answers, whereas System 2 models demonstrate greater uncertainty. Interpolating between these extremes produces a monotonic transition in reasoning accuracy, preserving coherence. This work challenges the assumption that step-by-step reasoning is always optimal and highlights the need for adapting reasoning strategies based on task demands.",
      "keywords": [
        "Alignment",
        "System 1 and System 2 thinking",
        "Cognitive heuristics",
        "LLM",
        "NLP"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "XO9fhSZkBh",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "XO9fhSZkBh",
      "title": "Depth-Bounds for Neural Networks via the Braid Arrangement",
      "abstract": "We contribute towards resolving the open question of how many hidden layers are required in ReLU networks for exactly representing all continuous and piecewise linear functions on $\\mathbb{R}^d$. \nWhile the question has been resolved in special cases, the best known lower bound in general is still 2. \nWe focus on neural networks that are compatible with certain polyhedral complexes, more precisely with the braid fan.  \nFor such neural networks, we prove a non-constant lower bound of $\\Omega(\\log\\log d)$ hidden layers required to exactly represent the maximum of $d$ numbers. Additionally, we provide a combinatorial proof that neural networks satisfying this assumption require three hidden layers to compute the maximum of 5 numbers; this had only been verified with an excessive computation so far.\nFinally, we show that a natural generalization of the best known upper bound to maxout networks is not tight, by demonstrating that a rank-3 maxout layer followed by a rank-2 maxout layer is sufficient to represent the maximum of 7 numbers.",
      "keywords": [
        "Neural Networks",
        "Piecewise Linear Functions",
        "Exact Representations",
        "Polyhedral Geometry",
        "Braid Fan",
        "Boolean Lattice"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "wh3p37VYm2",
      "title": "Mechanistic Insights into Grokking from the Embedding Layer",
      "abstract": "Grokking, a delayed generalization in neural networks after perfect training performance, has been observed in Transformers and MLPs, but the components driving it remain underexplored. We show that embeddings are central to grokking: introducing them into MLPs induces delayed generalization in modular arithmetic tasks, whereas MLPs without embeddings can generalize immediately. Our analysis identifies two key mechanisms: (1) Embedding update dynamics, where rare tokens stagnate due to sparse gradient updates and weight decay, and (2) Bilinear coupling, where the interaction between embeddings and downstream weights introduces saddle points and increases sensitivity to initialization.  \nTo confirm these mechanisms, we investigate frequency-aware sampling, which balances token updates by minimizing gradient variance, and embedding-specific learning rates, derived from the asymmetric curvature of the bilinear loss landscape. We prove that an adaptive learning rate ratio, \\(\\frac{\\eta_E}{\\eta_W} \\propto \\frac{\\sigma_{\\max}(E)}{\\sigma_{\\max}(W)} \\cdot \\frac{f_W}{f_E}\\), mitigates bilinear coupling effects, accelerating convergence. Our methods not only improve grokking dynamics but also extend to broader challenges in Transformer optimization, where bilinear interactions hinder efficient training.",
      "keywords": [
        "Embedding learning",
        "Token frequencey",
        "Coupled system"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "R73ybUciQF",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DfHcKzmHpp",
      "title": "Can We Partially Rewrite Transformers in Natural Language?",
      "abstract": "The greatest ambition of mechanistic interpretability is to completely rewrite deep neural networks in a format that is more amenable to human understanding, while preserving their behavior and performance. In this paper we evaluate whether sparse autoencoders (SAEs) and transcoders can be used for this purpose. We use an automated pipeline to generate explanations for each of the sparse coder latents. We then simulate the activation of each latent on a number of different inputs using an LLM prompted with the explanation we generated in the previous step, and \"partially rewrite'' the original model by patching the simulated activations into its forward pass. We find that current sparse coding techniques and automated interpretability pipelines are not up to the task of rewriting even a single layer of a transformer: the model is severely degraded by patching in the simulated activations. We believe this approach is the most thorough way to assess the quality of SAEs and transcoders, despite its high computational cost.",
      "keywords": [
        "Sparse autoencoders",
        "interpretability",
        "language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "R73ybUciQF",
      "title": "A is for Absorption: Studying Feature Splitting and Absorption in Sparse Autoencoders",
      "abstract": "Sparse Autoencoders (SAEs) aim to decompose the activation space of large language models (LLMs) into human-interpretable latent directions or features. As we increase the number of features in the SAE, hierarchical features tend to split into finer features (“math” may split into “algebra”, “geometry”, etc.), a phenomenon referred to as feature splitting. However, we show that sparse decomposition and splitting of hierarchical features is not robust. Specifically, we show that seemingly monosemantic features fail to fire where they should, and instead get “absorbed” into their children features. We coin this phenomenon feature absorption, and show that it is caused by optimizing for sparsity in SAEs whenever the underlying features form a hierarchy. We introduce a metric to detect absorption in SAEs, and validate our findings empirically on hundreds of LLM SAEs. Our investigation suggests that varying SAE sizes or sparsity is insufficient to solve this issue. We discuss the implications of feature absorption in SAEs and some potential approaches to solve the fundamental theoretical issues before SAEs can be used for interpreting LLMs robustly and at scale.",
      "keywords": [
        "sparse autoencoders",
        "SAEs",
        "interpretability",
        "NLP"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "8P3QNSckMp",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8P3QNSckMp",
      "title": "A Clean Slate for Offline Reinforcement Learning",
      "abstract": "Progress in offline reinforcement learning (RL) has been impeded by ambiguous problem definitions and entangled algorithmic designs, resulting in inconsistent implementations, insufficient ablations, and unfair evaluations. Although offline RL explicitly avoids environment interaction, prior methods frequently employ extensive, undocumented online evaluation for hyperparameter tuning, complicating method comparisons. Moreover, existing reference implementations differ significantly in boilerplate code, obscuring their core algorithmic contributions. We address these challenges by first introducing a rigorous taxonomy and a transparent evaluation protocol that explicitly quantifies online tuning budgets. To resolve opaque algorithmic design, we provide clean, minimalistic, single-file implementations of various model-free and model-based offline RL methods, significantly enhancing clarity and achieving substantial speed-ups. Leveraging these streamlined implementations, we propose Unifloral, a unified algorithm that encapsulates diverse prior approaches and enables development within a single, comprehensive hyperparameter space. Using Unifloral with our rigorous evaluation protocol, we develop two novel algorithms - TD3-AWR (model-free) and MoBRAC (model-based) - which substantially outperform established baselines. Our implementation is publicly available at https://github.com/EmptyJackson/unifloral.",
      "keywords": [
        "Offline Reinforcement Learning",
        "Evaluation",
        "Open-Source"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ziLHIExi1j",
      "title": "Quantifying First‐Order Markov Breakdowns in Noisy Reinforcement Learning: A Causal Discovery Approach",
      "abstract": "Reinforcement learning (RL) methods often assume that each new observation fully captures the environment’s state, ensuring Markovian (one‐step) transitions. Real‐world deployments, however, frequently violate this assumption due to partial observability or noise in sensors and actuators. This paper introduces a systematic methodology for diagnosing such violations, combining a partial correlation based causal discovery procedure (PCMCI) with a newly proposed Markov Violation score (MVS). The MVS quantifies multi‐step dependencies that emerge when noise or incomplete state information disrupts the Markov property.\n\nClassic control tasks (CartPole, Pendulum, Acrobot) are used to assess how targeted noise and dimension omissions affect both RL performance and the measured Markov consistency. Contrary to expectations, heavy observation noise often fails to induce strong multi‐lag dependencies in certain tasks (e.g., Acrobot). Dimension‐dropping experiments further reveal that omitting certain state variables (e.g., angular velocities in CartPole and Pendulum) substantially degrades returns and elevates MVS, while other dimensions can be removed with negligible effect.\n\nThese findings highlight the importance of identifying and safeguarding the most causally critical dimensions to maintain effective one‐step learning. By bridging partial correlation tests and RL performance metrics, the proposed approach uniquely pinpoints when and where the Markov property breaks. This framework offers a principled tool for designing robust policies, guiding representation learning, and handling partial observability in real‐world RL tasks. All code and experimental logs are publicly available for reproducibility (URL omitted for double‐blind review).",
      "keywords": [
        "Markov Property",
        "PCMCI (Causal Discovery)",
        "PPO",
        "Noisy Reinforcement Learning."
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "XPe55Uffd7",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "XPe55Uffd7",
      "title": "Agnostic Active Learning Is Always Better Than Passive Learning",
      "abstract": "We sharply characterize the optimal first-order query complexity of agnostic active learning for all concept classes, and propose a new general active learning algorithm which achieves it. Remarkably, the optimal query complexity admits a leading term which is always strictly smaller than the sample complexity of passive supervised learning (by a factor proportional to the best-in-class error rate). This was not previously known to be possible in the agnostic setting. For comparison, in all previous general analyses, the leading term exhibits an additional factor, such as the disagreement coefficient or related complexity measure, and therefore only provides improvements over passive learning in restricted cases. The present work completely removes such factors from the leading term, implying that $\\textit{every}$ concept class benefits from active learning in the non-realizable case. The results established in this work resolve an important long-standing open question central to the past two decades of research on the theory of agnostic active learning.",
      "keywords": [
        "Active learning",
        "Agnostic learning",
        "PAC learning",
        "Query complexity",
        "Minimax analysis",
        "VC dimension",
        "Star number",
        "Disagreement coefficient"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zmlhP8myaT",
      "title": "Stepwise Feature Learning in Self-Supervised Learning",
      "abstract": "Recent advances in self-supervised learning (SSL) have shown remarkable progress in representation learning. However, SSL models often exhibit shortcut learning phenomenon, where they exploit dataset-specific biases rather than learning generalizable features, sometimes leading to severe over-optimization on particular datasets. We present a theoretical framework that analyzes this shortcut learning phenomenon through the lens of $\\textit{extent bias}$ and $\\textit{amplitude bias}$. By investigating the relations among extent bias, amplitude bias, and learning priorities in SSL, we demonstrate that learning dynamics is fundamentally governed by the dimensional properties and amplitude of features rather than their semantic importance. Our analysis reveals how the eigenvalues of the feature cross-correlation matrix influence which features are learned earlier, providing insights into why models preferentially learn shortcut features over more generalizable features.",
      "keywords": [
        "shortcut learning",
        "self-supervised learning",
        "stepwise learning",
        "feature learning",
        "learning dynamics"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "EoebmBe9fG",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "EoebmBe9fG",
      "title": "Optimal Mistake Bounds for Transductive Online Learning",
      "abstract": "We resolve a 30-year-old open problem concerning the power of unlabeled data in online learning by tightly quantifying the gap between transductive and standard online learning. We prove that for every concept class $\\mathcal{H}$ with Littlestone dimension $d$, the transductive mistake bound is at least $\\Omega(\\sqrt{d})$. This establishes an exponential improvement over previous lower bounds of $\\Omega(\\log \\log d)$, $\\Omega(\\sqrt{\\log d})$, and $\\Omega(\\log d)$, respectively due to Ben-David, Kushilevitz, and Mansour (1995, 1997) and Hanneke, Moran, and Shafer (2023). We also show that our bound is tight: for every $d$, there exists a class of Littlestone dimension $d$ with transductive mistake bound $O(\\sqrt{d})$. Our upper bound also improves the previous best known upper bound of $(2/3) \\cdot d$ from Ben-David et al. (1997). These results demonstrate a quadratic gap between transductive and standard online learning, thereby highlighting the benefit of advanced access to the unlabeled instance sequence. This stands in stark contrast to the PAC setting, where transductive and standard learning exhibit similar sample complexities.",
      "keywords": [
        "Online Learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vWaMUMrBpF",
      "title": "Inconsistency-Aware Minimization: Improving Generalization with Unlabeled Data",
      "abstract": "Accurately estimating the generalization gap and devising optimization methods that generalize better are crucial for deep learning models, particularly in both theoretical understanding and practical applications. The ability to leverage unlabeled data for these purposes offers significant advantages in real-world scenarios. This paper introduces a novel generalization measure, termed $\\textit{local inconsistency}$, developed from an information-geometric perspective of the neural network's parameter space; a key feature is its computability from unlabeled data. We establish its theoretical underpinnings by connecting local inconsistency to the Fisher Information Matrix (FIM) and the loss Hessian. Empirically, we demonstrate that local inconsistency not only correlates with the generalization gap but also exhibits characteristics comparable to $\\textit{sharpness}$. Based on these findings, we propose Inconsistency-Aware Minimization (IAM), a regularization strategy that incorporates local inconsistency. We demonstrate that in standard supervised learning settings, IAM enhances generalization, achieving performance comparable to existing methods such as Sharpness-Aware Minimization (SAM). Furthermore, IAM exhibits notable efficacy in semi-supervised learning scenarios, where the local inconsistency regularizer is computed from the unlabeled data portion to further improve model performance.",
      "keywords": [
        "Generalization",
        "Regularization",
        "Training Method",
        "Deep Learning",
        "Inconsistency"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "kVz9uvqUna",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2mDquK2qMI",
      "title": "One Step Diffusion via Flow Fitting",
      "abstract": "Diffusion and flow-matching models have demonstrated impressive performance in generating diverse, high-fidelity images by learning transformations from noise to data. However, their reliance on multi-step sampling requires repeated neural network evaluations, leading to high computational cost. We propose FlowFit, a family of generative models that enables high-quality sample generation through both single-phase training and single-step inference. FlowFit learns to approximate the continuous flow trajectory between latent noise \\(x_0\\) and data \\(x_1\\) by fitting a basis of functions parameterized over time \\(t \\in [0, 1]\\) during training. At inference time, sampling is performed by simply evaluating the flow only at the terminal time \\(t = 1\\), avoiding iterative denoising or numerical integration. Empirically, FlowFit outperforms prior diffusion-based single-phase training methods achieving superior sample quality.",
      "keywords": [
        "Efficient generative models",
        "Single step diffusion"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "kVz9uvqUna",
      "title": "On the Closed-Form of Flow Matching: Generalization Does Not Arise from Target Stochasticity",
      "abstract": "Modern deep generative models can now produce high-quality synthetic samples that are often indistinguishable from real training data. A growing body of research aims to understand why recent methods, such as diffusion and flow matching techniques, generalize so effectively. Among the proposed explanations are the inductive biases of deep learning architectures and the stochastic nature of the conditional flow matching loss. In this work, we rule out the noisy nature of the loss as a key factor driving generalization in flow matching.\nFirst, we empirically show that in high-dimensional settings, the stochastic and closed-form versions of the flow matching loss yield nearly equivalent losses. Then, using state-of-the-art flow matching models on standard image datasets, we demonstrate that both variants achieve comparable statistical performance, with the surprising observation that using the closed-form can even improve performance.",
      "keywords": [
        "flow matching",
        "generalization",
        "memorization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "Q3qAsZAEZw",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FZURCro04D",
      "title": "Reasoning on a Spectrum: Aligning LLMs to System 1 and System 2 Thinking",
      "abstract": "Large language models (LLMs) demonstrate remarkable reasoning capabilities, yet their reliance on step-by-step reasoning can make them brittle when tasks do not align with such structured approaches. In contrast, human cognition flexibly alternates between fast, intuitive reasoning (System 1) and slow, analytical reasoning (System 2), depending on context. To bridge this gap, we curate a dataset of 2K examples, each with valid responses from both reasoning styles, and explicitly align LLMs with System 1 and System 2 reasoning. Evaluations across diverse reasoning benchmarks reveal an accuracy-efficiency trade-off: System 2-aligned models excel in arithmetic and symbolic reasoning, while System 1-aligned models perform better in commonsense tasks. A mechanistic analysis of model responses shows that System 1 models employ more definitive answers, whereas System 2 models demonstrate greater uncertainty. Interpolating between these extremes produces a monotonic transition in reasoning accuracy, preserving coherence. This work challenges the assumption that step-by-step reasoning is always optimal and highlights the need for adapting reasoning strategies based on task demands.",
      "keywords": [
        "Alignment",
        "System 1 and System 2 thinking",
        "Cognitive heuristics",
        "LLM",
        "NLP"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Q3qAsZAEZw",
      "title": "Understanding and Mitigating Numerical Sources of Nondeterminism in LLM Inference",
      "abstract": "Large Language Models (LLMs) are now integral across various domains and have demonstrated impressive performance. Progress, however, rests on the premise that benchmark scores are both accurate and reproducible. We demonstrate that the reproducibility of LLM performance is fragile: changing system configuration, such as evaluation batch size, GPU count, and GPU version, can introduce significant differences in the generated responses. \nThis issue is especially pronounced in reasoning models, where minor rounding differences in early tokens can cascade into divergent chains of thought, ultimately affecting accuracy. For instance, under bfloat16 precision with greedy decoding, a reasoning model like DeepSeek-R1-Distill-Qwen-7B can exhibit up to 9\\% variation in accuracy and 9,000 tokens difference in response length due to differences in GPU count, type, and evaluation batch size.\nWe trace the root cause of this variability to the non-associative nature of floating-point arithmetic under limited numerical precision. \nThis work presents the first systematic investigation into how numerical precision affects reproducibility in LLM inference. Through carefully controlled experiments across various hardware, software, and precision settings, we quantify when and how model outputs diverge.\nOur analysis reveals that floating-point precision—while critical for reproducibility—is often neglected in evaluation practices.\nInspired by this, we develop a lightweight inference pipeline, dubbed LayerCast, that stores weights in 16-bit precision but performs all computations in FP32, balancing memory efficiency with numerical stability. Code is available at https://github.com/nanomaoli/llm_reproducibility.",
      "keywords": [
        "Large Language Models (LLMs)",
        "Reproducibility",
        "Numerical precision",
        "Deterministic inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "ImpizBSKcu",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "ImpizBSKcu",
      "title": "Dynamical Decoupling of Generalization and Overfitting in Large Two-Layer Networks",
      "abstract": "Understanding the inductive bias and generalization properties of large overparametrized machine learning models requires to characterize the dynamics of the training algorithm.  We study the learning dynamics of large two-layer neural networks via dynamical mean field theory, a well established technique of non-equilibrium statistical physics. We show that, for large network width $m$,\nand large number of samples per input dimension $n/d$, the training dynamics exhibits a separation of timescales which implies:\n$(i)$ The emergence of a slow time scale associated with the growth in Gaussian/Rademacher complexity of the network;\n$(ii)$ Inductive bias towards small complexity if the initialization has small enough complexity;\n$(iii)$ A dynamical decoupling between feature learning and overfitting regimes; $(iv)$ A non-monotone behavior of the test error, associated  `feature unlearning' regime at large times.",
      "keywords": [
        "Overfitting; feature learning; dynamical mean field theory; generalization;"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "wh3p37VYm2",
      "title": "Mechanistic Insights into Grokking from the Embedding Layer",
      "abstract": "Grokking, a delayed generalization in neural networks after perfect training performance, has been observed in Transformers and MLPs, but the components driving it remain underexplored. We show that embeddings are central to grokking: introducing them into MLPs induces delayed generalization in modular arithmetic tasks, whereas MLPs without embeddings can generalize immediately. Our analysis identifies two key mechanisms: (1) Embedding update dynamics, where rare tokens stagnate due to sparse gradient updates and weight decay, and (2) Bilinear coupling, where the interaction between embeddings and downstream weights introduces saddle points and increases sensitivity to initialization.  \nTo confirm these mechanisms, we investigate frequency-aware sampling, which balances token updates by minimizing gradient variance, and embedding-specific learning rates, derived from the asymmetric curvature of the bilinear loss landscape. We prove that an adaptive learning rate ratio, \\(\\frac{\\eta_E}{\\eta_W} \\propto \\frac{\\sigma_{\\max}(E)}{\\sigma_{\\max}(W)} \\cdot \\frac{f_W}{f_E}\\), mitigates bilinear coupling effects, accelerating convergence. Our methods not only improve grokking dynamics but also extend to broader challenges in Transformer optimization, where bilinear interactions hinder efficient training.",
      "keywords": [
        "Embedding learning",
        "Token frequencey",
        "Coupled system"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "Tk5nQnTGmP",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "KurYdcCbjv",
      "title": "Generalized Linear Mode Connectivity for Transformers",
      "abstract": "Understanding the geometry of neural network loss landscapes is a central question in deep learning, with implications for generalization and optimization. A striking phenomenon is $\\textit{linear mode connectivity}$ (LMC), where independently trained models can be connected by low- or zero-barrier paths, despite appearing to lie in separate loss basins. However, this is often obscured by symmetries in parameter space—such as neuron permutations—which make functionally equivalent models appear dissimilar. Prior work has predominantly focused on neuron reordering through permutations, but such approaches are limited in scope and fail to capture the richer symmetries exhibited by modern architectures such as Transformers. In this work, we introduce a unified framework that captures four symmetry classes—permutations, semi-permutations, orthogonal transformations, and general invertible maps—broadening the set of valid reparameterizations and subsuming many previous approaches as special cases. Crucially, this generalization enables, for the first time, the discovery of low- and zero-barrier linear interpolation paths between independently trained Vision Transformers and GPT-2 models. Furthermore, our framework extends beyond pairwise alignment, to multi-model and width-heterogeneous settings, enabling alignment across architectures of different sizes. These results reveal deeper structure in the loss landscape and underscore the importance of symmetry-aware analysis for understanding model space geometry.",
      "keywords": [
        "Neural Network Merging",
        "Linear Mode Connectivity",
        "Model Re-basin",
        "Parameter Space Geometry",
        "Transformer",
        "Permutation Invariance",
        "Model Fusion"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vWaMUMrBpF",
      "title": "Inconsistency-Aware Minimization: Improving Generalization with Unlabeled Data",
      "abstract": "Accurately estimating the generalization gap and devising optimization methods that generalize better are crucial for deep learning models, particularly in both theoretical understanding and practical applications. The ability to leverage unlabeled data for these purposes offers significant advantages in real-world scenarios. This paper introduces a novel generalization measure, termed $\\textit{local inconsistency}$, developed from an information-geometric perspective of the neural network's parameter space; a key feature is its computability from unlabeled data. We establish its theoretical underpinnings by connecting local inconsistency to the Fisher Information Matrix (FIM) and the loss Hessian. Empirically, we demonstrate that local inconsistency not only correlates with the generalization gap but also exhibits characteristics comparable to $\\textit{sharpness}$. Based on these findings, we propose Inconsistency-Aware Minimization (IAM), a regularization strategy that incorporates local inconsistency. We demonstrate that in standard supervised learning settings, IAM enhances generalization, achieving performance comparable to existing methods such as Sharpness-Aware Minimization (SAM). Furthermore, IAM exhibits notable efficacy in semi-supervised learning scenarios, where the local inconsistency regularizer is computed from the unlabeled data portion to further improve model performance.",
      "keywords": [
        "Generalization",
        "Regularization",
        "Training Method",
        "Deep Learning",
        "Inconsistency"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "xVI8g50Qfk",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "KkOMqJQiWU",
      "title": "Meta-learning local learning rules for structured credit assignment with sparse feedback",
      "abstract": "Biological neural networks can learn complex behaviors from sparse, delayed feedback using local synaptic plasticity, yet the mechanisms enabling structured credit assignment remain elusive. In contrast, artificial recurrent networks solving similar tasks typically rely on biologically implausible global learning rules or hand-crafted local updates. The space of local plasticity rules capable of supporting learning from delayed reinforcement remains largely unexplored. Here, we present a meta-learning framework that discovers local learning rules for structured credit assignment in recurrent networks trained with sparse feedback. Our approach interleaves local neo-Hebbian-like updates during task execution with an outer loop that optimizes plasticity parameters via **backpropagation through learning**. The resulting three-factor learning rules enable long-timescale credit assignment using only local information and delayed rewards, offering new insights into biologically grounded mechanisms for learning in recurrent circuits.",
      "keywords": [
        "Biologically Plausible Deep Networks",
        "Plasticity and Adaptation",
        "Recurrent Networks",
        "Reinforcement Learning (Cognitive/Neuroscience)"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "cGks3s79hW",
      "title": "High-dimensional neuronal activity from low-dimensional latent dynamics: a solvable model",
      "abstract": "Computation in recurrent networks of neurons has been hypothesized to occur at the level of low-dimensional latent dynamics, both in artificial systems and in the brain. This hypothesis seems at odds with evidence from large-scale neuronal recordings in mice showing that neuronal population activity is high-dimensional. To demonstrate that low-dimensional latent dynamics and high-dimensional activity can be two sides of the same coin, we present an analytically solvable recurrent neural network (RNN) model whose dynamics can be exactly reduced to a low-dimensional dynamical system, but generates an activity manifold that has a high linear embedding dimension. This raises the question: Do low-dimensional latents explain the high-dimensional activity observed in mouse visual cortex? Spectral theory tells us that the covariance eigenspectrum alone does not allow us to recover the dimensionality of the latents, which can be low or high, when neurons are nonlinear. To address this indeterminacy, we develop Neural Cross-Encoder (NCE), an interpretable, nonlinear latent variable modeling method for neuronal recordings, and find that high-dimensional neuronal responses to drifting gratings and spontaneous activity in visual cortex can be reduced to low-dimensional latents, while the responses to natural images cannot. We conclude that the high-dimensional activity measured in certain conditions, such as in the absence of a stimulus, is explained by low-dimensional latents that are nonlinearly processed by individual neurons.",
      "keywords": [
        "recurrent neural networks",
        "neuronal recordings",
        "visual cortex",
        "latent variable models",
        "PCA",
        "eigenvalue decay",
        "mean-field limit"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "hSX7Dd8dxy",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "4OsgYD7em5",
      "title": "Does Reinforcement Learning Really Incentivize Reasoning Capacity in LLMs Beyond the Base Model?",
      "abstract": "Reinforcement Learning with Verifiable Rewards (RLVR) has recently demonstrated notable success in enhancing the reasoning performance of large language models (LLMs), particularly in mathematics and programming tasks. \nIt is widely believed that, similar to how traditional RL helps agents to explore and learn new strategies, RLVR enables LLMs to continuously self-improve, thus acquiring novel reasoning abilities that exceed the capacity of the corresponding base models. \nIn this study, we take a critical look at \\textit{the current state of RLVR} by systematically probing the reasoning capability boundaries of RLVR-trained LLMs across diverse model families, RL algorithms, and math/coding/visual reasoning benchmarks, using pass@\\textit{k} at large \\textit{k} values as the evaluation metric.\nWhile RLVR improves sampling efficiency towards the correct path, we surprisingly find that current training does \\emph{not} elicit fundamentally new reasoning patterns.\nWe observe that while RLVR-trained models outperform their base models at smaller values of $k$ (\\eg, $k$=1), base models achieve higher pass@$k$ score when $k$ is large.\nMoreover, we observe that the reasoning capability boundary of LLMs often narrows as RLVR training progresses.\nFurther coverage and perplexity analysis shows that the reasoning paths generated by RLVR models are already included in the base models' sampling distribution, suggesting that their reasoning abilities originate from and are \\textit{bounded} by the base model. \nFrom this perspective, treating the base model as an upper bound, our quantitative analysis shows that six popular RLVR algorithms perform similarly and remain far from optimal in fully leveraging the potential of the base model.\nIn contrast, we find that distillation can introduce new reasoning patterns from the teacher and genuinely expand the model’s reasoning capabilities.\nTaken together, our findings suggest that current RLVR methods have not fully realized the potential of RL to elicit genuinely novel reasoning abilities in LLMs. This underscores the need for improved RL paradigms—such as continual scaling and multi-turn agent-environment interaction—to unlock this potential.",
      "keywords": [
        "reinforcement learning with verifiable reward",
        "LLM reasoning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "cEd00CXWE5",
      "title": "Beyond Two-Stage Training: Integrating SFT and RL for Improved Reasoning in LLMs",
      "abstract": "Reinforcement learning (RL) has proven effective in incentiving the reasoning abilities of large language models (LLMs), but faces significant efficiency challenges due to its extensive trial-and-error nature. A common practice is to employ supervised fine-tuning (SFT) as a warm-up stage; however, this decoupled two-stage approach limits interaction between SFT and RL, thereby constraining overall effectiveness. This study introduces a novel method for learning reasoning models that employs bilevel optimization to facilitate better cooperation between these training paradigms. Specifically, the SFT objective is explicitly conditioned on the optimal solution of the RL objective. During training, lower-level updates enable the model to receive SFT supervision concurrently with RL-based exploration, while upper-level updates are optimized to ensure that the joint training yields higher rewards than RL alone. Empirical evaluations on five reasoning benchmarks demonstrate that our method consistently outperforms baselines and achieves a better balance between effectiveness and efficiency.",
      "keywords": [
        "LLM",
        "Reasoning",
        "RL",
        "SFT"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "RPRqKhjrr6",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "B6bE2GC71a",
      "title": "EvoLM: In Search of Lost Language Model Training Dynamics",
      "abstract": "Modern language model (LM) training has been divided into multiple stages, making it difficult for downstream developers to evaluate the impact of design choices made at each stage.\nWe present EvoLM, a model suite that enables systematic and transparent analysis of LMs' training dynamics across pre-training, continued pre-training, supervised fine-tuning, and reinforcement learning. \nBy training over 100 LMs with 1B and 4B parameters from scratch, we rigorously evaluate both upstream (language modeling) and downstream (problem-solving) reasoning capabilities, including considerations of both in-domain and out-of-domain generalization. \nKey insights highlight the diminishing returns from excessive pre-training and post-training, the importance and practices of mitigating forgetting during domain-specific continued pre-training, the crucial role of continued pre-training in bridging pre-training and post-training phases, and various intricate trade-offs when configuring supervised fine-tuning and reinforcement learning. \nTo facilitate open research and reproducibility, we release all pre-trained and post-trained models, training datasets for all stages, and our entire training and evaluation pipeline.",
      "keywords": [
        "Language Models",
        "Training Dynamics",
        "Pretraining",
        "Post-training"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "obXGSmmG70",
      "title": "AdaCoT: Pareto-Optimal Adaptive Chain-of-Thought Triggering via Reinforcement Learning",
      "abstract": "Large Language Models (LLMs) have demonstrated remarkable capabilities but often face challenges with tasks requiring sophisticated reasoning. While Chain-of-Thought (CoT) prompting significantly enhances reasoning, it indiscriminately generates lengthy reasoning steps for all queries, leading to substantial computational costs and inefficiency, especially for simpler inputs. To address this critical issue, we introduce AdaCoT (Adaptive Chain-of-Thought), a novel framework enabling LLMs to adaptively decide when to invoke CoT. AdaCoT framed adaptive reasoning as a Pareto optimization problem that seeks to balance model performance with the costs associated with CoT invocation (both frequency and computational overhead). We propose a reinforcement learning (RL) based method, specifically utilizing Proximal Policy Optimization (PPO), to dynamically control the CoT triggering decision boundary by adjusting penalty coefficients, thereby allowing the model to determine CoT necessity based on implicit query complexity. A key technical contribution is Selective Loss Masking (SLM), designed to counteract decision boundary collapse during multi-stage RL training, ensuring robust and stable adaptive triggering. Experimental results demonstrate that AdaCoT successfully navigates the Pareto frontier, achieving substantial reductions in CoT usage for queries not requiring elaborate reasoning. For instance, on our production traffic testset, AdaCoT reduced CoT triggering rates to as low as 3.18% and decreased average response tokens by 69.06% on APP, while maintaining high performance on complex tasks. This substantial token decrease directly translates to a significant reduction in inference computational load. AdaCoT pioneers adaptive CoT triggering, offering a practical and principled solution for developing more efficient, responsive, and cost-effective LLMs, particularly crucial for interactive and resource-sensitive applications.",
      "keywords": [
        "Adaptive Reasoning",
        "Chain-of-Thought",
        "Large Language Models"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "S8XcHutp7Z",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DwFDfrPsm8",
      "title": "NOVA: A Benchmark for Rare Anomaly Localization and Clinical Reasoning in Brain MRI",
      "abstract": "In many real-world applications, deployed models encounter inputs that differ from the data seen during training. Open-world recognition ensures that such systems remain robust as ever-emerging, previously _unknown_ categories appear and must be addressed without retraining.\nFoundation and vision-language models are pre-trained on large and diverse datasets with the expectation of broad generalization across domains, including medical imaging.\nHowever, benchmarking these models on test sets with only a few common outlier types silently collapses the evaluation back to a closed-set problem, masking failures on rare or truly novel conditions encountered in clinical use.\n\nWe therefore present NOVA, a challenging, real-life _evaluation-only_ benchmark of $\\sim$900 brain MRI scans that span 281 rare pathologies and heterogeneous acquisition protocols. Each case includes rich clinical narratives and double-blinded expert bounding-box annotations. Together, these enable joint assessment of anomaly localisation, visual captioning, and diagnostic reasoning. \nBecause NOVA is never used for training, it serves as an _extreme_ stress-test of out-of-distribution generalisation: models must bridge a distribution gap both in sample appearance and in semantic space.  \nBaseline results with leading vision-language models (GPT-4o, Gemini 2.0 Flash, and Qwen2.5-VL-72B) reveal substantial performance drops, with approximately a 65\\% gap in localisation compared to natural-image benchmarks and 40\\% and 20\\% gaps in captioning and reasoning, respectively, compared to resident radiologists. Therefore, NOVA establishes a testbed for advancing models that can detect, localize, and reason about truly unknown anomalies.",
      "keywords": [
        "Vision-Language Models",
        "Zero-shot Learning",
        "Anomaly Detection",
        "Dataset Benchmarking",
        "Medical Imaging",
        "Brain MRI",
        "Multi-modal Data",
        "Rare Diseases"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "nMUpDatZBh",
      "title": "VICON: Vision In-Context Operator Networks for Multi-Physics Fluid Dynamics",
      "abstract": "In-Context Operator Networks (ICONs) have demonstrated the ability to learn operators across diverse partial differential equations using few-shot, in-context learning. However, existing ICONs process each spatial point as an individual token, severely limiting computational efficiency when handling dense data in higher spatial dimensions. We propose \\textit{Vision In-Context Operator Networks} (VICON), which integrates vision transformer architectures to efficiently process 2D data through patch-wise operations while preserving ICON's adaptability to multiphysics systems and varying timesteps. Evaluated across three fluid dynamics benchmarks, VICON significantly outperforms state-of-the-art baselines: DPOT and MPP, reducing the averaged last-step rollout error by 37.9\\% compared to DPOT and 44.7\\% compared to MPP, while requiring only 72.5\\% and 34.8\\% of their respective inference times. VICON naturally supports flexible rollout strategies with varying timestep strides, enabling immediate deployment in \\textit{imperfect measurement systems} where sampling frequencies may differ or frames might be dropped—common challenges in real-world settings—without requiring retraining or interpolation. In these realistic scenarios, VICON exhibits remarkable robustness, experiencing only 24.41\\% relative performance degradation compared to 71.37\\%-74.49\\% degradation in baseline methods, demonstrating its versatility for depolying in realistic applications.",
      "keywords": [
        "AI4Science",
        "Learning PDE",
        "Fluid Dynamics",
        "In-Context Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "ziLHIExi1j",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "sYK4yPDuT1",
      "title": "A Snapshot of Influence: A Local Data Attribution Framework for Online Reinforcement Learning",
      "abstract": "Online reinforcement learning (RL) excels in complex, safety-critical domains but suffers from sample inefficiency, training instability, and limited interpretability. Data attribution provides a principled way to trace model behavior back to training samples, yet existing methods assume fixed datasets, which is violated in online RL where each experience both updates the policy and shapes future data collection.\nIn this paper, we initiate the study of data attribution for online RL, focusing on the widely used Proximal Policy Optimization (PPO) algorithm. We start by establishing a *local* attribution framework, interpreting model checkpoints with respect to the records in the recent training buffer. We design two target functions, capturing agent action and cumulative return respectively, and measure each record's contribution through gradient similarity between its training loss and these targets. We demonstrate the power of this framework through three concrete applications: diagnosis of learning, temporal analysis of behavior formation, and targeted intervention during training. Leveraging this framework, we further propose an algorithm, iterative influence-based filtering (IIF), for online RL training that iteratively performs experience filtering to refine policy updates. Across standard RL benchmarks (classic control, navigation, locomotion) to RLHF for large language models, IIF reduces sample complexity, speeds up training, and achieves higher returns. Together, these results open a new direction for making online RL more interpretable, efficient, and effective.",
      "keywords": [
        "data attribution",
        "online reinforcement learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ziLHIExi1j",
      "title": "Quantifying First‐Order Markov Breakdowns in Noisy Reinforcement Learning: A Causal Discovery Approach",
      "abstract": "Reinforcement learning (RL) methods often assume that each new observation fully captures the environment’s state, ensuring Markovian (one‐step) transitions. Real‐world deployments, however, frequently violate this assumption due to partial observability or noise in sensors and actuators. This paper introduces a systematic methodology for diagnosing such violations, combining a partial correlation based causal discovery procedure (PCMCI) with a newly proposed Markov Violation score (MVS). The MVS quantifies multi‐step dependencies that emerge when noise or incomplete state information disrupts the Markov property.\n\nClassic control tasks (CartPole, Pendulum, Acrobot) are used to assess how targeted noise and dimension omissions affect both RL performance and the measured Markov consistency. Contrary to expectations, heavy observation noise often fails to induce strong multi‐lag dependencies in certain tasks (e.g., Acrobot). Dimension‐dropping experiments further reveal that omitting certain state variables (e.g., angular velocities in CartPole and Pendulum) substantially degrades returns and elevates MVS, while other dimensions can be removed with negligible effect.\n\nThese findings highlight the importance of identifying and safeguarding the most causally critical dimensions to maintain effective one‐step learning. By bridging partial correlation tests and RL performance metrics, the proposed approach uniquely pinpoints when and where the Markov property breaks. This framework offers a principled tool for designing robust policies, guiding representation learning, and handling partial observability in real‐world RL tasks. All code and experimental logs are publicly available for reproducibility (URL omitted for double‐blind review).",
      "keywords": [
        "Markov Property",
        "PCMCI (Causal Discovery)",
        "PPO",
        "Noisy Reinforcement Learning."
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "FZURCro04D",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FZURCro04D",
      "title": "Reasoning on a Spectrum: Aligning LLMs to System 1 and System 2 Thinking",
      "abstract": "Large language models (LLMs) demonstrate remarkable reasoning capabilities, yet their reliance on step-by-step reasoning can make them brittle when tasks do not align with such structured approaches. In contrast, human cognition flexibly alternates between fast, intuitive reasoning (System 1) and slow, analytical reasoning (System 2), depending on context. To bridge this gap, we curate a dataset of 2K examples, each with valid responses from both reasoning styles, and explicitly align LLMs with System 1 and System 2 reasoning. Evaluations across diverse reasoning benchmarks reveal an accuracy-efficiency trade-off: System 2-aligned models excel in arithmetic and symbolic reasoning, while System 1-aligned models perform better in commonsense tasks. A mechanistic analysis of model responses shows that System 1 models employ more definitive answers, whereas System 2 models demonstrate greater uncertainty. Interpolating between these extremes produces a monotonic transition in reasoning accuracy, preserving coherence. This work challenges the assumption that step-by-step reasoning is always optimal and highlights the need for adapting reasoning strategies based on task demands.",
      "keywords": [
        "Alignment",
        "System 1 and System 2 thinking",
        "Cognitive heuristics",
        "LLM",
        "NLP"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Q3qAsZAEZw",
      "title": "Understanding and Mitigating Numerical Sources of Nondeterminism in LLM Inference",
      "abstract": "Large Language Models (LLMs) are now integral across various domains and have demonstrated impressive performance. Progress, however, rests on the premise that benchmark scores are both accurate and reproducible. We demonstrate that the reproducibility of LLM performance is fragile: changing system configuration, such as evaluation batch size, GPU count, and GPU version, can introduce significant differences in the generated responses. \nThis issue is especially pronounced in reasoning models, where minor rounding differences in early tokens can cascade into divergent chains of thought, ultimately affecting accuracy. For instance, under bfloat16 precision with greedy decoding, a reasoning model like DeepSeek-R1-Distill-Qwen-7B can exhibit up to 9\\% variation in accuracy and 9,000 tokens difference in response length due to differences in GPU count, type, and evaluation batch size.\nWe trace the root cause of this variability to the non-associative nature of floating-point arithmetic under limited numerical precision. \nThis work presents the first systematic investigation into how numerical precision affects reproducibility in LLM inference. Through carefully controlled experiments across various hardware, software, and precision settings, we quantify when and how model outputs diverge.\nOur analysis reveals that floating-point precision—while critical for reproducibility—is often neglected in evaluation practices.\nInspired by this, we develop a lightweight inference pipeline, dubbed LayerCast, that stores weights in 16-bit precision but performs all computations in FP32, balancing memory efficiency with numerical stability. Code is available at https://github.com/nanomaoli/llm_reproducibility.",
      "keywords": [
        "Large Language Models (LLMs)",
        "Reproducibility",
        "Numerical precision",
        "Deterministic inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "n33JVwCz38",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "gxfusMqPIs",
      "title": "Improved Regret Bounds for Gaussian Process Upper Confidence Bound in Bayesian Optimization",
      "abstract": "This paper addresses the Bayesian optimization problem (also referred to as the Bayesian setting of the Gaussian process bandit), where the learner seeks to minimize the regret under a function drawn from a known Gaussian process (GP). \nUnder a Mat\\'ern kernel with some extent of smoothness, we show that the Gaussian process upper confidence bound (GP-UCB) algorithm achieves $\\tilde{O}(\\sqrt{T})$ cumulative regret with high probability. Furthermore, our analysis yields $O(\\sqrt{T \\ln^2 T})$ regret under a squared exponential kernel. These results fill the gap between the existing regret upper bound of GP-UCB and the current best upper bound provided by Scarlett [2018]. The key idea in our proof is to capture the concentration behavior of the input sequence realized by GP-UCB, enabling us to handle GP's information gain in a refined manner.",
      "keywords": [
        "Gaussian process bandits",
        "regret analysis",
        "Bayesian optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "n33JVwCz38",
      "title": "Approximate Message Passing for Bayesian Neural Networks",
      "abstract": "Bayesian methods have the ability to consider model uncertainty within a single framework and provide a powerful tool for decision-making. Bayesian neural networks (BNNs) hold great potential for better uncertainty quantification and data efficiency, making them promising candidates for more trustworthy AI in critical applications, and as backbones in data-constrained settings such as real-world reinforcement learning.  However, current approaches often face limitations such as overconfidence, sensitivity to hyperparameters, and posterior collapse, highlighting the need for alternative approaches. In this paper, we introduce a novel method that leverages message passing (MP) to model the predictive posterior of BNNs as a factor graph. Unlike previous MP-based methods, our framework is the first to support convolutional neural networks (CNNs) while addressing the issue of double-counting training data, which has been a key source of overconfidence in prior work. Multiple open datasets are used to demonstrate the general applicability of the method and to illustrate its differences to existing inference methods.",
      "keywords": [
        "Bayesian Neural Networks",
        "Message Passing",
        "Uncertainty Quantification",
        "Bayesian Inference"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "KkOMqJQiWU",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "KkOMqJQiWU",
      "title": "Meta-learning local learning rules for structured credit assignment with sparse feedback",
      "abstract": "Biological neural networks can learn complex behaviors from sparse, delayed feedback using local synaptic plasticity, yet the mechanisms enabling structured credit assignment remain elusive. In contrast, artificial recurrent networks solving similar tasks typically rely on biologically implausible global learning rules or hand-crafted local updates. The space of local plasticity rules capable of supporting learning from delayed reinforcement remains largely unexplored. Here, we present a meta-learning framework that discovers local learning rules for structured credit assignment in recurrent networks trained with sparse feedback. Our approach interleaves local neo-Hebbian-like updates during task execution with an outer loop that optimizes plasticity parameters via **backpropagation through learning**. The resulting three-factor learning rules enable long-timescale credit assignment using only local information and delayed rewards, offering new insights into biologically grounded mechanisms for learning in recurrent circuits.",
      "keywords": [
        "Biologically Plausible Deep Networks",
        "Plasticity and Adaptation",
        "Recurrent Networks",
        "Reinforcement Learning (Cognitive/Neuroscience)"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "RF3miSqdXa",
      "title": "On Linear Mode Connectivity of Mixture-of-Experts Architectures",
      "abstract": "Linear Mode Connectivity (LMC) is a notable phenomenon in the loss landscapes\nof neural networks, wherein independently trained models have been observed to\nbe connected—up to permutation symmetries—by linear paths in parameter space\nalong which the loss remains consistently low. This observation challenges classical\nviews of non-convex optimization and has implications for model ensembling,\ngeneralization, and our understanding of neural loss geometry. Inspired by recent\nstudies on LMC in standard neural networks, we systematically investigate this\nphenomenon within Mixture-of-Experts (MoE) architectures—a class of models\nknown for their scalability and computational efficiency, which combine traditional\nneural networks—referred to as experts—through a learnable gating mechanism.\nWe begin by conducting a comprehensive analysis of both dense and sparse gating\nregimes, demonstrating that the symmetries inherent to MoE architectures are\nfully characterized by permutations acting on both the expert components and the\ngating function. Building on these foundational findings, we propose a matching\nalgorithm that enables alignment between independently trained MoEs, thereby\nfacilitating the discovery of LMC. Finally, we empirically validate the presence of\nLMC using our proposed algorithm across diverse MoE configurations—including\ndense, sparse, and shared-expert variants—under a wide range of model settings\nand datasets of varying scales and modalities. Our results confirm the existence\nof LMC in MoE architectures and offer fundamental insights into the functional\nlandscape and optimization dynamics of deep learning models.",
      "keywords": [
        "linear mode connectivity",
        "mixture-of-experts"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "krF62hkrfR",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "krF62hkrfR",
      "title": "Neural Bayesian Filtering",
      "abstract": "We present Neural Bayesian Filtering (NBF), an algorithm for maintaining posteriors, called beliefs, over hidden states in partially observable systems.\nNBF is trained to find a good latent representation of the beliefs induced by a task.\nIt maps beliefs to fixed-length embedding vectors, which can condition generative models for sampling.\nDuring filtering, particle-style updates compute posteriors in this embedding space using incoming observations and environment dynamics.\nNBF combines the computational efficiency of classical filters with the expressiveness of deep generative models - tracking rapidly shifting, multimodal beliefs while mitigating the risk of *particle impoverishment*.\nWe validate NBF in state estimation tasks in partially observable variants of Gridworld and the card game Goofspiel.",
      "keywords": [
        "Partially observable systems",
        "belief state modeling",
        "particle filtering",
        "bayesian filtering",
        "normalizing flows"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "uWj4s7rMnR",
      "title": "Mean Flows for One-step Generative Modeling",
      "abstract": "We propose a principled and effective framework for one-step generative modeling. We introduce the notion of average velocity to characterize flow fields, in contrast to instantaneous velocity modeled by Flow Matching methods. A well-defined identity between average and instantaneous velocities is derived and used to guide neural network training. Our method, termed the \\textit{MeanFlow} model, is self-contained and requires no pre-training, distillation, or curriculum learning. MeanFlow demonstrates strong empirical performance: it achieves an FID of 3.43 with a single function evaluation (1-NFE) on ImageNet 256$\\times$256 trained from scratch, significantly outperforming previous state-of-the-art one-step diffusion/flow models. Our study substantially narrows the gap between one-step diffusion/flow models and their multi-step predecessors, and we hope it will motivate future research to revisit the foundations of these powerful models.",
      "keywords": [
        "Generative Models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "G10Y4vrhGF",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "CaSQgef484",
      "title": "Exploring Diffusion Transformer Designs via Grafting",
      "abstract": "Designing model architectures requires decisions such as selecting operators (e.g., attention, convolution) and configurations (e.g., depth, width). However, evaluating the impact of these decisions on model quality requires costly pretraining, limiting architectural investigation.\nInspired by how new software is built on existing code, we ask: can new architecture designs be studied using pretrained models? To this end, we present *grafting*, a simple approach for editing pretrained diffusion transformers (DiTs) to materialize new architectures under small compute budgets. Informed by our analysis of activation behavior and attention locality, we construct a testbed based on the DiT-XL/2 design to study the impact of grafting on model quality. Using this testbed, we develop a family of hybrid designs via grafting: replacing softmax attention with gated convolution, local attention, and linear attention, and replacing MLPs with variable expansion ratio and convolutional variants. Notably, many hybrid designs achieve good quality (FID: 2.38–2.64 vs. 2.27 for DiT-XL/2)\nusing $<2$% pretraining compute. We then graft a text-to-image model (PixArt-$\\Sigma$), achieving a 1.43$\\times$ speedup with less than a 2% drop in GenEval score. Finally, we present a case study that restructures DiT-XL/2 by converting every pair of sequential transformer blocks into parallel blocks via grafting. This reduces model depth by 2$\\times$ and yields better quality (FID: 2.77) than other models of comparable depth. Together, we show that new diffusion model designs can be explored by grafting pretrained DiTs, with edits ranging from operator replacement to architecture restructuring. Code and grafted models: https://grafting.stanford.edu.",
      "keywords": [
        "Diffusion Transformers",
        "Model Grafting",
        "Architectural Editing",
        "Hybrid Models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "InyYuWLWHD",
      "title": "LayerGuard: Poisoning-Resilient Federated Learning via Layer-Wise Similarity Analysis",
      "abstract": "In recent years, model poisoning attacks have gradually evolved from conventional global parameter manipulations to more stealthy and strategic Targeted Layer Poisoning (TLP) attacks.These attacks achieve high attack success rates by selectively poisoning only a subset of layers. However, most existing defenses rely on evaluation of the entire network and are thus ineffective against TLP attacks, posing new challenges to the security of Federated Learning (FL).In this paper, we propose \\textbf{LayerGuard}, a comprehensive defense framework featuring dynamic detection and adaptive aggregation to protect FL against advanced model poisoning attacks. Diverging from traditional methods that analyze the entire network collectively, \\textbf{LayerGuard} performs layer-wise similarity analysis to detect anomalous clients and adaptively identifies layers under attack based on the clustering behavior of malicious updates, facilitating more precise threat detection. Building on this, we introduce a joint weighting mechanism in the aggregation process, which evaluates each client's credibility at the layer level from two complementary informational dimensions: inter-layer and intra-layer, balancing attack mitigation and benign contribution retention. Extensive experiments across various datasets and model architectures demonstrate that \\textbf{LayerGuard} successfully reduces the average attack success rate of TLP attacks to around 5\\%. Moreover, when confronted with other advanced model poisoning attacks, \\textbf{LayerGuard} consistently maintains global model accuracy—even under high poisoning rates and severe non-IID conditions—comparable to that of FedAvg under no-attack settings, marking a significant improvement over existing defenses.",
      "keywords": [
        "Federated Learning; Security; Model Poisoning Attacks; Robust Aggregation"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "Jzr9VOiJYd",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "K61Y6cTMRl",
      "title": "Toward Foundation Model for Multivariate Wearable Sensing of Physiological Signals",
      "abstract": "Time-series foundation models excel at tasks like forecasting across diverse data types by leveraging informative waveform representations. Wearable sensing data, however, pose unique challenges due to their variability in patterns and frequency bands, especially for healthcare-related outcomes. The main obstacle lies in crafting generalizable representations that adapt efficiently across heterogeneous sensing configurations and applications. To address this, we propose NormWear, the first multi-modal and ubiquitous foundation model designed to extract generalized and informative representations from wearable sensing data. Specifically, we design a channel-aware attention mechanism with a shared special liaison [CLS] token to detect signal patterns in both intra-sensor and inter-sensors. This helps the model to extract more meaningful information considering both time series themselves and the relationships between input sensors. This helps the model to be widely compatible with various sensors settings. NormWear is pretrained on a diverse set of physiological signals, including PPG, ECG, EEG, GSR, and IMU, from various public datasets. Our model shows exceptional generalizability across 11 public wearable sensing datasets, spanning 18 applications in mental health, body state inference, vital sign estimation, and disease risk evaluation. It consistently outperforms competitive baselines under zero-shot, partial-shot, and full-shot settings, indicating broad applicability in real-world health applications.",
      "keywords": [
        "Foundation Model",
        "Signal Processing",
        "Time Series",
        "Wearable Sensing",
        "Digital Health"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "jMhRbV47pS",
      "title": "The emergence of sparse attention: impact of data distribution and benefits of repetition",
      "abstract": "Emergence is a fascinating property of large language models and neural networks more broadly: as models scale and train for longer, they sometimes develop new abilities in sudden ways. Despite initial studies, we still lack a comprehensive understanding of how and when these abilities emerge. To address this gap, we study the emergence over training of sparse attention, a critical and frequently observed attention pattern in Transformers. By combining theoretical analysis of a toy model with empirical observations on small Transformers trained on a linear regression variant, we uncover the mechanics driving sparse attention emergence and reveal that emergence timing follows power laws based on task structure, architecture, and optimizer choice. We additionally find that repetition can greatly speed up emergence. Finally, we confirm these results on a well-studied in-context associative recall task. Our findings provide a simple, theoretically grounded framework for understanding how data distributions and model design influence the learning dynamics behind one form of emergence.",
      "keywords": [
        "emergence",
        "sparse attention",
        "in-context learning",
        "induction head"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "mPuOMcN9E7",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "sYK4yPDuT1",
      "title": "A Snapshot of Influence: A Local Data Attribution Framework for Online Reinforcement Learning",
      "abstract": "Online reinforcement learning (RL) excels in complex, safety-critical domains but suffers from sample inefficiency, training instability, and limited interpretability. Data attribution provides a principled way to trace model behavior back to training samples, yet existing methods assume fixed datasets, which is violated in online RL where each experience both updates the policy and shapes future data collection.\nIn this paper, we initiate the study of data attribution for online RL, focusing on the widely used Proximal Policy Optimization (PPO) algorithm. We start by establishing a *local* attribution framework, interpreting model checkpoints with respect to the records in the recent training buffer. We design two target functions, capturing agent action and cumulative return respectively, and measure each record's contribution through gradient similarity between its training loss and these targets. We demonstrate the power of this framework through three concrete applications: diagnosis of learning, temporal analysis of behavior formation, and targeted intervention during training. Leveraging this framework, we further propose an algorithm, iterative influence-based filtering (IIF), for online RL training that iteratively performs experience filtering to refine policy updates. Across standard RL benchmarks (classic control, navigation, locomotion) to RLHF for large language models, IIF reduces sample complexity, speeds up training, and achieves higher returns. Together, these results open a new direction for making online RL more interpretable, efficient, and effective.",
      "keywords": [
        "data attribution",
        "online reinforcement learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vMfJM9oBYL",
      "title": "Learning from Preferences and Mixed Demonstrations in General Settings",
      "abstract": "Reinforcement learning is a general method for learning in sequential settings, but it can often be difficult to specify a good reward function when the task is complex.\nIn these cases, preference feedback or expert demonstrations can be used instead.\nHowever, existing approaches utilising both together are either ad-hoc or rely on domain-specific properties.\nBuilding upon previous work, we develop a mathematical framework for learning from human data and based on this we introduce LEOPARD: Learning Estimated Objectives from Preferences And Ranked Demonstrations.\nLEOPARD can simultaneously learn from a broad range of data, including negative/failed demonstrations, to effectively learn reward functions in general domains.\nIt does this by modelling the human feedback as reward-rational partial orderings over available trajectories.\nWe find that when a limited amount of preference and demonstration feedback is available, LEOPARD outperforms baselines by a significant margin.\nFurthermore, we use LEOPARD to investigate learning from many types of feedback compared to just a single one, and find that a combination of feedback types is often beneficial.",
      "keywords": [
        "reinforcement learning",
        "rl",
        "human feedback",
        "rlhf",
        "modelling",
        "preferences",
        "demonstrations",
        "rankings",
        "machine learning",
        "reward learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "yRxX01oRIi",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FZURCro04D",
      "title": "Reasoning on a Spectrum: Aligning LLMs to System 1 and System 2 Thinking",
      "abstract": "Large language models (LLMs) demonstrate remarkable reasoning capabilities, yet their reliance on step-by-step reasoning can make them brittle when tasks do not align with such structured approaches. In contrast, human cognition flexibly alternates between fast, intuitive reasoning (System 1) and slow, analytical reasoning (System 2), depending on context. To bridge this gap, we curate a dataset of 2K examples, each with valid responses from both reasoning styles, and explicitly align LLMs with System 1 and System 2 reasoning. Evaluations across diverse reasoning benchmarks reveal an accuracy-efficiency trade-off: System 2-aligned models excel in arithmetic and symbolic reasoning, while System 1-aligned models perform better in commonsense tasks. A mechanistic analysis of model responses shows that System 1 models employ more definitive answers, whereas System 2 models demonstrate greater uncertainty. Interpolating between these extremes produces a monotonic transition in reasoning accuracy, preserving coherence. This work challenges the assumption that step-by-step reasoning is always optimal and highlights the need for adapting reasoning strategies based on task demands.",
      "keywords": [
        "Alignment",
        "System 1 and System 2 thinking",
        "Cognitive heuristics",
        "LLM",
        "NLP"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Q3qAsZAEZw",
      "title": "Understanding and Mitigating Numerical Sources of Nondeterminism in LLM Inference",
      "abstract": "Large Language Models (LLMs) are now integral across various domains and have demonstrated impressive performance. Progress, however, rests on the premise that benchmark scores are both accurate and reproducible. We demonstrate that the reproducibility of LLM performance is fragile: changing system configuration, such as evaluation batch size, GPU count, and GPU version, can introduce significant differences in the generated responses. \nThis issue is especially pronounced in reasoning models, where minor rounding differences in early tokens can cascade into divergent chains of thought, ultimately affecting accuracy. For instance, under bfloat16 precision with greedy decoding, a reasoning model like DeepSeek-R1-Distill-Qwen-7B can exhibit up to 9\\% variation in accuracy and 9,000 tokens difference in response length due to differences in GPU count, type, and evaluation batch size.\nWe trace the root cause of this variability to the non-associative nature of floating-point arithmetic under limited numerical precision. \nThis work presents the first systematic investigation into how numerical precision affects reproducibility in LLM inference. Through carefully controlled experiments across various hardware, software, and precision settings, we quantify when and how model outputs diverge.\nOur analysis reveals that floating-point precision—while critical for reproducibility—is often neglected in evaluation practices.\nInspired by this, we develop a lightweight inference pipeline, dubbed LayerCast, that stores weights in 16-bit precision but performs all computations in FP32, balancing memory efficiency with numerical stability. Code is available at https://github.com/nanomaoli/llm_reproducibility.",
      "keywords": [
        "Large Language Models (LLMs)",
        "Reproducibility",
        "Numerical precision",
        "Deterministic inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "g2vViuEVDS",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "1IpHkK5Q8F",
      "title": "Real-Time Hyper-Personalized Generative AI Should Be Regulated to Prevent the Rise of \"Digital Heroin\"",
      "abstract": "This position paper argues that real-time generative AI has the potential to become the next wave of addictive digital media, creating a new class of digital content akin to ``digital heroin'' with severe implications for mental health and youth development. By shortening the content-generation feedback loop to mere seconds, these advanced models will soon be able to hyper-personalize outputs on the fly. When paired with misaligned incentives (e.g., maximizing user engagement), this will fuel unprecedented compulsive consumption patterns with far-reaching consequences for mental health, cognitive development, and social stability. Drawing on interdisciplinary research, from clinical observations of social media addiction to neuroscientific studies of dopamine-driven feedback, we illustrate how real-time tailored content generation may erode user autonomy, foment emotional distress, and disproportionately endanger vulnerable groups, such as adolescents. Due to the rapid advancement of generative AI and its potential to induce severe addiction-like effects, we call for strong government oversight akin to existing controls on addictive substances, particularly for minors. We further urge the machine learning community to act proactively by establishing robust design guidelines, collaborating with public health experts, and supporting targeted policy measures to ensure responsible and ethical deployment, rather than paving the way for another wave of unregulated digital dependence.",
      "keywords": [
        "generative ai",
        "real-time personalization",
        "behavioral addiction",
        "digital media",
        "public health",
        "policy interventions",
        "machine learning ethics"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eR8raBLZW7",
      "title": "BriLLM: Brain-inspired Large Language Model",
      "abstract": "This paper reports the brain-inspired large language model (BriLLM). This is a non-Transformer, non-GPT, non-traditional machine learning input-output controlled generative language model. The model is based on the Signal Fully-connected flowing (SiFu) definition on the directed graph in terms of the neural network, and has the interpretability of all nodes on the graph of the whole model, instead of the traditional machine learning model that only has limited interpretability at the input and output ends. In the language model scenario, the token is defined as a node in the graph. A randomly shaped or user-defined signal flow flows between nodes on the principle of \"least resistance\" along paths. The next token or node to be predicted or generated is the target of the signal flow. As a language model, BriLLM theoretically supports infinitely long $n$-gram models when the model size is independent of the input and predicted length of the model. The model's working signal flow provides the possibility of recall activation and innate multi-modal support similar to the cognitive patterns of the human brain. At present, we released the first BriLLM versions in Chinese and English, with 4000 tokens, 32-dimensional node size, 32-token sequence prediction ability, model sizes around 2B and 1B respectively, bringing language model prediction performance comparable to GPT-1.",
      "keywords": [
        "LLM"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "iQoZv77o3g",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "RF3miSqdXa",
      "title": "On Linear Mode Connectivity of Mixture-of-Experts Architectures",
      "abstract": "Linear Mode Connectivity (LMC) is a notable phenomenon in the loss landscapes\nof neural networks, wherein independently trained models have been observed to\nbe connected—up to permutation symmetries—by linear paths in parameter space\nalong which the loss remains consistently low. This observation challenges classical\nviews of non-convex optimization and has implications for model ensembling,\ngeneralization, and our understanding of neural loss geometry. Inspired by recent\nstudies on LMC in standard neural networks, we systematically investigate this\nphenomenon within Mixture-of-Experts (MoE) architectures—a class of models\nknown for their scalability and computational efficiency, which combine traditional\nneural networks—referred to as experts—through a learnable gating mechanism.\nWe begin by conducting a comprehensive analysis of both dense and sparse gating\nregimes, demonstrating that the symmetries inherent to MoE architectures are\nfully characterized by permutations acting on both the expert components and the\ngating function. Building on these foundational findings, we propose a matching\nalgorithm that enables alignment between independently trained MoEs, thereby\nfacilitating the discovery of LMC. Finally, we empirically validate the presence of\nLMC using our proposed algorithm across diverse MoE configurations—including\ndense, sparse, and shared-expert variants—under a wide range of model settings\nand datasets of varying scales and modalities. Our results confirm the existence\nof LMC in MoE architectures and offer fundamental insights into the functional\nlandscape and optimization dynamics of deep learning models.",
      "keywords": [
        "linear mode connectivity",
        "mixture-of-experts"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WOXyOiVd4B",
      "title": "FragFM: Hierarchical Framework for Efficient Molecule Generation via Fragment-Level Discrete Flow Matching",
      "abstract": "We introduce FragFM, a novel hierarchical framework via fragment-level discrete flow matching for efficient molecular graph generation. FragFM generates molecules at the fragment level, leveraging a coarse-to-fine autoencoder to reconstruct details at the atom level. Together with a stochastic fragment bag strategy to effectively handle an extensive fragment space, our framework enables more efficient and scalable molecular generation. We demonstrate that our fragment-based approach achieves better property control than the atom-based method and additional flexibility through conditioning the fragment bag. We also propose a Natural Product Generation benchmark (NPGen) to evaluate modern molecular graph generative models' ability to generate natural product-like molecules. Since natural products are biologically prevalidated and differ from typical drug-like molecules, our benchmark provides a more challenging yet meaningful evaluation relevant to drug discovery. We conduct a FragFM comparative study against various models on diverse molecular generation benchmarks, including NPGen, demonstrating superior performance. The results highlight the potential of fragment-based generative modeling for large-scale, property-aware molecular design, paving the way for more efficient exploration of chemical space.",
      "keywords": [
        "Molecular Graph Generation",
        "Discrete Flow Matching",
        "Fragment-Based Drug Discovery",
        "Natural Product"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "n4V3MSqK77",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "obXGSmmG70",
      "title": "AdaCoT: Pareto-Optimal Adaptive Chain-of-Thought Triggering via Reinforcement Learning",
      "abstract": "Large Language Models (LLMs) have demonstrated remarkable capabilities but often face challenges with tasks requiring sophisticated reasoning. While Chain-of-Thought (CoT) prompting significantly enhances reasoning, it indiscriminately generates lengthy reasoning steps for all queries, leading to substantial computational costs and inefficiency, especially for simpler inputs. To address this critical issue, we introduce AdaCoT (Adaptive Chain-of-Thought), a novel framework enabling LLMs to adaptively decide when to invoke CoT. AdaCoT framed adaptive reasoning as a Pareto optimization problem that seeks to balance model performance with the costs associated with CoT invocation (both frequency and computational overhead). We propose a reinforcement learning (RL) based method, specifically utilizing Proximal Policy Optimization (PPO), to dynamically control the CoT triggering decision boundary by adjusting penalty coefficients, thereby allowing the model to determine CoT necessity based on implicit query complexity. A key technical contribution is Selective Loss Masking (SLM), designed to counteract decision boundary collapse during multi-stage RL training, ensuring robust and stable adaptive triggering. Experimental results demonstrate that AdaCoT successfully navigates the Pareto frontier, achieving substantial reductions in CoT usage for queries not requiring elaborate reasoning. For instance, on our production traffic testset, AdaCoT reduced CoT triggering rates to as low as 3.18% and decreased average response tokens by 69.06% on APP, while maintaining high performance on complex tasks. This substantial token decrease directly translates to a significant reduction in inference computational load. AdaCoT pioneers adaptive CoT triggering, offering a practical and principled solution for developing more efficient, responsive, and cost-effective LLMs, particularly crucial for interactive and resource-sensitive applications.",
      "keywords": [
        "Adaptive Reasoning",
        "Chain-of-Thought",
        "Large Language Models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "q2VpjD7k1V",
      "title": "WebGen-Bench: Evaluating LLMs on Generating Interactive and Functional Websites from Scratch",
      "abstract": "LLM‑based agents have demonstrated great potential in generating and managing code within complex codebases. In this paper, we introduce WebGen-Bench, a novel benchmark designed to measure an LLM-based agent's ability to create multi-file website codebases from scratch. It contains diverse instructions for website generation, created through the combined efforts of human annotators and GPT-4o. These instructions span three major categories and thirteen minor categories, encompassing nearly all important types of web applications.\nTo assess the quality of the generated websites, we generate test cases targeting each functionality described in the instructions. These test cases are then manually filtered, refined, and organized to ensure accuracy, resulting in a total of 647 test cases. Each test case specifies an operation to be performed on the website and the expected outcome of the operation.\nTo automate testing and improve reproducibility, we employ a powerful web-navigation agent to execute test cases on the generated websites and determine whether the observed responses align with the expected results.\nWe evaluate three high-performance code-agent frameworks—Bolt.diy, OpenHands, and Aider—using multiple proprietary and open-source LLMs as engines. The best-performing combination, Bolt.diy powered by DeepSeek-R1, achieves only 27.8\\% accuracy on the test cases, highlighting the challenging nature of our benchmark.\nAdditionally, we construct WebGen-Instruct, a training set consisting of 6,667 website-generation instructions. Training Qwen2.5-Coder-32B-Instruct on Bolt.diy trajectories generated from a subset of the training set achieves an accuracy of 38.2\\%, surpassing the performance of the best proprietary model.\nWe release our data-generation, training, and testing code, along with both the datasets and model weights at https://github.com/mnluzimu/WebGen-Bench.",
      "keywords": [
        "Code Agent",
        "Website Generation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "Ejcn7IDkzT",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "gxfusMqPIs",
      "title": "Improved Regret Bounds for Gaussian Process Upper Confidence Bound in Bayesian Optimization",
      "abstract": "This paper addresses the Bayesian optimization problem (also referred to as the Bayesian setting of the Gaussian process bandit), where the learner seeks to minimize the regret under a function drawn from a known Gaussian process (GP). \nUnder a Mat\\'ern kernel with some extent of smoothness, we show that the Gaussian process upper confidence bound (GP-UCB) algorithm achieves $\\tilde{O}(\\sqrt{T})$ cumulative regret with high probability. Furthermore, our analysis yields $O(\\sqrt{T \\ln^2 T})$ regret under a squared exponential kernel. These results fill the gap between the existing regret upper bound of GP-UCB and the current best upper bound provided by Scarlett [2018]. The key idea in our proof is to capture the concentration behavior of the input sequence realized by GP-UCB, enabling us to handle GP's information gain in a refined manner.",
      "keywords": [
        "Gaussian process bandits",
        "regret analysis",
        "Bayesian optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "nLWhcCs9Dp",
      "title": "O-MMGP: Optimal Mesh Morphing Gaussian Process Regression for Solving PDEs with non-Parametric Geometric Variations",
      "abstract": "We address the computational challenges of solving parametric PDEs with non parametrized geometric variations and non-reducible problems, such as those involving shocks and discontinuities of variable positions. Traditional dimensionality reduction methods like POD struggle with these scenarios due to slowly decaying Kolmogorov widths. To overcome this, we propose a novel non-linear dimensionality reduction technique to reduce the required modes for representation. The non-linear reduction is obtained through a POD after applying a transformation on the fields, which we call optimal mappings, and is a solution to an optimization problem in infinite dimension. The proposed learning framework combines morphing techniques, non-linear dimensionality reduction, and Gaussian Process Regression (GPR). The problem is reformulated on a reference geometry before applying the dimensionality reduction. Our method learns both the optimal mapping, and the solution fields, using a series of GPR models, enabling efficient and accurate modeling of complex parametric PDEs with geometrical variability. The results obtained concur with current state-of-the-art models. We mainly compare our method with the winning solution of the ML4CFD NeurIPS 2024 competition.",
      "keywords": [
        "Gaussian process",
        "Mesh morphing",
        "Reduced order modeling"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "8C8F4NmHfz",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "JFygzwx8SJ",
      "title": "KVzip: Query-Agnostic KV Cache Compression with Context Reconstruction",
      "abstract": "Transformer-based large language models (LLMs) cache context as key-value (KV) pairs during inference. As context length grows, KV cache sizes expand, leading to substantial memory overhead and increased attention latency. This paper introduces \\textit{KVzip}, a query-agnostic KV cache eviction method enabling effective reuse of compressed KV caches across diverse queries. KVzip quantifies the importance of a KV pair using the underlying LLM to reconstruct original contexts from cached KV pairs, subsequently evicting pairs with lower importance. Extensive empirical evaluations demonstrate that KVzip reduces KV cache size by $3$-$4\\times$ and FlashAttention decoding latency by approximately $2\\times$, with negligible performance loss in question-answering, retrieval, reasoning, and code comprehension tasks. Evaluations include various models such as LLaMA3.1, Qwen2.5, and Gemma3, with context lengths reaching up to 170K tokens. KVzip significantly outperforms existing query-aware KV eviction methods, which suffer from performance degradation even at a 90\\% cache budget ratio under multi-query scenarios.",
      "keywords": [
        "Large Language Models",
        "Efficient Inference",
        "Long-Context Processing",
        "KV Cache Compression"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YtsX7irxbq",
      "title": "When recalling in-context, Transformers are not SSMs",
      "abstract": "Despite the advantageous subquadratic complexity of modern recurrent deep learning models -- such as state-space models (SSMs) -- recent studies have highlighted their potential shortcomings compared to transformers on reasoning and memorization tasks. In this paper, we dive deeper into one of such benchmarks: associative recall (AR), which has been shown to correlate well with language modeling performance, and inspect in detail the effects of scaling and optimization issues in recently proposed token mixing strategies. We first demonstrate that, unlike standard transformers, the choice of learning rate plays a critical role in the performance of modern recurrent models: an issue that can severely affect reported performance in previous works and suggests further research is needed to stabilize training. Next, we show that recurrent and attention-based models exhibit contrasting benefits when scaling in width as opposed to depth, with attention being notably unable to solve AR when limited to a single layer. We then further inspect 1-layer transformers, revealing that despite their poor performance, their training dynamics surprisingly resemble the formation of induction heads, a phenomenon previously observed only in their 2-layer counterparts. Finally, through architectural ablations, we study how components affects Transformer and Mamba’s performance and optimization stability.",
      "keywords": [
        "SSMs",
        "Attention",
        "In-Context Learning",
        "Language Modeling",
        "Mamba"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "k38Th3x4d9",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "k38Th3x4d9",
      "title": "Root Cause Analysis of Anomalies in Multivariate Time Series through Granger Causal Discovery",
      "abstract": "Identifying the root causes of anomalies in multivariate time series is challenging due to the complex dependencies among the series. In this paper, we propose a comprehensive approach called AERCA that inherently integrates Granger causal discovery with root cause analysis. By defining anomalies as interventions on the exogenous variables of time series, AERCA not only learns the Granger causality among time series but also explicitly models the distributions of exogenous variables under normal conditions. AERCA then identifies the root causes of anomalies by highlighting exogenous variables that significantly deviate from their normal states. Experiments on multiple synthetic and real-world datasets demonstrate that AERCA can accurately capture the causal relationships among time series and effectively identify the root causes of anomalies.",
      "keywords": [
        "root cause analysis",
        "Granger causality",
        "multivariate time series"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "v5BouOktUP",
      "title": "Multivariate Time-series Forecasting with SPACE: Series Prediction Augmented by Causality Estimation",
      "abstract": "The analysis of multivariate time series (MTS) presents a complex yet crucial task with substantial applications in areas such as weather forecasting, policy formulation, and stock market prediction. It is important to highlight three key characteristics of MTS that contribute to the challenging and multifaceted nature of their analysis: (i) their interrelationships are represented through causal relationships rather than mere similarities; (ii) they convey information across multiple independent factors; and (iii) their dynamics often arise from inherent temporal dependencies. While conventional time series analysis frameworks often fail to capture one or more of these aspects, resulting in incomplete or even misleading conclusions, we propose an end-to-end trainable $\\textbf{S}$eries $\\textbf{P}$rediction model $\\textbf{A}$ugmented by $\\textbf{C}$ausality $\\textbf{E}$stimation (SPACE) to address these limitations. This model effectively incorporates temporal dependencies and causal relationships, featuring a temporal embedding and a transfer entropy-based Cross-TE module designed to enhance predictions through causality-augmented mechanisms. Experiments demonstrate that SPACE achieves state-of-the-art results on challenging real-world time series prediction tasks, showing its effectiveness and versatility.",
      "keywords": [
        "Time Series Forecasting",
        "Causal Learning",
        "Transfer Entropy",
        "Graph Based Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "N8Oj1XhtYZ",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "N8Oj1XhtYZ",
      "title": "SANA: Efficient High-Resolution Text-to-Image Synthesis with Linear Diffusion Transformers",
      "abstract": "We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096$\\times$4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8$\\times$, we trained an AE that can compress images 32$\\times$, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4)  Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024$\\times$1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released upon publication.",
      "keywords": [
        "Efficient AI",
        "Diffusion Models",
        "Text to Image generation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Yd5MHVIKLk",
      "title": "MuLan: Multimodal-LLM Agent for Progressive and Interactive Multi-Object Diffusion",
      "abstract": "Existing text-to-image models still struggle to generate images of multiple objects, especially in handling their spatial positions, relative sizes, overlapping, and attribute bindings. To efficiently address these challenges, we develop a training-free Multimodal-LLM agent (MuLan), as a human painter, that can progressively generate multi-object with intricate planning and feedback control.\nMuLan harnesses a large language model (LLM) to decompose a prompt to a sequence of sub-tasks, each generating only one object by stable diffusion, conditioned on previously generated objects. Unlike existing LLM-grounded methods, MuLan only produces a high-level plan at the beginning while the exact size and location of each object are determined upon each sub-task by an LLM and attention guidance. Moreover, MuLan adopts a vision-language model (VLM) to provide feedback to the image generated in each sub-task and control the diffusion model to re-generate the image if it violates the original prompt. Hence, each model in every step of MuLan only needs to address an easy sub-task it is specialized for. The multi-step process also allows human users to monitor the generation process and make preferred changes at any intermediate step via text prompts, thereby improving the human-AI collaboration experience. We collect 200 prompts containing multi-objects with spatial relationships and attribute bindings from different benchmarks to evaluate MuLan. The results demonstrate the superiority of MuLan in generating multiple objects over baselines and its creativity when collaborating with human users.",
      "keywords": [
        "Diffusion models",
        "Controllable generation",
        "multi-modal agent"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "Y6aHdDNQYD",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "HCJ7B6dhYK",
      "title": "Radon Implicit Field Transform (RIFT): Learning Scenes from Radar Signals",
      "abstract": "Data acquisition in array signal processing (ASP) is costly because achieving high angular and range resolutions necessitates large antenna apertures and wide frequency bandwidths, respectively. The data requirements for ASP problems grow multiplicatively with the number of viewpoints and frequencies, significantly increasing the burden of data collection, even for simulation. Implicit Neural Representations (INRs) — neural network-based models of 3D objects and scenes — offer compact and continuous representations with minimal radar data. They can interpolate to unseen viewpoints and potentially address the sampling cost in ASP problems. In this work, we select Synthetic Aperture Radar (SAR) as a case from ASP and propose the \\textit{\\textbf{R}adon \\textbf{I}mplicit \\textbf{F}ield \\textbf{T}ransform} (RIFT). RIFT consists of two components: a classical forward model for radar (Generalized Radon Transform, GRT), and an INR based scene representation learned from radar signals. This method can be extended to other ASP problems by replacing the GRT with appropriate algorithms corresponding to different data modalities. In our experiments, we first synthesize radar data using the GRT. We then train the INR model on this synthetic data by minimizing the reconstruction error of the radar signal. After training, we render the scene using the trained INR and evaluate our scene representation against the ground truth scene. Due to the lack of existing benchmarks, we introduce two main new error metrics: \\textit{\\textbf{p}hase-\\textbf{R}oot \\textbf{M}ean \\textbf{S}quare \\textbf{E}rror} (p-RMSE) for radar signal interpolation, and \\textit{\\textbf{m}agnitude-\\textbf{S}tructural \\textbf{S}imilarity \\textbf{I}ndex \\textbf{M}easure} (m-SSIM) for scene reconstruction. These metrics adapt traditional error measures to account for the complex nature of radar signals. Compared to traditional scene models in radar signal processing, with only 10\\% data footprint, our RIFT model achieves up to 188\\% improvement in scene reconstruction. Using the same amount of data, RIFT is up to $3\\times$ better at reconstruction and shows a 10\\% improvement generalizing to unseen viewpoints.",
      "keywords": [
        "AI for Science",
        "Representation Learning",
        "Scene Rendering",
        "Implicit Neural Representation",
        "3D Reconstruction",
        "Inverse Problems"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Y6aHdDNQYD",
      "title": "MOS: Model Synergy for Test-Time Adaptation on LiDAR-Based 3D Object Detection",
      "abstract": "LiDAR-based 3D object detection is crucial for various applications but often experiences performance degradation in real-world deployments due to domain shifts. While most studies focus on cross-dataset shifts, such as changes in environments and object geometries, practical corruptions from sensor variations and weather conditions remain underexplored. In this work, we propose a novel online test-time adaptation framework for 3D detectors that effectively tackles these shifts, including a challenging $\\textit{cross-corruption}$ scenario where cross-dataset shifts and corruptions co-occur. By leveraging long-term knowledge from previous test batches, our approach mitigates catastrophic forgetting and adapts effectively to diverse shifts. Specifically, we propose a Model Synergy (MOS) strategy that dynamically selects historical checkpoints with diverse knowledge and assembles them to best accommodate the current test batch. This assembly is directed by our proposed Synergy Weights (SW), which perform a weighted averaging of the selected checkpoints, minimizing redundancy in the composite model. The SWs are computed by evaluating the similarity of predicted bounding boxes on the test data and the independence of features between checkpoint pairs in the model bank. To maintain an efficient and informative model bank, we discard checkpoints with the lowest average SW scores, replacing them with newly updated models. Our method was rigorously tested against existing test-time adaptation strategies across three datasets and eight types of corruptions, demonstrating superior adaptability to dynamic scenes and conditions. Notably, it achieved a 67.3% improvement in a challenging cross-corruption scenario, offering a more comprehensive benchmark for adaptation. Source code: https://github.com/zhuoxiao-chen/MOS.",
      "keywords": [
        "Test-Time Adaptation",
        "3D Object Detection"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "zBbZ2vdLzH",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "Vszt1FDElj",
      "title": "Coarsening to Conceal: Enabling Privacy-Preserving Federated Learning for Graph Data",
      "abstract": "With the escalating demand for privacy-preserving machine learning, federated learning (FL) stands out by enabling collaboration among decentralized entities. Utilizing graph representations of data enhances learning for graph-level tasks, crucial for FL with data distributed across local repositories. Despite its benefits, stringent privacy regulations often compromise FL's performance. Previous methods aimed at ensuring privacy introduce performance degradation and computational overhead. In response to these challenges, we propose using graph coarsening—a simple yet effective method—to enhance the security and privacy of FL on graph data. Our approach posits that graph coarsening alone can suffice for privacy guarantees, as model parameters obtained from training on the coarsened graph effectively conceal sensitive information susceptible to privacy attacks. Through comprehensive application and analysis, we demonstrate the efficacy of graph coarsening within an FL setup, taking both the graph matrix and node features as input, and jointly learning the coarsened graph matrix and feature matrix while ensuring desired properties. The resultant coarsened graph representations are then utilized to train model parameters, subsequently communicated within an FL framework for downstream tasks such as classification. Extensive experimentation across various datasets confirms that graph coarsening ensures privacy while enhancing performance with minimal trade-offs compared to traditional differential privacy (DP) methods without adding extra complexity overhead.",
      "keywords": [
        "Federated Learning",
        "Privacy-Preserving Machine Learning",
        "Graph Neural Networks",
        "Graph Coarsening",
        "Data Privacy and Security"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zBbZ2vdLzH",
      "title": "Joint Graph Rewiring and Feature Denoising via Spectral Resonance",
      "abstract": "When learning from graph data, the graph and the node features both give noisy information about the node labels. In this paper we propose an algorithm to **j**ointly **d**enoise the features and **r**ewire the graph (JDR), which improves the performance of downstream node classification graph neural nets (GNNs). JDR works by aligning the leading spectral spaces of graph and feature matrices. It approximately solves the associated non-convex optimization problem in a way that handles graphs with multiple classes and different levels of homophily or heterophily. We theoretically justify JDR in a stylized setting and show that it consistently outperforms existing rewiring methods on a wide range of synthetic and real-world node classification tasks.",
      "keywords": [
        "GNNs",
        "Rewiring",
        "Denoising",
        "Spectral Resonance",
        "cSBM"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "6EUtjXAvmj",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "6EUtjXAvmj",
      "title": "Variational Diffusion Posterior Sampling with Midpoint Guidance",
      "abstract": "Diffusion models have recently shown considerable potential in solving Bayesian inverse problems when used as priors. However, sampling from the resulting denoising posterior distributions remains a challenge as it involves intractable terms. To tackle this issue, state-of-the-art approaches formulate the problem as that of sampling from a surrogate diffusion model targeting the posterior and decompose its scores into two terms: the prior score and an intractable guidance term. While the former is replaced by the pre-trained score of the considered diffusion model, the guidance term has to be estimated. In this paper, we propose a novel approach that utilises a decomposition of the transitions which, in contrast to previous methods, allows a trade-off between the complexity of the intractable guidance term and that of the prior transitions. We validate the proposed approach through extensive experiments on linear and nonlinear inverse problems, including challenging cases with latent diffusion models as priors, and demonstrate its effectiveness in reconstructing electrocardiogram (ECG) from partial measurements for accurate cardiac diagnosis.",
      "keywords": [
        "Diffusion models",
        "Inverse problems",
        "posterior sampling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Z9Odi09Rv9",
      "title": "Fast and Noise-Robust Diffusion Solvers for Inverse Problems: A Frequentist Approach",
      "abstract": "Diffusion models have been firmly established as principled zero-shot solvers for linear and nonlinear inverse problems, owing to their powerful image prior and ease of formulation as Bayesian posterior samplers. However, many existing solvers struggle in the noisy measurement regime, either overfitting or underfitting to the measurement constraint, resulting in poor sample quality and inconsistent performance across noise levels. Moreover, existing solvers rely on approximating $x_0$ via Tweedie's formula, where an intractable \\textit{conditional} score is replaced by an \\textit{unconditional} score network, introducing a fundamental source of error in the resulting solution. In this work, we propose a novel frequentist's approach to diffusion-based inverse solvers, where each diffusion step can be seen as the maximum likelihood solution to a simple single-parameter conditional likelihood model, derived by an adjusted application of Tweedie's formula to the forward measurement model. We demonstrate that this perspective is not only scalable and fast, but also allows for a noise-aware maximization scheme with a likelihood-based stopping criterion that promotes the proper noise-adapted fit given knowledge of the measurement noise $\\sigma_\\mathbf{y}$. Finally, we demonstrate comparable or improved performance against a wide selection of contemporary inverse solvers across multiple datasets, tasks, and noise levels.",
      "keywords": [
        "diffusion models",
        "inverse problems",
        "maximum likelihood"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "vRvVVb0NAz",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DzKdjWe59v",
      "title": "Hint Marginalization for Improved Reasoning in Large Language Models",
      "abstract": "Large Language Models (LLMs) have exhibited an impressive capability to perform reasoning tasks, especially if they are encouraged to generate a sequence of intermediate steps. Reasoning performance can be improved by suitably combining multiple LLM responses, generated either in parallel in a single query, or via sequential interactions with LLMs throughout the reasoning process. Existing strategies for combination, such as self-consistency and progressive-hint-prompting, make inefficient usage of the LLM responses. We present Hint Marginalization, a novel and principled algorithmic framework to enhance the reasoning capabilities of LLMs. Our approach can be viewed as an iterative sampling strategy for forming a Monte Carlo approximation of an underlying distribution of answers, with the goal of identifying the mode the most likely answer. Empirical evaluation on several benchmark datasets for arithmetic reasoning demonstrates the superiority of the proposed approach.",
      "keywords": [
        "reasoning",
        "large language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vRvVVb0NAz",
      "title": "When is Task Vector Provably Effective for Model Editing? A Generalization Analysis of Nonlinear Transformers",
      "abstract": "Task arithmetic refers to editing the pre-trained model by adding a weighted sum of task vectors, each of which is the weight update from the pre-trained model to fine-tuned models for certain tasks. This approach recently gained attention as a computationally efficient inference method for model editing, e.g., multi-task learning, forgetting, and out-of-domain generalization capabilities. However, the theoretical understanding of why task vectors can execute various conceptual operations remains limited, due to the highly non-convexity of training Transformer-based models. To the best of our knowledge, this paper provides the first theoretical characterization of the generalization guarantees of task vector methods on nonlinear Transformers. We consider a conceptual learning setting, where each task is a binary classification problem based on a discriminative pattern. We theoretically prove the effectiveness of task addition in simultaneously learning a set of irrelevant or aligned tasks, as well as the success of task negation in unlearning one task from irrelevant or contradictory tasks. Moreover, we prove the proper selection of linear coefficients for task arithmetic to achieve guaranteed generalization to out-of-domain tasks. All of our theoretical results hold for both dense-weight parameters and their low-rank approximations. Although established in a conceptual setting, our theoretical findings were validated on a practical machine unlearning task using the large language model Phi-1.5 (1.3B).",
      "keywords": [
        "Task arithmetic",
        "generalization",
        "nonlinear Transformers",
        "deep learning theory",
        "machine unlearning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "je3GZissZc",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "COdUNtjMEp",
      "title": "On the Training Convergence of Transformers for In-Context Classification",
      "abstract": "While transformers have demonstrated impressive capacities for in-context learning (ICL) in practice, theoretical understanding of the underlying mechanism enabling transformers to perform ICL is still in its infant stage. This work aims to theoretically study the training dynamics of transformers for in-context classification tasks. We demonstrate that, for in-context classification of Gaussian mixtures under certain assumptions, a single-layer transformer trained via gradient descent converges to a globally optimal model at a linear rate. We further quantify the impact of the training and testing prompt lengths on the ICL inference error of the trained transformer. We show that when the lengths of training and testing prompts are sufficiently large, the prediction of the trained transformer approaches the Bayes-optimal classifier. Experimental results corroborate the theoretical findings.",
      "keywords": [
        "In-context learning",
        "Transformer"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "je3GZissZc",
      "title": "Instant Policy: In-Context Imitation Learning via Graph Diffusion",
      "abstract": "Following the impressive capabilities of in-context learning with large transformers, In-Context Imitation Learning (ICIL) is a promising opportunity for robotics. We introduce Instant Policy, which learns new tasks instantly from just one or two demonstrations, achieving ICIL through two key components. First, we introduce inductive biases through a graph representation and model ICIL as a graph generation problem using a learned diffusion process, enabling structured reasoning over demonstrations, observations, and actions. Second, we show that such a model can be trained using pseudo-demonstrations – arbitrary trajectories generated in simulation – as a virtually infinite pool of training data. Our experiments, in both simulation and reality, show that Instant Policy enables rapid learning of various everyday robot tasks. We also show how it can serve as a foundation for cross-embodiment and zero-shot transfer to language-defined tasks.",
      "keywords": [
        "In-context Imitation Learning",
        "Robotic Manipulation",
        "Graph Neural Networks",
        "Diffusion Models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "2efNHgYRvM",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2efNHgYRvM",
      "title": "On the Identification of Temporal Causal Representation with Instantaneous Dependence",
      "abstract": "Temporally causal representation learning aims to identify the latent causal process from time series observations, but most methods require the assumption that the latent causal processes do not have instantaneous relations. Although some recent methods achieve identifiability in the instantaneous causality case, they require either interventions on the latent variables or grouping of the observations, which are in general difficult to obtain in real-world scenarios. To fill this gap, we propose an \\textbf{ID}entification framework for instantane\\textbf{O}us \\textbf{L}atent dynamics (\\textbf{IDOL}) by imposing a sparse influence constraint that the latent causal processes have sparse time-delayed and instantaneous relations. Specifically, we establish identifiability results of the latent causal process based on sufficient variability and the sparse influence constraint by employing contextual information of time series data. Based on these theories, we incorporate a temporally variational inference architecture to estimate the latent variables and a gradient-based sparsity regularization to identify the latent causal process. Experimental results on simulation datasets illustrate that our method can identify the latent causal process. Furthermore, evaluations on multiple human motion forecasting benchmarks with instantaneous dependencies indicate the effectiveness of our method in real-world settings.",
      "keywords": [
        "Causal Representation Learning",
        "Instantaneous Dependency",
        "Identification"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Rkpdfia4Sz",
      "title": "Learning Discrete Latent Models from Discrete Observations",
      "abstract": "A central challenge in machine learning is discovering meaningful representations of high-dimensional data, commonly referred to as representation learning. However, many existing methods lack a theoretical foundation, leading to unreliable representations and limited inferential capabilities. In approaches where certain uniqueness of representation is guaranteed, such as nonlinear ICA, variables are typically assumed to be continuous. While recent work has extended identifiability to binarized observed variables, no principled method has been developed for scenarios involving discrete latent variables. In this paper, we show how multi-domain information can be leveraged to achieve identifiability when both latent and observed variables are discrete. We propose general identification conditions that do not depend on specific data distributional assumptions or parametric model forms. The effectiveness of our approach is validated through experiments on both simulated and real-world datasets.",
      "keywords": [
        "Latent Variable Identification",
        "Nonlinear Independent Component Analysis (ICA)"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "EzjsoomYEb",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2MqyCIxLSi",
      "title": "TopoTune: A Framework for Generalized Combinatorial Complex Neural Networks",
      "abstract": "Graph Neural Networks (GNNs) excel in learning from relational datasets, processing node and edge features in a way that preserves the symmetries of the graph domain. However, many complex systems---such as biological or social networks---involve multiway complex interactions that are more naturally represented by higher-order topological domains. The emerging field of Topological Deep Learning (TDL) aims to accommodate and leverage these higher-order structures. Combinatorial Complex Neural Networks (CCNNs), fairly general TDL models, have been shown to be more expressive and better performing than GNNs. However, differently from the graph deep learning ecosystem, TDL lacks a principled and standardized framework for easily defining new architectures, restricting its accessibility and applicability. To address this issue, we introduce Generalized CCNNs (GCCNs), a novel simple yet powerful family of TDL models that can be used to systematically transform any (graph) neural network into its TDL counterpart. We prove that GCCNs generalize and subsume CCNNs, while extensive experiments on a diverse class of GCCNs show that these architectures consistently match or outperform CCNNs, often with less model complexity. In an effort to accelerate and democratize TDL, we introduce TopoTune, a lightweight software for defining, building, and training GCCNs with unprecedented flexibility and ease.",
      "keywords": [
        "Topological Deep Learning",
        "Graph Neural Network",
        "Graph Expansion",
        "Combinatorial Complex",
        "Cellular Complex"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "EzjsoomYEb",
      "title": "Topological Blindspots: Understanding and Extending Topological Deep Learning Through the Lens of Expressivity",
      "abstract": "Topological deep learning (TDL) is a rapidly growing field that seeks to leverage topological structure in data and facilitate learning from data supported on topological objects, ranging from molecules to 3D shapes. Most TDL architectures can be unified under the framework of higher-order message-passing (HOMP), which generalizes graph message-passing to higher-order domains. In the first part of the paper, we explore HOMP's expressive power from a topological perspective, demonstrating the framework's inability to capture fundamental topological and metric invariants such as diameter, orientability, planarity, and homology. In addition, we demonstrate HOMP's limitations in fully leveraging lifting and pooling methods on graphs. To the best of our knowledge, this is the first work to study the expressivity of TDL from a topological perspective. In the second part of the paper, we develop two new classes of architectures -- multi-cellular networks (MCN) and scalable MCN (SMCN) -- which draw inspiration from expressive GNNs. MCN can reach full expressivity, but scaling it to large data objects can be computationally expansive. Designed as a more scalable alternative, SMCN still mitigates many of HOMP's expressivity limitations. Finally, we design new benchmarks for evaluating models based on their ability to learn topological properties of complexes. We then evaluate SMCN on these benchmarks as well as on real-world graph datasets, demonstrating improvements over both HOMP baselines and expressive graph methods, highlighting the value of expressively leveraging topological information.",
      "keywords": [
        "Topological Deep Learning",
        "Message Passing",
        "Higher Order Message Passing",
        "Expressivity",
        "Graph Neural Networks",
        "GNNs",
        "Topology",
        "Homology",
        "Symmetry"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "OwpLQrpdwE",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "4KKqHIb4iG",
      "title": "Backpropagation-free training of neural PDE solvers for time-dependent problems",
      "abstract": "Approximating solutions to time-dependent Partial Differential Equations (PDEs) is one of the most important problems in computational science. Neural PDE solvers have shown promise recently because they are mesh-free and easy to implement. However, backpropagation-based training often leads to poor approximation accuracy and long training time. In particular, capturing high-frequency temporal dynamics and solving over long time spans pose significant challenges. To address these, we present an approach to training neural PDE solvers without backpropagation by integrating two key ideas: separation of space and time variables and random sampling of weights and biases of the hidden layers. We reformulate the PDE as an Ordinary Differential Equation (ODE) using a neural network ansatz, construct neural basis functions only in the spatial domain, and solve the ODE leveraging classical ODE solvers from scientific computing. We demonstrate that our backpropagation-free algorithm outperforms the iterative, gradient-based optimization of physics-informed neural networks with respect to training time and accuracy, often by 1 to 5 orders of magnitude using different complicated PDEs characterized by high-frequency temporal dynamics, long time span, complex spatial domain, non-linearities, shocks, and high dimensionality.",
      "keywords": [
        "neural PDE solvers",
        "time-dependent partial differential equations",
        "random feature networks",
        "backpropagation-free training"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "AoraWUmpLU",
      "title": "Global Convergence in Neural ODEs: Impact of Activation Functions",
      "abstract": "Neural Ordinary Differential Equations (ODEs) have been successful in various applications due to their continuous nature and parameter-sharing efficiency. However, these unique characteristics also introduce challenges in training, particularly with respect to gradient computation accuracy and convergence analysis. In this paper, we address these challenges by investigating the impact of activation functions. We demonstrate that the properties of activation functions—specifically smoothness and nonlinearity—are critical to the training dynamics. Smooth activation functions guarantee globally unique solutions for both forward and backward ODEs, while sufficient nonlinearity is essential for maintaining the spectral properties of the Neural Tangent Kernel (NTK) during training. Together, these properties enable us to establish the global convergence of Neural ODEs under gradient descent in overparameterized regimes. Our theoretical findings are validated by numerical experiments, which not only support our analysis but also provide practical guidelines for scaling Neural ODEs, potentially leading to faster training and improved performance in real-world applications.",
      "keywords": [
        "Neural ODEs",
        "Gradient Descent",
        "Neural Tangent Kernel (NTK)"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "44cMlQSreK",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "UKjAwMzX4m",
      "title": "BCQ: Block Clustered Quantization for 4-bit (W4A4) LLM inference",
      "abstract": "Post-training quantization (PTQ) is a promising approach to reducing the storage and computational requirements of large language models (LLMs) without additional training cost. Recent PTQ studies have primarily focused on quantizing only weights to sub-8-bits while maintaining activations at 8-bits or higher. Accurate sub-8-bit quantization for both weights and activations without relying on quantization-aware training remains a significant challenge. In this work, we introduce a novel quantization method called block clustered quantization (BCQ) wherein each operand tensor is decomposed into blocks (a block is a group of contiguous scalars), blocks are clustered based on their statistics, and a dedicated optimal quantization codebook is designed for each cluster. We propose a PTQ algorithm called Locally-Optimal BCQ (LO-BCQ) that iterates between the steps of block clustering and codebook design to greedily minimize the quantization mean squared error. When weight and activation scalars are encoded to W4A4 format (with 0.5-bits of overhead for storing scaling factors and codebook selectors), we advance the current state-of-the-art by demonstrating <1% loss in inference accuracy across several LLMs and downstream tasks.",
      "keywords": [
        "Post-training Quantization",
        "Large Language Models",
        "Codebooks",
        "Clustering",
        "Block Clustered Quantization"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "kbjJ9ZOakb",
      "title": "Learning and aligning single-neuron invariance manifolds in visual cortex",
      "abstract": "Understanding how sensory neurons exhibit selectivity to certain features and invariance to others is central to uncovering the computational principles underlying robustness and generalization in visual perception. Most existing methods for characterizing selectivity and invariance identify single or finite discrete sets of stimuli. Since these are only isolated measurements from an underlying continuous manifold, characterizing invariance properties accurately and comparing them across neurons with varying receptive field size, position, and orientation, becomes challenging. Consequently, a systematic analysis of invariance types at the population level remains under-explored. Building on recent advances in learning continuous invariance manifolds, we introduce a novel method to accurately identify and align invariance manifolds of visual sensory neurons, overcoming these challenges. Our approach first learns the continuous invariance manifold of stimuli that maximally excite a neuron modeled by a response-predicting deep neural network. It then learns an affine transformation on the pixel coordinates such that the same manifold activates another neuron as strongly as possible, effectively aligning their invariance manifolds spatially. This alignment provides a principled way to quantify and compare neuronal invariances irrespective of receptive field differences. Using simulated neurons, we demonstrate that our method accurately learns and aligns known invariance manifolds, robustly identifying functional clusters. When applied to macaque V1 neurons, it reveals functional clusters of neurons, including simple and complex cells. Overall, our method enables systematic, quantitative exploration of the neural invariance landscape, to gain new insights into the functional properties of visual sensory neurons.",
      "keywords": [
        "neural invariances",
        "invariance manifold",
        "MEI",
        "implicit neural representations",
        "contrastive learning",
        "invariance alignment",
        "clustering",
        "visual cortex",
        "macaque V1",
        "primary visual cortex"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "ZV7CLf0RHK",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "TwJrTz9cRS",
      "title": "HiRA: Parameter-Efficient Hadamard High-Rank Adaptation for Large Language Models",
      "abstract": "We propose Hadamard High-Rank Adaptation (HiRA), a parameter-efficient fine-tuning (PEFT) method that enhances the adaptability of Large Language Models (LLMs). While Low-rank Adaptation (LoRA) is widely used to reduce resource demands, its low-rank updates may limit its expressiveness for new tasks. HiRA addresses this by using a Hadamard product to retain high-rank update parameters, improving the model capacity. Empirically, HiRA outperforms LoRA and its variants on several tasks, with extensive ablation studies validating its effectiveness. Our code is available at https://github.com/hqsiswiliam/hira.",
      "keywords": [
        "Parametric-efficient fine-tuning",
        "Large Language Model"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "d465apqCqc",
      "title": "BA-LoRA: Bias-Alleviating Low-Rank Adaptation to Mitigate Catastrophic Inheritance in Large Language Models",
      "abstract": "Large language models (LLMs) have demonstrated remarkable proficiency across various natural language processing (NLP) tasks. However, adapting LLMs to downstream applications requires computationally intensive and memory-demanding fine-tuning procedures. To alleviate these burdens, parameter-efficient fine-tuning (PEFT) techniques have emerged as a promising approach to tailor LLMs with minimal computational overhead. While PEFT methods offer substantial advantages, they do not fully address the pervasive issue of bias propagation from pre-training data. This work introduces Bias-Alleviating Low-Rank Adaptation (BA-LoRA), a novel PEFT method designed to counteract bias inheritance. BA-LoRA incorporates three distinct regularization terms: (1) a consistency regularizer, (2) a diversity regularizer, and (3) a singular value decomposition regularizer. These regularizers aim to enhance the models' consistency, diversity, and generalization capabilities during fine-tuning. We conduct extensive experiments on natural language understanding (NLU) and natural language generation (NLG) tasks using prominent LLMs such as LLaMA, Mistral, and Gemma. The results demonstrate that BA-LoRA outperforms LoRA and its state-of-the-art variants. Moreover, our method effectively mitigates the adverse effects of pre-training bias, leading to more reliable and robust model outputs.",
      "keywords": [
        "supervised fine-tuning",
        "parameter efficient fine-tuning",
        "bias reduction"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "fZK6AQXlUU",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LxkgScfHKf",
      "title": "Conformal Training with Reduced Variance",
      "abstract": "Conformal prediction (CP) is a distribution-free framework for achieving probabilistic guarantees on black-box models. {CP} is generally applied to a model post-training. Conformal training is an approach that aims to optimize the CP efficiency during training. In this direction, ConfTr (Stutz et al, 2022) is a technique that seeks to minimize the expected prediction set size of a model by simulating {CP} in-between training updates. Despite its potential, we identify a strong source of sample inefficiency in ConfTr that leads to overly noisy estimated gradients, introducing training instability and limiting practical use. To address this challenge, we propose variance-reduced conformal training (VR-ConfTr), a method that incorporates a variance reduction technique in the gradient estimation of the ConfTr objective function. Through extensive experiments on various benchmark datasets, we demonstrate that VR-ConfTr consistently achieves faster convergence and smaller prediction sets compared to baselines.",
      "keywords": [
        "Conformal Training",
        "Conformal Prediction",
        "Optimization",
        "Quantile",
        "Deep Learning",
        "Uncertainty Quantification"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "st77ShxP1K",
      "title": "Do as We Do, Not as You Think: the Conformity of Large Language Models",
      "abstract": "Recent advancements in large language models (LLMs) revolutionize the field of intelligent agents, enabling collaborative multi-agent systems capable of tackling complex problems across various domains. However, the potential of conformity within these systems, analogous to phenomena like conformity bias and group-think in human group dynamics, remains largely unexplored, raising concerns about their collective problem-solving capabilities and possible ethical implications. This paper presents a comprehensive study on conformity in LLM-driven multi-agent systems, focusing on three aspects: the existence of conformity, the factors influencing conformity, and potential mitigation strategies. In particular, we introduce BenchForm, a new conformity-oriented benchmark, featuring reasoning-intensive tasks and five distinct interaction protocols designed to probe LLMs’ behavior in collaborative scenarios. Several representative LLMs are evaluated on BenchForm, using metrics such as conformity rate and independence rate to quantify conformity’s impact. Our analysis delves into factors influencing conformity, including interaction time and majority size, and examines how the subject agent rationalize its conforming behavior. Furthermore, we explore two strategies to mitigate conformity effects, i.e., developing enhanced persona and implementing a reflection mechanism. Several interesting findings regarding LLMs’ conformity are derived from empirical results and case studies. We hope that these insights can pave the way for more robust and ethically-aligned collaborative AI systems. Our benchmark and code are available at BenchForm.",
      "keywords": [
        "Large Language Models",
        "Conformity",
        "Multi-agent System"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "jXvwJ51vcK",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "CRmiX0v16e",
      "title": "Open-YOLO 3D: Towards Fast and Accurate Open-Vocabulary 3D Instance Segmentation",
      "abstract": "Recent works on open-vocabulary 3D instance segmentation show strong promise but at the cost of slow inference speed and high computation requirements. This high computation cost is typically due to their heavy reliance on aggregated clip features from multi-view, which require computationally expensive 2D foundation models like Segment Anything (SAM) and CLIP. Consequently, this hampers their applicability in many real-world applications that require both fast and accurate predictions. To this end, we propose a novel open-vocabulary 3D instance segmentation approach, named Open-YOLO 3D, that efficiently leverages only 2D object detection from multi-view RGB images for open-vocabulary 3D instance segmentation. \n We demonstrate that our proposed Multi-View Prompt Distribution (MVPDist) method makes use of multi-view information to account for misclassification from the object detector to predict a reliable label for 3D instance masks. Furthermore, since projections of 3D object instances are already contained within the 2D bounding boxes, we show that our proposed low granularity label maps, which require only a 2D object detector to construct, are sufficient and very fast to predict prompt IDs for 3D instance masks when used with our proposed MVPDist.\n We validate our Open-YOLO 3D on two benchmarks, ScanNet200 and Replica, \n under two scenarios: (i) with ground truth masks, where labels are required for given object proposals, and (ii) with class-agnostic 3D proposals generated from a 3D proposal network.\n Our Open-YOLO 3D achieves state-of-the-art performance on both datasets while obtaining up to $\\sim$16$\\times$ speedup compared to the best existing method in literature. On ScanNet200 val. set, our Open-YOLO 3D achieves mean average precision (mAP) of 24.7% while operating at 22 seconds per scene. github.com/aminebdj/OpenYOLO3D",
      "keywords": [
        "Open Vocabulary",
        "3D point cloud instance segmentation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "PD8JVDg8mB",
      "title": "Annotation Bootstrapping: Reinforcing Visual Pre-Training using Unlabelled Images",
      "abstract": "A common approach to learning from unlabeled images is to train models to satisfy invariances on these images, such as consistency under augmentations or crops. Despite successes on Imagenet, these approaches struggle to learn from larger uncurated datasets like web crawls or video, where such inductive biases only weakly hold. How can we more effectively learn from broader datasets? Instead of training models to be invariant across views, we study an alternative approach encouraging model representations to be \\textit{predictive} of important semantics of adjacent views of an image. We concurrently train a model to predict semantic annotations from images (generated either self-supervised, or from auxiliary datasets); and bootstrap the model's semantics by predicting, given a cropped view of an image and the coordinates for a nearby crop, the model's annotation distribution for the neighboring view.  A core strength of this approach is the ability to extract information universally from both unlabelled and labelled image data, incorporating captions, bounding boxes, and other annotations when they are present. Our experiments show that annotation propagation improves pre-training on unlabelled datasets in the wild, including video datasets like EpicKitchens, scene datasets like COCO, and uncurated web-scale image datasets like CC12M.",
      "keywords": [
        "visual pretraining",
        "self supervised learning",
        "bootstrapping"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "SG1R2H3fa1",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "0Th6bCZwKt",
      "title": "Gaussian Mixture Models Based Augmentation Enhances GNN Generalization",
      "abstract": "Graph Neural Networks (GNNs) have shown great promise in many learning tasks, notably including node and graph classification, but they face difficulties when tested on new or unseen data. These challenges are exacerbated when training data is limited in size or diversity. To address this issue, we introduce a theoretical framework using Rademacher complexity to compute a regret bound on the generalization error and then characterize the effect of data augmentation. This framework informs the design of GMM-GDA, a new, efficient graph data augmentation (GDA) algorithm leveraging the capability of Gaussian Mixture Models (GMMs) to approximate any distribution. Our approach not only outperforms existing augmentation techniques but also offers improved time complexity, making it highly suitable for real-world applications.",
      "keywords": [
        "Graph Neural Networks",
        "Data Augmentation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zBbZ2vdLzH",
      "title": "Joint Graph Rewiring and Feature Denoising via Spectral Resonance",
      "abstract": "When learning from graph data, the graph and the node features both give noisy information about the node labels. In this paper we propose an algorithm to **j**ointly **d**enoise the features and **r**ewire the graph (JDR), which improves the performance of downstream node classification graph neural nets (GNNs). JDR works by aligning the leading spectral spaces of graph and feature matrices. It approximately solves the associated non-convex optimization problem in a way that handles graphs with multiple classes and different levels of homophily or heterophily. We theoretically justify JDR in a stylized setting and show that it consistently outperforms existing rewiring methods on a wide range of synthetic and real-world node classification tasks.",
      "keywords": [
        "GNNs",
        "Rewiring",
        "Denoising",
        "Spectral Resonance",
        "cSBM"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "DC8bsa9bzY",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "0F1rIKppTf",
      "title": "Through the Looking Glass: Mirror Schrödinger Bridges",
      "abstract": "Resampling from a target measure whose density is unknown is a fundamental problem in mathematical statistics and machine learning. A setting that dominates the machine learning literature consists of learning a map from an easy-to-sample prior, such as the Gaussian distribution, to a target measure. Under this model, samples from the prior are pushed forward to generate a new sample on the target measure, which is often difficult to sample from directly. In this paper, we propose a new model for conditional resampling called mirror Schrödinger bridges. Our key observation is that solving the Schrödinger bridge problem between a distribution and itself provides a natural way to produce new samples from conditional distributions, giving in-distribution variations of an input data point. We show how to efficiently solve this largely overlooked version of the Schrödinger bridge problem. We prove that our proposed method leads to significant algorithmic simplifications over existing alternatives, in addition to providing control over conditioning. Empirically, we demonstrate how these benefits can be leveraged to produce proximal samples in a number of application domains.",
      "keywords": [
        "entropic optimal transport",
        "schrödinger bridge",
        "stochastic differential equations",
        "sampling"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "EUSkm2sVJ6",
      "title": "How much of my dataset did you use? Quantitative Data Usage Inference in Machine Learning",
      "abstract": "How much of my data was used to train a machine learning model? This is a critical question for data owners assessing the risk of unauthorized usage of their data to train models. However, previous work mistakenly treats this as a binary problem—inferring whether all-or-none or any-or-none of the data was used—which is fragile when faced with real, non-binary data usage risks. To address this, we propose a fine-grained analysis called Dataset Usage Cardinality Inference (DUCI), which estimates the exact proportion of data used. Our algorithm, leveraging debiased membership guesses, matches the performance of the optimal MLE approach (with a maximum error <0.1) but with significantly lower (e.g., $300 \\times$ less) computational cost.",
      "keywords": [
        "Machine Learning",
        "Privacy",
        "Dataset Usage Inference",
        "Dataset Ownership",
        "Membership Inference Attack",
        "Dataset Copyright"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "dYTtGFuD3S",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "dYTtGFuD3S",
      "title": "Adaptive Drug Interaction Prediction via Enhanced Graph Representation Learning",
      "abstract": "This paper presents a groundbreaking theoretical framework for drug-drug interaction (DDI) prediction that seamlessly integrates domain adaptation (DA) techniques with advanced mathematical concepts. We introduce GraphPharmNet, a novel architecture that operates on DDI-DA bundles, leveraging gauge-equivariant geometric deep learning to capture the intricate structure of drug interactions across domains. Our approach reformulates the DDI prediction problem using the language of differential geometry, optimal transport, and symplectic geometry, viewing domain adaptation as a Hamiltonian flow on a statistical manifold. We develop a cohomological interpretation of domain invariance, characterizing robust DDI prediction features through the lens of persistent homology and sheaf theory. The domain adaptation process is analyzed using a geometric renormalization group framework, revealing a profound connection between the DDI-DA bundle's geometry and the emergence of domain-invariant predictive features. We further elucidate the spectral properties of the DDI-DA Laplacian, providing insights into the topological stability of domain adaptation in DDI prediction. Extensive experiments on benchmark datasets demonstrate that GraphPharmNet significantly outperforms existing methods, particularly in scenarios with limited data or when transferring knowledge across disparate domains. Our results highlight the power of this unified mathematical framework in capturing complex drug interactions and adapting to new domains, paving the way for more accurate, robust, and interpretable DDI prediction models. This work not only advances the field of computational drug discovery but also establishes a rigorous theoretical foundation for domain adaptation in graph-structured data, with potential applications across a wide range of scientific disciplines. Our anonymous github link: \\textbf{https://anonymous.4open.science/r/GraphPharmNet-C9D9}",
      "keywords": [
        "Domain-Aligned，Transfer Learning，Drug-Target Interaction"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zBbZ2vdLzH",
      "title": "Joint Graph Rewiring and Feature Denoising via Spectral Resonance",
      "abstract": "When learning from graph data, the graph and the node features both give noisy information about the node labels. In this paper we propose an algorithm to **j**ointly **d**enoise the features and **r**ewire the graph (JDR), which improves the performance of downstream node classification graph neural nets (GNNs). JDR works by aligning the leading spectral spaces of graph and feature matrices. It approximately solves the associated non-convex optimization problem in a way that handles graphs with multiple classes and different levels of homophily or heterophily. We theoretically justify JDR in a stylized setting and show that it consistently outperforms existing rewiring methods on a wide range of synthetic and real-world node classification tasks.",
      "keywords": [
        "GNNs",
        "Rewiring",
        "Denoising",
        "Spectral Resonance",
        "cSBM"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "dML3XGvWmy",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "YrycTjllL0",
      "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
      "abstract": "Task automation has been greatly empowered by the recent advances in Large Language Models (LLMs) via Python code, where the tasks range from software engineering development to general-purpose reasoning. While current benchmarks have shown that LLMs can solve tasks using programs like human developers, the majority of their evaluations are limited to short and self-contained algorithmic tasks or standalone function calls. Solving challenging and practical tasks requires the capability of utilizing **diverse function calls as tools** to efficiently implement functionalities like data analysis and web development. In addition, using multiple tools to solve a task needs compositional reasoning by accurately understanding **complex instructions**. Fulfilling both of these characteristics can pose a great challenge for LLMs. To assess how well LLMs can solve challenging and practical tasks via programs, we introduce BigCodeBench, a benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks. To evaluate LLMs rigorously, each task encompasses 5.6 test cases with an average branch coverage of 99%. In addition, we propose a natural-language-oriented variant of BigCodeBench, BigCodeBench-Instruct, that automatically transforms the original docstrings into short instructions containing only essential information. Our extensive evaluation of 60 LLMs shows that **LLMs are not yet capable of following complex instructions to use function calls precisely, with scores up to 60%, significantly lower than the human performance of 97%**. The results underscore the need for further advancements in this area.",
      "keywords": [
        "Code Generation",
        "Tool Use",
        "Instruction Following",
        "Benchmark"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "dML3XGvWmy",
      "title": "Gödel Agent: A Self-Referential Framework Helps for Recursively Self-Improvement",
      "abstract": "The rapid advancement of large language models (LLMs) has significantly enhanced the capabilities of AI-driven agents across various tasks. However, existing agentic systems, whether based on fixed pipeline algorithms or pre-defined meta-learning frameworks, cannot search the whole agent design space due to the restriction of human-designed components, and thus might miss the globally optimal agent design. In this paper, we introduce Gödel Agent, a self-evolving framework inspired by the Gödel machine, enabling agents to recursively improve themselves without relying on predefined routines or fixed optimization algorithms. Gödel Agent leverages LLMs to dynamically modify its own logic and behavior, guided solely by high-level objectives through prompting. Experimental results on mathematical reasoning and complex agent tasks demonstrate that implementation of Gödel Agent can achieve continuous self-improvement, surpassing manually crafted agents in performance, efficiency, and generalizability.",
      "keywords": [
        "Agent",
        "Large Language Model",
        "Reasoning",
        "Self-Improvement"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "zbIS2r0t0F",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "kbjJ9ZOakb",
      "title": "Learning and aligning single-neuron invariance manifolds in visual cortex",
      "abstract": "Understanding how sensory neurons exhibit selectivity to certain features and invariance to others is central to uncovering the computational principles underlying robustness and generalization in visual perception. Most existing methods for characterizing selectivity and invariance identify single or finite discrete sets of stimuli. Since these are only isolated measurements from an underlying continuous manifold, characterizing invariance properties accurately and comparing them across neurons with varying receptive field size, position, and orientation, becomes challenging. Consequently, a systematic analysis of invariance types at the population level remains under-explored. Building on recent advances in learning continuous invariance manifolds, we introduce a novel method to accurately identify and align invariance manifolds of visual sensory neurons, overcoming these challenges. Our approach first learns the continuous invariance manifold of stimuli that maximally excite a neuron modeled by a response-predicting deep neural network. It then learns an affine transformation on the pixel coordinates such that the same manifold activates another neuron as strongly as possible, effectively aligning their invariance manifolds spatially. This alignment provides a principled way to quantify and compare neuronal invariances irrespective of receptive field differences. Using simulated neurons, we demonstrate that our method accurately learns and aligns known invariance manifolds, robustly identifying functional clusters. When applied to macaque V1 neurons, it reveals functional clusters of neurons, including simple and complex cells. Overall, our method enables systematic, quantitative exploration of the neural invariance landscape, to gain new insights into the functional properties of visual sensory neurons.",
      "keywords": [
        "neural invariances",
        "invariance manifold",
        "MEI",
        "implicit neural representations",
        "contrastive learning",
        "invariance alignment",
        "clustering",
        "visual cortex",
        "macaque V1",
        "primary visual cortex"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zbIS2r0t0F",
      "title": "Allostatic Control of Persistent States in Spiking Neural Networks for Perception and Computation",
      "abstract": "We introduce a novel model for updating perceptual beliefs about the environment\nby extending the concept of Allostasis to the control of internal representations.\nAllostasis is a fundamental regulatory mechanism observed in animal physiology\nthat orchestrates responses to maintain a dynamic equilibrium in bodily needs and\ninternal states. In this paper, we focus on an application in numerical cognition,\nwhere a bump of activity in an attractor network is used as a spatial-numerical\nrepresentation. While existing neural networks can maintain persistent states, to\ndate, there is no unified framework for dynamically controlling spatial changes in\nneuronal activity in response to enviromental changes. To address this, we couple\na well-known allostatic microcircuit, the Hammel model, with a ring attractor, re-\nsulting in a Spiking Neural Network architecture that can modulate the location of\nthe bump as a function of some reference input. This localised activity in turn is\nused as a perceptual belief in a simulated subitization task – a quick enumeration\nprocess without counting. We provide a general procedure to fine-tune the model\nand demonstrate the successful control of the bump location. We also study the\nresponse time in the model with respect to changes in parameters and compare\nit with biological data. Finally, we analyze the dynamics of the network to un-\nderstand the selectivity and specificity of different neurons to different categories\npresent in the input. The results of this paper, particularly the mechanism for mov-\ning persistent states, are not limited to numerical cognition but can be applied to a\nwide range of tasks involving similar representations.",
      "keywords": [
        "Allostatic",
        "Dynamic",
        "Attractors"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "Thnk4ez3wN",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "SctfBCLmWo",
      "title": "A Decade's Battle on Dataset Bias: Are We There Yet?",
      "abstract": "We revisit the ``dataset classification'' experiment suggested by Torralba & Efros (2011) a decade ago, in the new era with large-scale, diverse, and hopefully less biased datasets as well as more capable neural network architectures. Surprisingly, we observe that modern neural networks can achieve excellent accuracy in classifying which dataset an image is from: e.g., we report 84.7% accuracy on held-out validation data for the three-way classification problem consisting of the YFCC, CC, and DataComp datasets. Our further experiments show that such a dataset classifier could learn semantic features that are generalizable and transferable, which cannot be explained by memorization. We hope our discovery will inspire the community to rethink issues involving dataset bias.",
      "keywords": [
        "Vision datasets",
        "Dataset bias",
        "Deep learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Thnk4ez3wN",
      "title": "On Learning Representations for Tabular Dataset Distillation",
      "abstract": "Dataset distillation generates a small set of information-rich instances from a large dataset, resulting in reduced storage requirements, privacy or copyright risks, and computational costs for downstream modeling, though much of the research has focused on the image data modality. We study tabular data distillation, which brings in novel challenges such as the inherent feature heterogeneity and the common use of non-differentiable learning models (such as decision tree ensembles and nearest-neighbor predictors). To mitigate these challenges, we present TDColER, a tabular data distillation framework via column embeddings-based representation learning. To evaluate this framework, we also present a tabular data distillation benchmark, TDBench. Based on an elaborate evaluation on TDBench, resulting in 226,200 distilled datasets and 541,980 models trained on them, we demonstrate that TDColER is able to boost the distilled data quality of off-the-shelf distillation schemes by 0.5-143% across 7 different tabular learning models.",
      "keywords": [
        "Dataset Distillation",
        "Tabular Data",
        "Representation Learning",
        "Autoencoders"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "GqGoa44obw",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FpiCLJrSW8",
      "title": "More RLHF, More Trust? On The Impact of Preference Alignment On Trustworthiness",
      "abstract": "The trustworthiness of Large Language Models (LLMs) refers to the extent to which their outputs are reliable, safe, and ethically aligned, and it has become a crucial consideration alongside their cognitive performance. In practice, Reinforcement Learning From Human Feedback (RLHF) has been widely used to align LLMs with labeled human preferences, but its assumed effect on model trustworthiness hasn't been rigorously evaluated. To bridge this knowledge gap, this study investigates how models aligned with general-purpose preference data perform across five trustworthiness verticals: toxicity, stereotypical bias, machine ethics, truthfulness, and privacy. Our results demonstrate that RLHF on human preferences doesn't automatically guarantee trustworthiness, and reverse effects are often observed. Furthermore, we propose to adapt efficient influence function based data attribution methods to the RLHF setting to better understand the influence of fine-tuning data on individual trustworthiness benchmarks, and show its feasibility by providing our estimated attribution scores. Together, our results underscore the need for more nuanced approaches for model alignment from both the data and framework perspectives, and we hope this research will guide the community towards developing language models that are increasingly capable without sacrificing trustworthiness.",
      "keywords": [
        "Large Language Model",
        "Trustworthy ML",
        "Data Attribution"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "GqGoa44obw",
      "title": "RLHF with Inconsistent Multi-Agent Feedback Under General Function Approximation: A Theoretical Perspective",
      "abstract": "Reinforcement learning from human feedback (RLHF) has been widely studied, as a method for leveraging feedback from human evaluators to guide the learning process. However, existing theoretical analyses typically assume that the human feedback is generated by the ground-truth reward function. This may not be true in practice, because the reward functions in human minds for providing feedback are usually different from the ground-truth reward function, e.g., due to diverse personal experiences and inherent biases. Such inconsistencies could lead to undesirable outcomes when applying existing algorithms, particularly when considering feedback from heterogeneous agents. Therefore, in this paper, we make the first effort to investigate a more practical and general setting of RLHF, where feedback could be generated by multiple agents with reward functions differing from the ground truth. To address this challenge, we develop a new algorithm with novel ideas for handling inconsistent multi-agent feedback, including a Steiner-Point-based confidence set to exploit the benefits of *multi-agent* feedback and a new weighted importance sampling method to manage complexity issues arising from *inconsistency*. Our theoretical analysis develops new methods to demonstrate the optimality of our algorithm. This result is the first of its kind to demonstrate the fundamental impact and potential of inconsistent multi-agent feedback in RLHF.",
      "keywords": [
        "RLHF theory",
        "inconsistent multi-agent feedback",
        "regret analysis"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "pwNIOcr8fU",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "N8Oj1XhtYZ",
      "title": "SANA: Efficient High-Resolution Text-to-Image Synthesis with Linear Diffusion Transformers",
      "abstract": "We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096$\\times$4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8$\\times$, we trained an AE that can compress images 32$\\times$, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4)  Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024$\\times$1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released upon publication.",
      "keywords": [
        "Efficient AI",
        "Diffusion Models",
        "Text to Image generation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "pwNIOcr8fU",
      "title": "Towards Syn-to-Real IQA: A Novel Perspective on Reshaping Synthetic Data Distributions",
      "abstract": "Blind Image Quality Assessment (BIQA) has advanced significantly through deep learning, but the scarcity of large-scale labeled datasets remains a challenge. While synthetic data offers a promising solution, models trained on existing synthetic datasets often show limited generalization ability. In this work, we make a key observation that representations learned from synthetic datasets often exhibit a discrete and clustered pattern that hinders regression performance: features of high-quality images cluster around reference images, while those of low-quality images cluster based on distortion types. Our analysis reveals that this issue stems from the distribution of synthetic data rather than model architecture. Consequently, we introduce a novel framework SynDR-IQA, which reshapes synthetic data distribution to enhance BIQA generalization. Based on theoretical derivations of sample diversity and redundancy's impact on generalization error, SynDR-IQA employs two strategies: distribution-aware diverse content upsampling, which enhances visual diversity while preserving content distribution, and density-aware redundant cluster downsampling, which balances samples by reducing the density of densely clustered areas. Extensive experiments across three cross-dataset settings (synthetic-to-authentic, synthetic-to-algorithmic, and synthetic-to-synthetic) demonstrate the effectiveness of our method. Additionally, as a data-based approach, SynDR-IQA can be coupled with model-based methods without increasing inference costs. The source code will be publicly available.",
      "keywords": [
        "Blind Image Quality Assessment; Data Distribution Reshaping; Synthetic Data"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "viQ1bLqKY0",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "YrycTjllL0",
      "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
      "abstract": "Task automation has been greatly empowered by the recent advances in Large Language Models (LLMs) via Python code, where the tasks range from software engineering development to general-purpose reasoning. While current benchmarks have shown that LLMs can solve tasks using programs like human developers, the majority of their evaluations are limited to short and self-contained algorithmic tasks or standalone function calls. Solving challenging and practical tasks requires the capability of utilizing **diverse function calls as tools** to efficiently implement functionalities like data analysis and web development. In addition, using multiple tools to solve a task needs compositional reasoning by accurately understanding **complex instructions**. Fulfilling both of these characteristics can pose a great challenge for LLMs. To assess how well LLMs can solve challenging and practical tasks via programs, we introduce BigCodeBench, a benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks. To evaluate LLMs rigorously, each task encompasses 5.6 test cases with an average branch coverage of 99%. In addition, we propose a natural-language-oriented variant of BigCodeBench, BigCodeBench-Instruct, that automatically transforms the original docstrings into short instructions containing only essential information. Our extensive evaluation of 60 LLMs shows that **LLMs are not yet capable of following complex instructions to use function calls precisely, with scores up to 60%, significantly lower than the human performance of 97%**. The results underscore the need for further advancements in this area.",
      "keywords": [
        "Code Generation",
        "Tool Use",
        "Instruction Following",
        "Benchmark"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "viQ1bLqKY0",
      "title": "EXecution-Eval: Can language models execute real-world code?",
      "abstract": "As Large Language Models (LLMs) advance, traditional benchmarks face challenges of dataset saturation and disconnection from real-world performance, limiting our understanding of true model capabilities. We introduce EXecution-Eval (EXE), a benchmark designed to assess LLMs' ability to execute code and predict program states. EXE attempts to address key limitations in existing evaluations: difficulty scaling, task diversity, training data contamination, and cost-effective scalability.\nComprising over 30,000 tasks derived from 1,000 popular Python repositories on GitHub, EXE spans a range of context lengths and algorithmic complexities. Tasks require models to execute code, necessitating various operations including mathematical reasoning, logical inference, bit manipulation, string operations, loop execution, and maintaining multiple internal variable states during computation. Our methodology involves: (a) selecting and preprocessing GitHub repositories, (b) generating diverse inputs for functions, (c) executing code to obtain ground truth outputs, and (d) formulating tasks that require models to reason about code execution. This approach allows for continuous new task generation for as few as 1,200 tokens, significantly reducing the risk of models \"training on the test set.\"\nWe evaluate several state-of-the-art LLMs on EXE, revealing insights into their code comprehension and execution capabilities. Our results show that even the best-performing models struggle with complex, multi-step execution tasks, highlighting specific computational concepts that pose the greatest challenges for today's LLMs. Furthermore, we review EXE's potential for finding and predicting errors to aid in assessing a model's cybersecurity capabilities. We propose EXE as a sustainable and challenging testbed for evaluating frontier models, offering potential insights into their internal mechanistic advancement",
      "keywords": [
        "large language model",
        "evaluation",
        "benchmark",
        "code execution"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "TLgDQ0Rr2Z",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "TLgDQ0Rr2Z",
      "title": "Principle Counterfactual Fairness",
      "abstract": "Fairness in human and algorithmic decision-making is crucial in areas such as criminal justice, education, and social welfare. Recently, counterfactual fairness has drawn increasing research interest, suggesting that decision-making for individuals should remain the same when intervening with different values on the protected attributes. Nevertheless, the question of \"which attributes and individuals should be protected\" is rarely discussed in the existing counterfactual fairness literature. For example, when considering leg disability as a protected attribute, the algorithms should not treat individuals with leg disabilities differently in college admissions, but one may naturally take into this factor for the purpose of selecting runner athletes. In other words, when and how to enforce fairness is expected to depend on the causal relation between the protected attribute and the outcome of interest. Formally, this paper proposes principal counterfactual fairness using the concept of principal stratification from the causal inference literature, focusing on whether an algorithm is counterfactually fair for individuals whose protected attribute has no individual causal effect on the outcome of interest. To examine whether an algorithm satisfies principal counterfactual fairness, we derive the statistical bounds, and propose a post-processing approach to achieving principal counterfactual fairness with minimal individual decision changes. Experiments are conducted using synthetic and real-world datasets to verify the effectiveness of our methods.",
      "keywords": [
        "Counterfactual Fairness"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "stUKwWBuBm",
      "title": "Tractable Multi-Agent Reinforcement Learning through Behavioral Economics",
      "abstract": "A significant roadblock to the development of principled multi-agent reinforcement learning (MARL) algorithms is the fact that desired solution concepts like Nash equilibria may be intractable to compute. We show how one can overcome this obstacle by introducing concepts from behavioral economics into MARL. To do so, we imbue agents with two key features of human decision-making: risk aversion and bounded rationality. We show that introducing these two properties into games gives rise to a class of equilibria---risk-averse quantal response equilibria (RQE)---which are tractable to compute in \\emph{all} $n$-player matrix and finite-horizon Markov games.  In particular, we show that they emerge as the endpoint of no-regret learning in suitably adjusted versions of the games. Crucially, the class of computationally tractable RQE is independent of the underlying game structure and only depends on agents' degrees of risk-aversion and bounded rationality.  To validate the expressivity of this class of solution concepts we show that it captures peoples' patterns of play in a number of 2-player matrix games previously studied in experimental economics. Furthermore, we give a first analysis of the sample complexity of computing these equilibria in finite-horizon Markov games when one has access to a generative model. We validate our findings on a simple multi-agent reinforcement learning benchmark. Our results open the doors for to the principled development of new decentralized multi-agent reinforcement learning algorithms.",
      "keywords": [
        "behavioral economics",
        "risk-aversion",
        "multi-agent reinforcement learning",
        "quantal response",
        "bounded rationality"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "VRlihVklCL",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "QWunLKbBGF",
      "title": "Do LLMs Recognize Your Preferences? Evaluating Personalized Preference Following in LLMs",
      "abstract": "Large Language Models (LLMs) are increasingly deployed as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in long-context conversational setting.\nPrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. PrefEval contains user personalization or preference information in both explicit and implicit preference forms, and evaluates LLM performance using a generation and a classification task. With PrefEval, we have evaluated 10 open-sourced and\nproprietary LLMs in multi-session conversations with varying context lengths up to 100k tokens. We benchmark with various prompting, iterative feedback, and retrieval-augmented generation methods. \nOur benchmarking effort reveals that state-of-the-art LLMs face significant challenges in following users' preference during conversations. In particular,  in zero-shot settings, preference following accuracy falls below 10\\% at merely 10 turns (~3k tokens) across most evaluated models. Even with advanced prompting and retrieval methods, preference following still deteriorates in long-context conversations. Furthermore, we show that fine-tuning on PrefEval significantly improves performance. We believe PrefEval serves as a valuable resource for measuring, understanding, and enhancing LLMs' proactive preference following abilities, paving the way for personalized conversational agents.",
      "keywords": [
        "personalization",
        "benchmark",
        "Large language models",
        "conversational llm",
        "chatbots"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "VRlihVklCL",
      "title": "MaD-Scientist: AI-based Scientist solving Convection-Diffusion-Reaction Equations Using Massive PINN-Based Prior Data",
      "abstract": "Large language models (LLMs), like ChatGPT, have shown that even trained with noisy prior data,  they can generalize effectively to new tasks through in-context learning (ICL) and pre-training techniques.\nMotivated by this, we explore whether a similar approach can be applied to scientific foundation models (SFMs). Our methodology is structured as follows: (i) we collect low-cost physics-informed neural network (PINN)-based approximated prior data in the form of solutions to partial differential equations (PDEs) constructed through an arbitrary linear combination of mathematical dictionaries; (ii) we utilize Transformer architectures with self and cross-attention mechanisms to predict PDE solutions without knowledge of the governing equations in a zero-shot setting; (iii) we provide experimental evidence on the one-dimensional convection-diffusion-reaction equation, which demonstrate that pre-training remains robust even with approximated prior data, with only marginal impacts on test accuracy. Notably, this finding opens the path to pre-training SFMs with realistic, low-cost data instead of (or in conjunction with) numerical high-cost data. These results support the conjecture that SFMs can improve in a manner similar to LLMs, where fully cleaning the vast set of sentences crawled from the Internet is nearly impossible.",
      "keywords": [
        "in-context learning",
        "scientific foundation model",
        "zero-shot",
        "PINN-prior"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "RcNzwKrjTo",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "RcNzwKrjTo",
      "title": "Conformal Prediction Sets with Improved Conditional Coverage using Trust Scores",
      "abstract": "Standard conformal prediction offers a marginal guarantee on coverage, but for prediction sets to be truly useful, they should ideally ensure coverage conditional on each test point. However, it is impossible to achieve exact, distribution-free conditional coverage in finite samples. In this work, we propose an alternative conformal prediction algorithm that targets coverage where it matters most---in instances where a classifier is overconfident in its incorrect predictions. We start by dissecting miscoverage events in marginally-valid conformal prediction, and show that miscoverage rates vary based on the classifier's confidence and its deviation from the Bayes optimal classifier. Motivated by this insight, we develop a variant of conformal prediction that targets coverage conditional on a reduced set of two variables: the classifier's confidence in a prediction and a nonparametric trust score that measures its deviation from the Bayes classifier. Empirical evaluation on multiple image datasets shows that our method generally improves conditional coverage properties compared to standard conformal prediction, including class-conditional coverage, coverage over arbitrary subgroups, and coverage over demographic groups.",
      "keywords": [
        "uncertainty quantification",
        "conformal prediction",
        "conditional guarantees"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "UHPnqSTBPO",
      "title": "Trust or Escalate: LLM Judges with Provable Guarantees for Human Agreement",
      "abstract": "We present a principled approach to provide LLM-based evaluation with a rigorous guarantee of human agreement. We first propose that a reliable evaluation method should not uncritically rely on model preferences for pairwise evaluation, but rather assess the confidence of judge models and selectively decide when to trust its judgement. We then show that under this *selective evaluation* framework, human agreement can be provably guaranteed---such that the model evaluation aligns with that of humans to a user-specified agreement level. As part of our framework, we also introduce *Simulated Annotators*, a novel confidence estimation method that significantly improves judge calibration and thus enables high coverage of evaluated instances. Finally, we propose *Cascaded Selective Evaluation*, where we use cheaper models as initial judges and escalate to stronger models only when necessary---again, while still providing a provable guarantee of human agreement. Experimental results show that Cascaded Selective Evaluation guarantees strong alignment with humans, far beyond what LLM judges could achieve without selective evaluation. For example, on a subset of Chatbot Arena where GPT-4 almost never achieves 80% human agreement, our method, even while employing substantially cost-effective models such as Mistral-7B, *guarantees* over 80% human agreement with almost 80% test coverage.",
      "keywords": [
        "Large Language Model",
        "LLM",
        "LLM Judge",
        "Evaluation",
        "Alignment"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "uaKBM9sGEm",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "uaKBM9sGEm",
      "title": "Towards Off-Road Autonomous Driving via Planner Guided Policy Optimization",
      "abstract": "Off-road autonomous driving poses significant challenges such as navigating diverse terrains, avoiding obstacles, and maneuvering through ditches. Addressing these challenges requires effective planning and adaptability, making it a long-horizon planning and control problem. Traditional model-based control techniques like Model Predictive Path Integral (MPPI) require dense sampling and accurate modeling of the vehicle-terrain interaction, both of which are computationally expensive, making effective long-horizon planning in real-time intractable. Reinforcement learning (RL) methods operate without this limitation and are computationally cheaper at deployment. However, exploration in obstacle-dense and challenging terrains is difficult, and typical RL techniques struggle to navigate in these terrains. To alleviate the limitations of MPPI, we propose a hierarchical autonomy pipeline with a low-frequency high-level MPPI planner and a high-frequency low-level RL controller. To tackle RL's exploration challenge, we propose a teacher-student paradigm to learn an end-to-end RL policy, capable of real-time execution and traversal through challenging terrains. The teacher policy is trained using dense planning information from an MPPI planner while the student policy learns to navigate using visual inputs and sparse planning information. In this framework, we introduce a new policy gradient formulation that extends Proximal Policy Optimization (PPO), leveraging off-policy trajectories for teacher guidance and on-policy trajectories for student exploration. We demonstrate our performance in a realistic off-road simulator against various RL and imitation learning methods.",
      "keywords": [
        "Reinforcement learning",
        "Learning from Demonstrations",
        "Autonomous driving",
        "Off-road driving"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "wM2sfVgMDH",
      "title": "Diffusion-Based Planning for Autonomous Driving with Flexible Guidance",
      "abstract": "Achieving human-like driving behaviors in complex open-world environments is a critical challenge in autonomous driving. Contemporary learning-based planning approaches such as imitation learning methods often struggle to balance competing objectives and lack of safety assurance,due to limited adaptability and inadequacy in learning complex multi-modal behaviors commonly exhibited in human planning, not to mention their strong reliance on the fallback strategy with predefined rules. We propose a novel transformer-based Diffusion Planner for closed-loop planning, which can effectively model multi-modal driving behavior and ensure trajectory quality without any rule-based refinement. Our model supports joint modeling of both prediction and planning tasks under the same architecture, enabling cooperative behaviors between vehicles. Moreover, by learning the gradient of the trajectory score function and employing a flexible classifier guidance mechanism, Diffusion Planner effectively achieves safe and adaptable planning behaviors. Evaluations on the large-scale real-world autonomous planning benchmark nuPlan and our newly collected 200-hour delivery-vehicle driving dataset demonstrate that Diffusion Planner achieves state-of-the-art closed-loop performance with robust transferability in diverse driving styles.",
      "keywords": [
        "diffusion planning",
        "autonomous driving"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "TkbjqexD8w",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "1CLzLXSFNn",
      "title": "TimeMixer++: A General Time Series Pattern Machine for Universal Predictive Analysis",
      "abstract": "Time series analysis plays a critical role in numerous applications, supporting tasks such as forecasting, classification, anomaly detection, and imputation. In this work, we present the time series pattern machine (TSPM), a model designed to excel in a broad range of time series tasks through powerful representation and pattern extraction capabilities. Traditional time series models often struggle to capture universal patterns, limiting their effectiveness across diverse tasks. To address this, we define multiple scales in the time domain and various resolutions in the frequency domain, employing various mixing strategies to extract intricate, task-adaptive time series patterns. Specifically, we introduce TimeMixer++, a general-purpose TSPM that processes multi-scale time series using (1) multi-resolution time imaging (MRTI), (2) time image decomposition (TID), (3) multi-scale mixing (MCM), and (4) multi-resolution mixing (MRM) to extract comprehensive temporal patterns. MRTI transforms multi-scale time series into multi-resolution time images, capturing patterns across both temporal and frequency domains. TID leverages dual-axis attention to extract seasonal and trend patterns, while MCM hierarchically aggregates these patterns across scales. MRM adaptively integrates all representations across resolutions. TimeMixer++ achieves state-of-the-art performance across 8 time series analytical tasks, consistently surpassing both general-purpose and task-specific models. Our work marks a promising step toward the next generation of TSPMs, paving the way for further advancements in time series analysis.",
      "keywords": [
        "time series",
        "pattern machine",
        "predictive analysis"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "TkbjqexD8w",
      "title": "Invariant Spatiotemporal Representation Learning for Cross-patient Seizure Classification",
      "abstract": "Automatic seizure type classification from electroencephalogram (EEG) data can help clinicians to better diagnose epilepsy. Although many previous studies have focused on the classification problem of seizure EEG data, most of these methods require that there is no distribution shift between training data and test data, which greatly limits the applicability in real-world scenarios. In this paper, we propose an invariant spatiotemporal representation learning method for cross-patient seizure classification. Specifically, we first split the spatiotemporal EEG data into different environments based on heterogeneous risk minimization to reflect the spurious correlations. We then learn invariant spatiotemporal representations and train the seizure classification model based on the learned representations to achieve accurate seizure-type classification across various environments. The experiments are conducted on the largest public EEG dataset, the Temple University Hospital Seizure Corpus (TUSZ) dataset, and the experimental results demonstrate the effectiveness of our method.",
      "keywords": [
        "electroencephalogram data",
        "spatiotemporal data",
        "invariant representation learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "BOQpRtI4F5",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "OIvg3MqWX2",
      "title": "A Theoretically-Principled Sparse, Connected, and Rigid Graph Representation of Molecules",
      "abstract": "Graph neural networks (GNNs) -- learn graph representations by exploiting the graph's sparsity, connectivity, and symmetries -- have become indispensable for learning geometric data like molecules. However, the most used graphs (e.g., radial cutoff graphs) in molecular modeling lack theoretical guarantees for achieving connectivity and sparsity simultaneously, which are essential for the performance and scalability of GNNs. Furthermore, existing widely used graph construction methods for molecules lack rigidity, limiting GNNs' ability to exploit graph nodes' spatial arrangement. In this paper, we introduce a new hyperparameter-free graph construction of molecules and beyond with sparsity, connectivity, and rigidity guarantees. Remarkably, our method consistently generates connected and sparse graphs with the edge-to-node ratio being bounded above by 3. Our graphs' rigidity guarantees that edge distances and dihedral angles are sufficient to uniquely determine the general spatial arrangements of atoms. We substantiate the effectiveness and efficiency of our proposed graphs in various molecular modeling benchmarks. Code is available at https://github.com/shihhsinwang0214/SCHull.",
      "keywords": [
        "Graph representation",
        "sparsity",
        "connectivity",
        "rigidity",
        "molecules",
        "learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "TVwD2zIQ1F",
      "title": "Provable Robustness of (Graph) Neural Networks Against Data Poisoning and Backdoors",
      "abstract": "Generalization of machine learning models can be severely compromised by data poisoning, where adversarial changes are applied to the training data. This vulnerability has led to interest in certifying (i.e., proving) that such changes up to a certain magnitude do not affect test predictions. We, for the first time, certify Graph Neural Networks (GNNs) against poisoning attacks, including backdoors, targeting the node features of a given graph. Our certificates are white-box and based upon (i) the neural tangent kernel, which characterizes the training dynamics of sufficiently wide networks; and (ii) a novel reformulation of the bilevel optimization problem describing poisoning as a mixed-integer linear program. Consequently, we leverage our framework to provide fundamental insights into the role of graph structure and its connectivity on the worst-case robustness behavior of convolution-based and PageRank-based GNNs. We note that our framework is more general and constitutes the first approach to derive white-box poisoning certificates for NNs, which can be of independent interest beyond graph-related tasks.",
      "keywords": [
        "graph neural networks",
        "provable robustness",
        "certificates",
        "poisoning",
        "data poisoning",
        "backdoor attacks",
        "neural tangent kernel",
        "adversarial robustness",
        "mixed-integer linear programming",
        "support vector machines"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "cFu7ze7xUm",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LlE61BEYpB",
      "title": "FLARE: Fine-tuned Long-context Acceleration with ReLU-enhanced FIRE",
      "abstract": "Deploying large language models (LLMs) on resource-constrained edge devices is challenging due to computational bottlenecks, memory bottlenecks, and -- for long-contexts -- specifically the Softmax operation in the attention mechanism. While using ReLU in place of Softmax has been explored, and FIRE as an alternative to RoPE has been explored for models trained from scratch, there has been little work towards exploring fine-tuning models to utilize these efficient algorithms, or the combination of the two.\n\nIn this paper, we contribute FLARE, a method for fusing Rectified Linear Activations (ReLU) with Relative Encodings (specifically FIRE), and we share a particular recipe which allows these to be fine-tuned effectively into existing models and fused to create efficient long-context inference. Following this recipe yields markedly better validation loss, long-context inference speed, and successfully introduces the property of length-generalization -- the property where the model gains high accuracy for contexts lengths several times larger than trained -- unlike RoPE -- without further fine-tuning.   \n\nOnce FIRE and ReLU are both fine-tuned into a model, we show these can be mathematically fused into a single, more efficient operation, which on average was found to shave 98.9\\% of FIRE operations and produce a Probability matrix with 98.9\\% zeros in its lower-triangle.\n\nFinally, we benchmark inference speed improvements for custom hardware as well with custom CUDA kernels. Using Power, Performance, and Area (PPA) analysis, we show that FLARE operates at eight times the frequency of Softmax while consuming only 0.1\\% of the power and 0.11\\% of the energy per cycle. Our custom CUDA Kernel shows 3.8x faster operation than Softmax FlashAttention. We believe this shows the potential of fine-tuning new algorithms in pre-trained models, and we share our fine-tuning recipes, code and custom hardware designs at \\url{https://anonymous.4open.science/r/nanoGPTBD54}.",
      "keywords": [
        "FIRE",
        "Functional Interpolation for Relative Position Encoding",
        "fine-tune",
        "fine-tuning",
        "ReLU",
        "Softmax",
        "Softplus",
        "Softmax alternatives",
        "long context",
        "transformer",
        "large language model",
        "edge device",
        "Flash Attention"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "QWunLKbBGF",
      "title": "Do LLMs Recognize Your Preferences? Evaluating Personalized Preference Following in LLMs",
      "abstract": "Large Language Models (LLMs) are increasingly deployed as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in long-context conversational setting.\nPrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. PrefEval contains user personalization or preference information in both explicit and implicit preference forms, and evaluates LLM performance using a generation and a classification task. With PrefEval, we have evaluated 10 open-sourced and\nproprietary LLMs in multi-session conversations with varying context lengths up to 100k tokens. We benchmark with various prompting, iterative feedback, and retrieval-augmented generation methods. \nOur benchmarking effort reveals that state-of-the-art LLMs face significant challenges in following users' preference during conversations. In particular,  in zero-shot settings, preference following accuracy falls below 10\\% at merely 10 turns (~3k tokens) across most evaluated models. Even with advanced prompting and retrieval methods, preference following still deteriorates in long-context conversations. Furthermore, we show that fine-tuning on PrefEval significantly improves performance. We believe PrefEval serves as a valuable resource for measuring, understanding, and enhancing LLMs' proactive preference following abilities, paving the way for personalized conversational agents.",
      "keywords": [
        "personalization",
        "benchmark",
        "Large language models",
        "conversational llm",
        "chatbots"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "09FiNmvNMw",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8QTpYC4smR",
      "title": "Systematic Review of Large Language Models: Applications, Limitations, Practical Usages and Future Directions",
      "abstract": "Large Language Models have revolutionized natural language processing with their remarkable ability to understand and generate human-like text. This review explores the various applications of large language models, highlighting their versatility across different domains. The paper begins with an introduction to LLMs, followed by an overview of their types and a detailed literature review. We then examine their limitations before delving into specific applications such as text generation, translation, summarization, and more. Finally, we discuss future directions for research and development, concluding with a summary of key findings and the potential impact of large language models on various industries.",
      "keywords": [
        "Large Language Models",
        "Systematic Review"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YUYJsHOf3c",
      "title": "ReGenesis: LLMs can Grow into Reasoning Generalists via Self-Improvement",
      "abstract": "Post-training Large Language Models (LLMs) with explicit reasoning trajectories can enhance their reasoning abilities. However, acquiring such high-quality trajectory data typically demands meticulous supervision from humans or superior models, which can be either expensive or license-constrained. In this paper, we explore how far an LLM can improve its reasoning by self-synthesizing reasoning paths as training data without any additional supervision. Existing self-synthesizing methods, such as STaR, suffer from poor generalization to out-of-domain (OOD) reasoning tasks. We hypothesize it is due to that their self-synthesized reasoning paths are too task-specific, lacking general task-agnostic reasoning guidance. To address this, we propose **Reasoning Generalist via Self-Improvement (ReGenesis)**, a method to *self-synthesize reasoning paths as post-training data by progressing from abstract to concrete*. More specifically, ReGenesis self-synthesizes reasoning paths by converting general reasoning guidelines into task-specific ones, generating reasoning structures, and subsequently transforming these structures into reasoning paths, without the need for human-designed task-specific examples used in existing methods. We show that ReGenesis achieves superior performance on all in-domain and OOD settings tested compared to existing methods. For six OOD tasks specifically, while previous methods exhibited an average performance decrease of approximately 4.6% after post training, ReGenesis delivers around 6.1% performance improvement. We also conduct an in-depth analysis of our framework and show ReGenesis is effective across various language models and design choices.",
      "keywords": [
        "LLM",
        "reasoning",
        "generalization",
        "self-improvement"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "N5fVv6PZGz",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "QWunLKbBGF",
      "title": "Do LLMs Recognize Your Preferences? Evaluating Personalized Preference Following in LLMs",
      "abstract": "Large Language Models (LLMs) are increasingly deployed as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in long-context conversational setting.\nPrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. PrefEval contains user personalization or preference information in both explicit and implicit preference forms, and evaluates LLM performance using a generation and a classification task. With PrefEval, we have evaluated 10 open-sourced and\nproprietary LLMs in multi-session conversations with varying context lengths up to 100k tokens. We benchmark with various prompting, iterative feedback, and retrieval-augmented generation methods. \nOur benchmarking effort reveals that state-of-the-art LLMs face significant challenges in following users' preference during conversations. In particular,  in zero-shot settings, preference following accuracy falls below 10\\% at merely 10 turns (~3k tokens) across most evaluated models. Even with advanced prompting and retrieval methods, preference following still deteriorates in long-context conversations. Furthermore, we show that fine-tuning on PrefEval significantly improves performance. We believe PrefEval serves as a valuable resource for measuring, understanding, and enhancing LLMs' proactive preference following abilities, paving the way for personalized conversational agents.",
      "keywords": [
        "personalization",
        "benchmark",
        "Large language models",
        "conversational llm",
        "chatbots"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "ob7UrZOJve",
      "title": "Inheritune: Training Smaller Yet More Attentive Language Models",
      "abstract": "Large Language Models (LLMs) have achieved remarkable performance across various natural language processing tasks, primarily due to the transformer architecture and its self-attention mechanism. However, we observe that in standard decoder-style LLMs attention matrices degenerate to single-column for deeper layers. Layers in this state unable to learn anything meaningful and mostly redundant; we refer to these as lazy layers. The goal of this paper is to train smaller models by eliminating this structural inefficiency without compromising performance.\n\nMotivated by this observation, we propose Inheritune, a simple yet effective training recipe for developing smaller, high-performing language models. Smaller models trained with Inheritune inherits early transformer layers from a larger pre-trained model, then retrains and progressively expands the smaller model until it matches or exceeds the performance of the larger model. We demonstrate that Inheritune enables the training of various sizes of GPT-2 models on datasets like OpenWebText-9B and FineWeb\\_Edu. Models trained with Inheritune, despite having significantly fewer layers, match or even surpass the performance of their larger counterparts. For instance, our 16-layer GPT-2 medium variant achieves comparable performance to the standard 24-layer GPT-2 medium model.",
      "keywords": [
        "Large Language Models",
        "Small Language Models",
        "Attention degeneration",
        "Efficient training",
        "Model Initialization"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "0Xt7uT04cQ",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "GsCMKwyfWm",
      "title": "LVLM-COUNT: Enhancing the Counting Ability of Large Vision-Language Models",
      "abstract": "Counting is a fundamental skill for various visual tasks in real-life applications, requiring both object recognition and robust counting capabilities. Despite their advanced visual perception, large vision-language models (LVLMs) struggle with counting tasks, especially when the number of objects exceeds those commonly encountered during training. We enhance LVLMs’ counting abilities using a divide-and conquer approach, breaking counting problems into sub-counting tasks. Unlike prior methods, which do not generalize well to counting datasets on which they have not been trained, our method performs well on new datasets without any additional training or fine-tuning. We demonstrate that our approach enhances counting capabilities across various datasets and benchmarks.",
      "keywords": [
        "Counting",
        "Large vision-language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "SI2hI0frk6",
      "title": "Transfusion: Predict the Next Token and Diffuse Images with One Multi-Modal Model",
      "abstract": "We introduce Transfusion, a recipe for training a multi-modal model over discrete and continuous data.\nTransfusion combines the language modeling loss function (next token prediction) with diffusion to train a single transformer over mixed-modality sequences.\nWe pretrain multiple Transfusion models up to 7B parameters from scratch on a mixture of text and image data, establishing scaling laws with respect to a variety of uni- and cross-modal benchmarks.\nOur experiments show that Transfusion scales significantly better than quantizing images and training a language model over discrete image tokens.\nBy introducing modality-specific encoding and decoding layers, we can further improve the performance of Transfusion models, and even compress each image to just 16 patches.\nWe further demonstrate that scaling our Transfusion recipe to 7B parameters and 2T multi-modal tokens produces a model that can generate images and text on a par with similar scale diffusion models and language models, reaping the benefits of both worlds.",
      "keywords": [
        "multimodal foundation model",
        "multimodal generation and understanding",
        "diffusion",
        "next token prediction"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "VipcVxaTnG",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "bFYST1MaGh",
      "title": "Communicating Activations Between Language Model Agents",
      "abstract": "Communication between multiple language model (LM) agents has been shown to scale up the reasoning ability of LMs. While natural language has been the dominant medium for inter-LM communication, it is not obvious this should be the standard: not only does natural language communication incur high inference costs that scale quickly with the number of both agents and messages, but also the decoding process abstracts away too much rich information that could be otherwise accessed from the internal activations. In this work, we propose a simple technique whereby LMs communicate via *activations*; concretely, we pause an LM $B$'s computation at an intermediate layer, combine its current activation with another LM $A$'s intermediate activation via some function $f$, then pass $f$'s output into the next layer of $B$ and continue the forward pass till decoding is complete. This approach scales up LMs on new tasks with *zero* additional parameters and data, and saves a *substantial amount of compute* over natural language communication. We test our method with various functional forms $f$ on two experimental setups—multi-player coordination games and reasoning benchmarks—and find that it achieves up to $27.0$% improvement over natural language communication across datasets with $<$$1/4$ the compute, illustrating the superiority and robustness of activations as an alternative \"language\" for communication between LMs.",
      "keywords": [
        "large language models",
        "multiagent communication",
        "embedding representation",
        "multiagent debate"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtSSFiqW6y",
      "title": "Judge Decoding: Faster Speculative Sampling Requires Going Beyond Model Alignment",
      "abstract": "The performance of large language models (LLMs) is closely linked to their underlying size, leading to ever-growing networks and hence slower inference. Speculative decoding has been proposed as a technique to accelerate autoregressive generation, leveraging a fast draft model to propose candidate tokens, which are then verified in parallel based on their likelihood under the target model. While this approach guarantees to reproduce the target output, it incurs a substantial penalty: many high-quality draft tokens are rejected, even when they represent objectively valid continuations. Indeed, we show that even powerful draft models such as GPT-4o, as well as human text cannot achieve high acceptance rates under the standard verification scheme. This severely limits the speedup potential of current speculative decoding methods, as an early rejection becomes overwhelmingly likely when solely relying on alignment of draft and target.\nWe thus ask the following question: Can we adapt verification to recognize correct, but non-aligned replies? To this end, we draw inspiration from the LLM-as-a-judge framework, which demonstrated that LLMs are able to rate answers in a versatile way. We carefully design a dataset coined TokenCourt to elicit the same capability in the target model by training a compact module on top of the embeddings to produce ``judgements\" of the current continuation. We showcase our strategy on the Llama-3.1 family, where our 8B/405B-Judge achieves a speedup of $9\\times$ over Llama-405B, while maintaining its quality on a large range of benchmarks. These benefits remain present even in optimized inference frameworks, where our method reaches up to $141$ tokens/s for 8B/70B-Judge and $129$ tokens/s for 8B/405B on $2$ and $8$ H100s respectively.",
      "keywords": [
        "LLM inference",
        "speculative decoding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "jVDPq9EdzT",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "SEvJfuCtPY",
      "title": "Phase-aware Training Schedule Simplifies Learning in Flow-Based Generative Models",
      "abstract": "We analyze the training of a two-layer autoencoder used to parameterize a flow-based generative model for sampling from a high-dimensional Gaussian mixture. Building on the work of Cui et al. (2024), we find that the phase where the high-level features are learnt during training disappears as the dimension goes to infinity without an appropriate time schedule. We introduce a time dilation that solves this problem. This enables us to characterize the learnt velocity field, finding a first phase where the high-level feature (asymmetry between modes) is learnt and a second phase where the low-level feature (distribution of each mode) is learnt. We find that the autoencoder representing the velocity field learns to simplify by estimating only the parameters relevant to the feature for each phase. Turning to real data, we propose a method that, for a given feature, finds intervals of time where training improves accuracy the most on that feature, and we provide an experiment on MNIST validating this approach.",
      "keywords": [
        "diffusion models",
        "phase transitions",
        "flow-based generative model",
        "high-dimensional gaussian mixtures",
        "denoising autoencoders",
        "training schedules"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xDrFWUmCne",
      "title": "Learning to Discretize Denoising Diffusion ODEs",
      "abstract": "Diffusion Probabilistic Models (DPMs) are generative models showing competitive performance in various domains, including image synthesis and 3D point cloud generation. Sampling from pre-trained DPMs involves multiple neural function evaluations (NFEs) to transform Gaussian noise samples into images, resulting in higher computational costs compared to single-step generative models such as GANs or VAEs. Therefore, reducing the number of NFEs while preserving generation quality is crucial. To address this, we propose LD3, a lightweight framework designed to learn the optimal time discretization for sampling. LD3 can be combined with various samplers and consistently improves generation quality without having to retrain resource-intensive neural networks. We demonstrate analytically and empirically that LD3 improves sampling efficiency with much less computational overhead. We evaluate our method with extensive experiments on 7 pre-trained models, covering unconditional and conditional sampling in both pixel-space and latent-space DPMs. We achieve FIDs of 2.38 (10 NFE), and 2.27 (10 NFE) on unconditional CIFAR10 and AFHQv2 in 5-10 minutes of training. LD3 offers an efficient approach to sampling from pre-trained diffusion models. Code is available at https://github.com/vinhsuhi/LD3.",
      "keywords": [
        "Diffusion models",
        "Efficient Sampling",
        "Ordinary Differentiable Equations"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "MsUhByb3CM",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "MsUhByb3CM",
      "title": "Extracting Symbolic Sequences from Visual Representations via Self-Supervised Learning",
      "abstract": "In this paper, we explore the potential of abstracting complex visual information into discrete, structured symbolic sequences using self-supervised learning (SSL). Inspired by how language abstracts and organizes information to enable better reasoning and generalization, we propose a novel approach for generating symbolic representations from visual data. To learn these sequences, we extend the DINO framework to handle both visual and symbolic information. Initial experiments suggest that the generated symbolic sequences capture a meaningful level of abstraction, though further refinement is required. An advantage of our method is its interpretability: the sequences are produced by a decoder transformer using cross-attention, allowing attention maps to be linked to specific symbols and offering insight into how these representations correspond to image regions. This approach lays the foundation for creating interpretable symbolic representations with potential applications in high-level scene understanding.",
      "keywords": [
        "Self-Supervised Learning",
        "Symbolic Representations",
        "Information Theory",
        "Knowledge Distillation",
        "Visual Abstraction",
        "Interpretability"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YUYJsHOf3c",
      "title": "ReGenesis: LLMs can Grow into Reasoning Generalists via Self-Improvement",
      "abstract": "Post-training Large Language Models (LLMs) with explicit reasoning trajectories can enhance their reasoning abilities. However, acquiring such high-quality trajectory data typically demands meticulous supervision from humans or superior models, which can be either expensive or license-constrained. In this paper, we explore how far an LLM can improve its reasoning by self-synthesizing reasoning paths as training data without any additional supervision. Existing self-synthesizing methods, such as STaR, suffer from poor generalization to out-of-domain (OOD) reasoning tasks. We hypothesize it is due to that their self-synthesized reasoning paths are too task-specific, lacking general task-agnostic reasoning guidance. To address this, we propose **Reasoning Generalist via Self-Improvement (ReGenesis)**, a method to *self-synthesize reasoning paths as post-training data by progressing from abstract to concrete*. More specifically, ReGenesis self-synthesizes reasoning paths by converting general reasoning guidelines into task-specific ones, generating reasoning structures, and subsequently transforming these structures into reasoning paths, without the need for human-designed task-specific examples used in existing methods. We show that ReGenesis achieves superior performance on all in-domain and OOD settings tested compared to existing methods. For six OOD tasks specifically, while previous methods exhibited an average performance decrease of approximately 4.6% after post training, ReGenesis delivers around 6.1% performance improvement. We also conduct an in-depth analysis of our framework and show ReGenesis is effective across various language models and design choices.",
      "keywords": [
        "LLM",
        "reasoning",
        "generalization",
        "self-improvement"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "YSA0QeYnDd",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2efNHgYRvM",
      "title": "On the Identification of Temporal Causal Representation with Instantaneous Dependence",
      "abstract": "Temporally causal representation learning aims to identify the latent causal process from time series observations, but most methods require the assumption that the latent causal processes do not have instantaneous relations. Although some recent methods achieve identifiability in the instantaneous causality case, they require either interventions on the latent variables or grouping of the observations, which are in general difficult to obtain in real-world scenarios. To fill this gap, we propose an \\textbf{ID}entification framework for instantane\\textbf{O}us \\textbf{L}atent dynamics (\\textbf{IDOL}) by imposing a sparse influence constraint that the latent causal processes have sparse time-delayed and instantaneous relations. Specifically, we establish identifiability results of the latent causal process based on sufficient variability and the sparse influence constraint by employing contextual information of time series data. Based on these theories, we incorporate a temporally variational inference architecture to estimate the latent variables and a gradient-based sparsity regularization to identify the latent causal process. Experimental results on simulation datasets illustrate that our method can identify the latent causal process. Furthermore, evaluations on multiple human motion forecasting benchmarks with instantaneous dependencies indicate the effectiveness of our method in real-world settings.",
      "keywords": [
        "Causal Representation Learning",
        "Instantaneous Dependency",
        "Identification"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YSA0QeYnDd",
      "title": "Inference of Evolving Mental States from Irregular Action Events to Understand Human Behaviors",
      "abstract": "Inference of latent human mental processes, such as belief, intention, or desire, is crucial for developing AI with human-like intelligence, enabling more effective and timely collaboration. In this paper, we introduce a versatile encoder-decoder model designed to infer  evolving mental processes based on irregularly observed action events and predict future occurrences. The primary challenges arise from two factors: both actions and mental processes are irregular events, and the observed action data is often limited. To address the irregularity of these events, we leverage a temporal point process model within the encoder-decoder framework, effectively capturing the dynamics of both action and mental events. Additionally, we implement a backtracking mechanism in the decoder to enhance the accuracy of predicting future actions and evolving mental states. To tackle the issue of limited data, our model incorporates logic rules as priors, enabling accurate inferences from just a few observed samples. These logic rules can be refined and updated as needed, providing flexibility to the model. Overall, our approach enhances the understanding of human behavior by predicting when actions will occur and how mental processes evolve. Experiments on both synthetic and real-world datasets demonstrate the strong performance of our model in inferring mental states and predicting future actions, contributing to the development of more human-centric AI systems.",
      "keywords": [
        "temporal point process",
        "logic rule",
        "human-AI collaboration"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "fWXYD0ZCdd",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "UvTo3tVBk2",
      "title": "Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues",
      "abstract": "Linear Recurrent Neural Networks (LRNNs) such as Mamba, RWKV, GLA, mLSTM, and DeltaNet have emerged as efficient alternatives to Transformers for long sequences. However, both Transformers and LRNNs struggle to perform state-tracking, which may impair performance in tasks such as code evaluation. In one forward pass, current architectures are unable to solve even parity, the simplest state-tracking task, which non-linear RNNs can handle effectively. Recently, Sarrof et al. (2024) demonstrated that the failure of LRNNs like Mamba to solve parity stems from restricting the value range of their diagonal state-transition matrices to $[0, 1]$ and that incorporating negative values can resolve this issue. We extend this result to non-diagonal LRNNs such as DeltaNet. We prove that finite precision LRNNs with state-transition matrices having only positive eigenvalues cannot solve parity, while non-triangular matrices are needed to count modulo $3$. Notably, we also prove that LRNNs can learn any regular language when their state-transition matrices are products of identity minus vector outer product matrices, each with eigenvalues in the range $[-1, 1]$. Our experiments confirm that extending the eigenvalue range of Mamba and DeltaNet to include negative values not only enables them to solve parity but consistently improves their performance on state-tracking tasks. We also show that state-tracking enabled LRNNs can be pretrained  stably and efficiently at scale (1.3B parameters), achieving competitive performance on language modeling and showing promise on code and math tasks.",
      "keywords": [
        "State Tracking",
        "State Space",
        "Mamba",
        "Linear RNN",
        "Linear Attention",
        "GLA",
        "DeltaNet",
        "Formal Languages",
        "Products of Householders"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "fWXYD0ZCdd",
      "title": "A New Look at Low-Rank Recurrent Neural Networks",
      "abstract": "Low-rank recurrent neural networks (RNNs) have recently gained prominence as a framework for understanding how neural systems solve complex cognitive tasks. However, fitting and interpreting these networks remains an important open problem.\nHere we address this challenge using a perspective from the ``neural engineering framework'', which shows how to embed an arbitrary ordinary differential equation (ODE) into a low-rank RNN using least-squares regression. Under this perspective, individual neurons in a low-rank RNN provide nonlinear basis functions for representing an ODE of interest. This clarifies limits on the expressivity of low-rank RNNs, such as the fact that with a $\\tanh$ non-linearity they can only capture odd-symmetric functions in the absence of per neuron inputs or biases. Building on this framework, we propose a method for finding the smallest low-rank RNN to implement a given dynamical system using a variant of orthogonal matching pursuit. We also show how to use regression-based fitting to obtain low-rank RNNs with time-varying dynamics. This allows for the rapid training of vastly different dynamical systems that nevertheless produce a given time-varying trajectory. Finally, we highlight the usefulness of our framework by comparing to RNNs trained using backprop-through-time on neuroscience-inspired tasks, showing that our method achieves faster and more accurate learning with smaller networks than gradient-based training.",
      "keywords": [
        "low-rank rnn",
        "computational neuroscience",
        "dynamical systems",
        "neural dynamics"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "w7BGq6ozOL",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "syThiTmWWm",
      "title": "Cheating Automatic LLM Benchmarks: Null Models Achieve High Win Rates",
      "abstract": "Automatic LLM benchmarks, such as AlpacaEval 2.0, Arena-Hard-Auto, and MT-Bench, have become popular for evaluating language models due to their cost-effectiveness and scalability compared to human evaluation. Achieving high win rates on these benchmarks can significantly boost the promotional impact of newly released language models. This promotional benefit may motivate tricks, such as manipulating model output length or style to game win rates, even though several mechanisms have been developed to control length and disentangle style to reduce gameability. Nonetheless, we show that even a **\"null model\"** that always outputs a **constant** response (*irrelevant to input instructions*) can cheat automatic benchmarks and achieve top-ranked win rates: an $86.5\\\\%$ LC win rate on AlpacaEval 2.0; an $83.0$ score on Arena-Hard-Auto; and a $9.55$ score on MT-Bench. Moreover, the crafted cheating outputs are **transferable** because we assume that the instructions of these benchmarks (e.g., $805$ samples of AlpacaEval 2.0) are *private* and cannot be accessed. While our experiments are primarily proof-of-concept, an adversary could use LLMs to generate more imperceptible cheating responses, unethically benefiting from high win rates and promotional impact. Our findings call for the development of anti-cheating mechanisms for reliable automatic benchmarks. The code is available at https://github.com/sail-sg/Cheating-LLM-Benchmarks.",
      "keywords": [
        "Large Language Models",
        "Cheating",
        "Automatic LLM Benchmarks"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "w7BGq6ozOL",
      "title": "Advancing Algorithmic Trading with Large Language Models: A Reinforcement Learning Approach for Stock Market Optimization",
      "abstract": "In the fast-evolving landscape of financial markets, effective decision-making tools are essential for managing complexities driven by economic indicators and market dynamics. Algorithmic trading strategies have gained prominence for their ability to execute trades autonomously, with Deep Reinforcement Learning (DRL) emerging as a key approach for optimizing trading actions through continuous market interaction. However, RL-based systems face significant challenges, particularly in adapting to evolving time series data and incorporating unstructured textual information. In response to these limitations, recent advancements in Large Language Models (LLMs) offer new opportunities. LLMs possess the capacity to analyze vast volumes of data, providing enhanced insights that can complement traditional market analysis. This study proposes a novel approach that integrates six distinct LLMs into algorithmic trading frameworks, developing Stock-Evol-Instruct, an innovative instruction generation algorithm. This algorithm enables RL agents to fine-tune their trading strategies by leveraging LLM-driven insights for daily stock trading decisions. Empirical evaluation using real-world stock data from Silver and JPMorgan demonstrates the significant potential of this approach to outperform conventional trading models. By bridging the gap between LLMs and RL in algorithmic trading, this study contributes to a new frontier in financial technology, setting the stage for future advancements in autonomous trading systems.",
      "keywords": [
        "Algorithmic trading",
        "Stock market",
        "Large language models",
        "Deep reinforcement learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "xPO6fwvldG",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8enWnd6Gp3",
      "title": "TetSphere Splatting: Representing High-Quality Geometry with Lagrangian Volumetric Meshes",
      "abstract": "We introduce TetSphere Splatting, a Lagrangian geometry representation designed for high-quality 3D shape modeling. TetSphere splatting leverages an underused yet powerful geometric primitive -- volumetric tetrahedral meshes. It represents 3D shapes by deforming a collection of tetrahedral spheres, with geometric regularizations and constraints that effectively resolve common mesh issues such as irregular triangles, non-manifoldness, and floating artifacts. Experimental results on multi-view and single-view reconstruction highlight TetSphere splatting's superior mesh quality while maintaining competitive reconstruction accuracy compared to state-of-the-art methods. Additionally, TetSphere splatting demonstrates versatility by seamlessly integrating into generative modeling tasks, such as image-to-3D and text-to-3D generation.",
      "keywords": [
        "geometry representation",
        "3D modeling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "BoRmf8wDZ7",
      "title": "Gaussian Masked Autoencoders",
      "abstract": "This paper explores Masked Autoencoders (MAE) with Gaussian Splatting. While mainstream self-supervised learning frameworks such as MAE operate on low-level pixels, the image synthesis community has evolved to use latent, mid-level representations for better generative visual data modeling. Our approach, named GMAE, aims to reconcile these two and get the benefits of both worlds. Like MAE, it reconstructs the image end-to-end in the pixel space; however, it also introduces an intermediate, 3D Gaussian-based representation and renders images via splatting. We show that GMAE can enable various zero-shot learning capabilities (e.g figure-ground segmentation, image layering, edge detection, etc) while preserving the high self-supervised representation quality from MAE. Notably, we are the first to employ Gaussian primitives in an image representation learning framework beyond optimization-based single-scene reconstructions. We believe GMAE will inspire further research in this direction and contribute to developing next-generation techniques for modeling high-fidelity visual data.",
      "keywords": [
        "Representation learning",
        "Gaussian Splatting"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "dGSOn7sdWg",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "OW332Wh9S5",
      "title": "DC-Spin: A Speaker-invariant Speech Tokenizer For Spoken Language Models",
      "abstract": "Spoken language models (SLMs) have gained increasing attention with advancements in text-based, decoder-only language models. SLMs process text and speech, enabling simultaneous speech understanding and generation. This paper presents Double-Codebook Speaker-invariant Clustering (DC-Spin), which aims to improve speech tokenization by bridging audio signals and SLM tokens. DC-Spin extracts speaker-invariant tokens rich in phonetic information and resilient to input variations, enhancing zero-shot SLM tasks and speech resynthesis. We propose a chunk-wise approach to enable streamable DC-Spin without retraining and degradation. Comparisons of tokenization methods (self-supervised and neural audio codecs), model scalability, and downstream task proxies show that tokens easily modeled by an n-gram LM or aligned with phonemes offer strong performance, providing insights for designing speech tokenizers for SLMs.",
      "keywords": [
        "speech tokenizer",
        "self-supervised learning",
        "spoken language model",
        "speech language model",
        "speech resynthesis",
        "audio codec"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtSSFiqW6y",
      "title": "Judge Decoding: Faster Speculative Sampling Requires Going Beyond Model Alignment",
      "abstract": "The performance of large language models (LLMs) is closely linked to their underlying size, leading to ever-growing networks and hence slower inference. Speculative decoding has been proposed as a technique to accelerate autoregressive generation, leveraging a fast draft model to propose candidate tokens, which are then verified in parallel based on their likelihood under the target model. While this approach guarantees to reproduce the target output, it incurs a substantial penalty: many high-quality draft tokens are rejected, even when they represent objectively valid continuations. Indeed, we show that even powerful draft models such as GPT-4o, as well as human text cannot achieve high acceptance rates under the standard verification scheme. This severely limits the speedup potential of current speculative decoding methods, as an early rejection becomes overwhelmingly likely when solely relying on alignment of draft and target.\nWe thus ask the following question: Can we adapt verification to recognize correct, but non-aligned replies? To this end, we draw inspiration from the LLM-as-a-judge framework, which demonstrated that LLMs are able to rate answers in a versatile way. We carefully design a dataset coined TokenCourt to elicit the same capability in the target model by training a compact module on top of the embeddings to produce ``judgements\" of the current continuation. We showcase our strategy on the Llama-3.1 family, where our 8B/405B-Judge achieves a speedup of $9\\times$ over Llama-405B, while maintaining its quality on a large range of benchmarks. These benefits remain present even in optimized inference frameworks, where our method reaches up to $141$ tokens/s for 8B/70B-Judge and $129$ tokens/s for 8B/405B on $2$ and $8$ H100s respectively.",
      "keywords": [
        "LLM inference",
        "speculative decoding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "CbpWPbYHuv",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FVuqJt3c4L",
      "title": "Population Transformer: Learning Population-level Representations of Neural Activity",
      "abstract": "We present a self-supervised framework that learns population-level codes for arbitrary ensembles of neural recordings at scale. We address key challenges in scaling models with neural time-series data, namely, sparse and variable electrode distribution across subjects and datasets. The Population Transformer (PopT) stacks on top of pretrained temporal embeddings and enhances downstream decoding by enabling learned aggregation of multiple spatially-sparse data channels. The pretrained PopT lowers the amount of data required for downstream decoding experiments, while increasing accuracy, even on held-out subjects and tasks. Compared to end-to-end methods, this approach is computationally lightweight, while achieving similar or better decoding performance. We further show how our framework is generalizable to multiple time-series embeddings and neural data modalities. Beyond decoding, we interpret the pretrained and fine-tuned PopT models to show how they can be used to extract neuroscience insights from large amounts of data. We release our code as well as a pretrained PopT to enable off-the-shelf improvements in multi-channel intracranial data decoding and interpretability. Code is available at https://github.com/czlwang/PopulationTransformer.",
      "keywords": [
        "representation learning",
        "neuroscience",
        "self supervised learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "cnecLUNs6w",
      "title": "Adversarial Robustness of In-Context Learning in Transformers for Linear Regression",
      "abstract": "Transformers have demonstrated remarkable in-context learning capabilities across various domains, including statistical learning tasks. While previous work has shown that transformers can implement common learning algorithms, the adversarial robustness of these learned algorithms remains unexplored. This work investigates the vulnerability of in-context learning in transformers to _hijacking attacks_ focusing on the setting of linear regression tasks. Hijacking attacks are prompt-manipulation attacks in which the adversary's goal is to manipulate the prompt to force the transformer to generate a specific output. We first prove that single-layer linear transformers, known to implement gradient descent in-context, are non-robust and can be manipulated to output arbitrary predictions by perturbing\na single example in the in-context training set. While our experiments show these attacks succeed on linear transformers, we find they do not transfer to more complex transformers with GPT-2 architectures. Nonetheless, we show that these transformers can be hijacked using gradient-based adversarial attacks. We then demonstrate that adversarial training enhances transformers' robustness against hijacking attacks, even when just applied during finetuning.  Additionally, we find that in some settings, adversarial training against a weaker attack model can lead to robustness to a stronger attack model.  Lastly, we investigate the transferability of hijacking attacks across transformers of varying scales and initialization seeds, as well as between transformers and ordinary least squares (OLS). We find that while attacks transfer effectively between small-scale transformers, they show poor transferability in other scenarios (small-to-large scale, large-to-large scale, and between transformers and OLS).",
      "keywords": [
        "in-context learning",
        "transformers",
        "hijacking attacks",
        "linear regression",
        "linear transformers",
        "transfer of adversarial attacks"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "pTeOOKnjGM",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "8enWnd6Gp3",
      "title": "TetSphere Splatting: Representing High-Quality Geometry with Lagrangian Volumetric Meshes",
      "abstract": "We introduce TetSphere Splatting, a Lagrangian geometry representation designed for high-quality 3D shape modeling. TetSphere splatting leverages an underused yet powerful geometric primitive -- volumetric tetrahedral meshes. It represents 3D shapes by deforming a collection of tetrahedral spheres, with geometric regularizations and constraints that effectively resolve common mesh issues such as irregular triangles, non-manifoldness, and floating artifacts. Experimental results on multi-view and single-view reconstruction highlight TetSphere splatting's superior mesh quality while maintaining competitive reconstruction accuracy compared to state-of-the-art methods. Additionally, TetSphere splatting demonstrates versatility by seamlessly integrating into generative modeling tasks, such as image-to-3D and text-to-3D generation.",
      "keywords": [
        "geometry representation",
        "3D modeling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "VAvZ4oinpa",
      "title": "Video Generation with Learned Action Prior",
      "abstract": "Long-term stochastic video generation remains challenging, especially with moving cameras. This scenario introduces complex interactions between camera movement and observed pixels, resulting in intricate spatio-temporal dynamics and partial observability issues. Current approaches often focus on pixel-level image reconstruction, neglecting explicit modeling of camera motion dynamics. Our proposed solution incorporates camera motion or action as an extended part of the observed image state, employing a multi-modal learning framework to simultaneously model both image and action. We introduce three models: (i) Video Generation with Learning Action Prior (VG-LeAP) that treats the image-action pair as an augmented state generated from a single latent stochastic process and uses variational inference to learn the image-action latent prior; (ii) Causal-LeAP, which establishes a causal relationship between action and the observed image frame, and learns a seperate action prior, conditioned on the observed image states along with the image prior; and (iii) RAFI, which integrates the augmented image-action state concept with a conditional flow matching framework, demonstrating that this action-conditioned image generation concept can be extended to other transformer-based architectures. Through comprehensive empirical studies on robotic video dataset, RoAM, we highlight the importance of multi-modal training in addressing partially observable video generation problems.",
      "keywords": [
        "Stochastic Video Generation",
        "Variational Inference"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "sgbI8Pxwie",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "QWunLKbBGF",
      "title": "Do LLMs Recognize Your Preferences? Evaluating Personalized Preference Following in LLMs",
      "abstract": "Large Language Models (LLMs) are increasingly deployed as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in long-context conversational setting.\nPrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. PrefEval contains user personalization or preference information in both explicit and implicit preference forms, and evaluates LLM performance using a generation and a classification task. With PrefEval, we have evaluated 10 open-sourced and\nproprietary LLMs in multi-session conversations with varying context lengths up to 100k tokens. We benchmark with various prompting, iterative feedback, and retrieval-augmented generation methods. \nOur benchmarking effort reveals that state-of-the-art LLMs face significant challenges in following users' preference during conversations. In particular,  in zero-shot settings, preference following accuracy falls below 10\\% at merely 10 turns (~3k tokens) across most evaluated models. Even with advanced prompting and retrieval methods, preference following still deteriorates in long-context conversations. Furthermore, we show that fine-tuning on PrefEval significantly improves performance. We believe PrefEval serves as a valuable resource for measuring, understanding, and enhancing LLMs' proactive preference following abilities, paving the way for personalized conversational agents.",
      "keywords": [
        "personalization",
        "benchmark",
        "Large language models",
        "conversational llm",
        "chatbots"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eks3dGnocX",
      "title": "How Transformers Solve Propositional Logic Problems: A Mechanistic Analysis",
      "abstract": "Large language models (LLMs) have shown amazing performance on tasks that require planning and reasoning. Motivated by this, we investigate the internal mechanisms that underpin a network's ability to perform complex logical reasoning. We first construct a synthetic propositional logic problem that serves as a concrete test-bed for network training and evaluation. Crucially, this problem demands nontrivial planning to solve. We perform our study on two fronts. First, we pursue an understanding of precisely how a three-layer transformer, trained from scratch and attains perfect test accuracy, solves this problem. We are able to identify certain \"planning\" and \"reasoning\" circuits in the network that necessitate cooperation between the attention blocks to implement the desired logic. Second, we study how a pretrained LLM, Mistral 7B, solves this problem. Using activation patching, we characterize internal components that are critical in solving our logic problem. Overall, our work systemically uncovers novel aspects of small and large transformers, and continues the study of how they plan and reason.",
      "keywords": [
        "Mechanistic Interpretability",
        "Language Models",
        "Transformers",
        "Logical Reasoning",
        "Learned Representations"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "PYmrUQmMEw",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "5iUUorHeM3",
      "title": "CIRCUIT: A Benchmark for Circuit Interpretation and Reasoning Capabilities of LLMs",
      "abstract": "The role of Large Language Models (LLMs) has not been extensively explored in analog circuit design, which could benefit from a reasoning-based approach that transcends traditional optimization techniques. In particular, despite their growing relevance, there are no benchmarks to assess LLMs’ reasoning capability about circuits. Therefore, we created the CIRCUIT dataset consisting of 510 question-answer pairs spanning various levels of analog-circuit-related subjects. The best-performing model on our dataset, GPT-4o, achieves 48.04\\% accuracy when evaluated on the final numerical answer. To evaluate the robustness of LLMs on our dataset, we introduced a unique feature that enables unit-test-like evaluation by grouping questions into unit tests. In this case, GPT-4o can only pass 27.45\\% of the unit tests, highlighting that the most advanced LLMs still struggle with understanding circuits, which requires multi-level reasoning, particularly when involving circuit topologies. This circuit-specific benchmark highlights LLMs' limitations, offering valuable insights for advancing their application in analog integrated circuit design.",
      "keywords": [
        "Large Language Models (LLMs)",
        "benchmarking",
        "analog circuits",
        "dataset creation",
        "evaluation metrics"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtSSFiqW6y",
      "title": "Judge Decoding: Faster Speculative Sampling Requires Going Beyond Model Alignment",
      "abstract": "The performance of large language models (LLMs) is closely linked to their underlying size, leading to ever-growing networks and hence slower inference. Speculative decoding has been proposed as a technique to accelerate autoregressive generation, leveraging a fast draft model to propose candidate tokens, which are then verified in parallel based on their likelihood under the target model. While this approach guarantees to reproduce the target output, it incurs a substantial penalty: many high-quality draft tokens are rejected, even when they represent objectively valid continuations. Indeed, we show that even powerful draft models such as GPT-4o, as well as human text cannot achieve high acceptance rates under the standard verification scheme. This severely limits the speedup potential of current speculative decoding methods, as an early rejection becomes overwhelmingly likely when solely relying on alignment of draft and target.\nWe thus ask the following question: Can we adapt verification to recognize correct, but non-aligned replies? To this end, we draw inspiration from the LLM-as-a-judge framework, which demonstrated that LLMs are able to rate answers in a versatile way. We carefully design a dataset coined TokenCourt to elicit the same capability in the target model by training a compact module on top of the embeddings to produce ``judgements\" of the current continuation. We showcase our strategy on the Llama-3.1 family, where our 8B/405B-Judge achieves a speedup of $9\\times$ over Llama-405B, while maintaining its quality on a large range of benchmarks. These benefits remain present even in optimized inference frameworks, where our method reaches up to $141$ tokens/s for 8B/70B-Judge and $129$ tokens/s for 8B/405B on $2$ and $8$ H100s respectively.",
      "keywords": [
        "LLM inference",
        "speculative decoding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "8xxEBAtD7y",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "Ozo7qJ5vZi",
      "title": "KAN: Kolmogorov–Arnold Networks",
      "abstract": "Inspired by the Kolmogorov-Arnold representation theorem, we propose Kolmogorov-Arnold Networks (KANs) as promising alternatives to Multi-Layer Perceptrons (MLPs). While MLPs have fixed activation functions on nodes (\"neurons''), KANs have learnable activation functions on edges (\"weights''). KANs have no linear weights at all -- every weight parameter is replaced by a univariate function parametrized as a spline. We show that this seemingly simple change makes KANs outperform MLPs in terms of accuracy and interpretability, on small-scale AI + Science tasks. For accuracy, smaller KANs can achieve comparable or better accuracy than larger MLPs in function fitting tasks. Theoretically and empirically, KANs possess faster neural scaling laws than MLPs. For interpretability, KANs can be intuitively visualized and can easily interact with human users. Through two examples in mathematics and physics, KANs are shown to be useful ``collaborators'' helping scientists (re)discover mathematical and physical laws. In summary, KANs are promising alternatives for MLPs. Despite the slow training of KANs, their improved accuracy and interpretability show the potential to improve today's deep learning models which rely heavily on MLPs. More research is necessary to make KANs' training more efficient.",
      "keywords": [
        "Kolmogorov-Arnold networks",
        "Kolmogorov-Arnold representation theorem",
        "learnable activation functions",
        "interpretability",
        "AI + Science"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "sp9irsV1yq",
      "title": "Identifying Sub-networks in Neural Networks via Functionally Similar Representations",
      "abstract": "Mechanistic interpretability aims to provide human-understandable insights into the inner workings of neural network models by examining their internals. Existing approaches typically require significant manual effort and prior knowledge, with strategies tailored to specific tasks. In this work, we take a step toward automating the understanding of the network by investigating the existence of distinct  sub-networks. Specifically, we explore a novel automated and task-agnostic approach based on the notion of functionally similar representations within neural networks, reducing the need for human intervention. \nOur method identifies similar and dissimilar layers in the network, revealing potential sub-components. We achieve this by proposing, for the first time to our knowledge, the use of Gromov-Wasserstein distance, which overcomes challenges posed by varying distributions and dimensionalities across intermediate representations—issues that complicate direct layer-to-layer comparisons.\nThrough experiments on algebraic, language, and vision tasks, we observe the emergence of sub-groups within neural network layers corresponding to functional abstractions. Additionally, we find that different training strategies influence the positioning of these sub-groups. Our approach offers meaningful insights into the behavior of neural networks with minimal human and computational cost.",
      "keywords": [
        "mechanistic interpretability",
        "subnetworks"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "RD9q5vEe1Q",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "WFlLqUmb9v",
      "title": "Efficient Time Series Forecasting via Hyper-Complex Models and Frequency Aggregation",
      "abstract": "Time-series forecasting is a long-standing challenge in statistics and machine learning, with one of the key difficulties being the ability to process sequences with long-range dependencies. A recent line of work has addressed this by applying the short-time Fourier transform (STFT), which partitions sequences into multiple subsequences and applies a Fourier transform to each separately.\nWe propose the Frequency Information Aggregation (FIA-Net), a model that can utilize two backbone architectures: the Window-Mixing MLP (WM-MLP), which aggregates adjacent window information in the frequency domain, and the Hyper-Complex MLP (HC-MLP), which treats the set of STFT windows as hyper-complex (HC) valued vectors. and employ HC algebra to efficiently combine information from all STFT windows altogether. Furthermore, due to the nature of HC operations, the HC-MLP uses up to three times fewer parameters than the equivalent standard window aggre- gation method. We evaluate the FIA-Net on various time-series benchmarks and show that the proposed methodologies outperform existing state-of-the-art meth- ods in terms of both accuracy and efficiency. Our code is publicly available on https://anonymous.4open.science/r/research-1803/",
      "keywords": [
        "time-series forecasting",
        "frequency models",
        "hyper-complex machine learning",
        "short-time Fourier transform"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "k38Th3x4d9",
      "title": "Root Cause Analysis of Anomalies in Multivariate Time Series through Granger Causal Discovery",
      "abstract": "Identifying the root causes of anomalies in multivariate time series is challenging due to the complex dependencies among the series. In this paper, we propose a comprehensive approach called AERCA that inherently integrates Granger causal discovery with root cause analysis. By defining anomalies as interventions on the exogenous variables of time series, AERCA not only learns the Granger causality among time series but also explicitly models the distributions of exogenous variables under normal conditions. AERCA then identifies the root causes of anomalies by highlighting exogenous variables that significantly deviate from their normal states. Experiments on multiple synthetic and real-world datasets demonstrate that AERCA can accurately capture the causal relationships among time series and effectively identify the root causes of anomalies.",
      "keywords": [
        "root cause analysis",
        "Granger causality",
        "multivariate time series"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "wGa2plE8ka",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "GsCMKwyfWm",
      "title": "LVLM-COUNT: Enhancing the Counting Ability of Large Vision-Language Models",
      "abstract": "Counting is a fundamental skill for various visual tasks in real-life applications, requiring both object recognition and robust counting capabilities. Despite their advanced visual perception, large vision-language models (LVLMs) struggle with counting tasks, especially when the number of objects exceeds those commonly encountered during training. We enhance LVLMs’ counting abilities using a divide-and conquer approach, breaking counting problems into sub-counting tasks. Unlike prior methods, which do not generalize well to counting datasets on which they have not been trained, our method performs well on new datasets without any additional training or fine-tuning. We demonstrate that our approach enhances counting capabilities across various datasets and benchmarks.",
      "keywords": [
        "Counting",
        "Large vision-language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Ha6RTeWMd0",
      "title": "SAM 2: Segment Anything in Images and Videos",
      "abstract": "We present Segment Anything Model 2 (SAM 2), a foundation model towards solving promptable visual segmentation in images and videos. We build a data engine, which improves model and data via user interaction, to collect the largest video segmentation dataset to date. Our model is a simple transformer architecture with streaming memory for real-time video processing. SAM 2 trained on our data provides strong performance across a wide range of tasks. In video segmentation, we observe better accuracy, using 3x fewer interactions than prior approaches. In image segmentation, our model is more accurate and 6x faster than the Segment Anything Model (SAM). We believe that our data, model, and insights will serve as a significant milestone for video segmentation and related perception tasks. We are releasing our main model, the dataset, an interactive demo and code.",
      "keywords": [
        "computer vision",
        "video segmentation",
        "image segmentation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "cLtE4qoPlD",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "25kAzqzTrz",
      "title": "Towards Understanding Why FixMatch Generalizes Better Than Supervised Learning",
      "abstract": "Semi-supervised learning (SSL), exemplified by FixMatch (Sohn et al., 2020), has shown significant generalization advantages over supervised learning (SL), particularly in the context of deep neural networks (DNNs). However, it is still unclear, from a theoretical standpoint, why FixMatch-like SSL algorithms generalize  better than SL on DNNs. In this work, we present the first theoretical justification for the enhanced test accuracy observed in  FixMatch-like SSL applied to DNNs by taking  convolutional neural networks (CNNs) on classification tasks as an example. Our theoretical analysis reveals that the semantic feature learning processes in FixMatch and SL are rather different. In particular, FixMatch learns all the discriminative features of each semantic class, while SL only randomly captures a subset of features due to the well-known lottery ticket hypothesis. Furthermore, we show that our analysis framework can be applied to other FixMatch-like SSL methods, e.g., FlexMatch, FreeMatch, Dash, and SoftMatch. Inspired by our theoretical analysis, we develop an improved variant of FixMatch, termed Semantic-Aware FixMatch (SA-FixMatch). Experimental results corroborate our theoretical findings and the enhanced generalization capability of SA-FixMatch.",
      "keywords": [
        "deep semi-supervised learning",
        "generalization error",
        "feature learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "PJjHILiQHC",
      "title": "Approaching Deep Learning through the Spectral Dynamics of Weights",
      "abstract": "We propose an empirical approach centered on the spectral dynamics of weights---the behavior of singular values and vectors during optimization---to unify and clarify several phenomena in deep learning. We identify a consistent bias in optimization across various experiments, from small-scale ``grokking'' to large-scale tasks like image classification with ConvNets, image generation with UNets, speech recognition with LSTMs, and language modeling with Transformers. We also demonstrate that weight decay enhances this bias beyond its role as a norm regularizer, even in practical systems. Moreover, we show that these spectral dynamics distinguish memorizing networks from generalizing ones, offering a novel perspective on this longstanding conundrum. Additionally, we leverage spectral dynamics to explore the emergence of well-performing sparse subnetworks (lottery tickets) and the structure of the loss surface through linear mode connectivity. Our findings suggest that spectral dynamics provide a coherent framework to better understand the behavior of neural networks across diverse settings.",
      "keywords": [
        "simplicity bias",
        "grokking",
        "lottery tickets",
        "linear mode connectivity"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "HfWcFs7XLR",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "CFKZKjrQ5r",
      "title": "FCoReBench: Can Large Language Models Solve Challenging First-Order Combinatorial Reasoning Problems?",
      "abstract": "Can the large language models (LLMs) solve challenging first-order combinatorial\nreasoning problems such as graph coloring, knapsack, and cryptarithmetic? By\nfirst-order, we mean these problems can be instantiated with potentially an infinite\nnumber of problem instances of varying sizes. They are also challenging being\nNP-hard and requiring several reasoning steps to reach a solution. While existing\nwork has focused on coming up with datasets with hard benchmarks, there is\nlimited work which exploits the first-order nature of the problem structure. To\naddress this challenge, we present FCoReBench, a dataset of 40 such challenging\nproblems, along with scripts to generate problem instances of varying sizes and\nautomatically verify and generate their solutions. We first observe that LLMs, even\nwhen aided by symbolic solvers, perform rather poorly on our dataset, being unable\nto leverage the underlying structure of these problems. We specifically observe\na drop in performance with increasing problem size. In response, we propose a\nnew approach, SymPro-LM, which combines LLMs with both symbolic solvers\nand program interpreters, along with feedback from a few solved examples, to\nachieve huge performance gains. Our proposed approach is robust to changes in the\nproblem size, and has the unique characteristic of not requiring any LLM call during\ninference time, unlike earlier approaches. As an additional experiment, we also\ndemonstrate SymPro-LM’s effectiveness on other logical reasoning benchmarks.",
      "keywords": [
        "llms",
        "logical-reasoning",
        "first-order-reasoning",
        "neuro-symbolic"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtSSFiqW6y",
      "title": "Judge Decoding: Faster Speculative Sampling Requires Going Beyond Model Alignment",
      "abstract": "The performance of large language models (LLMs) is closely linked to their underlying size, leading to ever-growing networks and hence slower inference. Speculative decoding has been proposed as a technique to accelerate autoregressive generation, leveraging a fast draft model to propose candidate tokens, which are then verified in parallel based on their likelihood under the target model. While this approach guarantees to reproduce the target output, it incurs a substantial penalty: many high-quality draft tokens are rejected, even when they represent objectively valid continuations. Indeed, we show that even powerful draft models such as GPT-4o, as well as human text cannot achieve high acceptance rates under the standard verification scheme. This severely limits the speedup potential of current speculative decoding methods, as an early rejection becomes overwhelmingly likely when solely relying on alignment of draft and target.\nWe thus ask the following question: Can we adapt verification to recognize correct, but non-aligned replies? To this end, we draw inspiration from the LLM-as-a-judge framework, which demonstrated that LLMs are able to rate answers in a versatile way. We carefully design a dataset coined TokenCourt to elicit the same capability in the target model by training a compact module on top of the embeddings to produce ``judgements\" of the current continuation. We showcase our strategy on the Llama-3.1 family, where our 8B/405B-Judge achieves a speedup of $9\\times$ over Llama-405B, while maintaining its quality on a large range of benchmarks. These benefits remain present even in optimized inference frameworks, where our method reaches up to $141$ tokens/s for 8B/70B-Judge and $129$ tokens/s for 8B/405B on $2$ and $8$ H100s respectively.",
      "keywords": [
        "LLM inference",
        "speculative decoding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "cR5GTis5II",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "kBybSUskz7",
      "title": "Reinforcement Learning and Heuristics for Hardware-Efficient Constrained Code Design",
      "abstract": "Constrained codes enhance reliability in high-speed communication systems and optimize bit efficiency when working with non-binary data representations (e.g., three-level ternary symbols).  A key challenge in their design is minimizing the hardware complexity of the translation logic that encodes and decodes data. We introduce a reinforcement learning (RL)-based framework, augmented by a custom L1 similarity-based heuristic, to design hardware-efficient translation logic, navigating the vast solution space of codeword assignments. By modeling the task as a bipartite graph matching problem and using logic synthesis tools to evaluate hardware complexity, our RL approach outperforms human-derived solutions and generalizes to various code types. Finally, we analyze the learned policies to extract insights into high-performing strategies.",
      "keywords": [
        "reinforcement learning",
        "bipartite matching",
        "GNN",
        "combinatorial optimization",
        "feature engineering",
        "hardware design optimization",
        "logic synthesis"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "stUKwWBuBm",
      "title": "Tractable Multi-Agent Reinforcement Learning through Behavioral Economics",
      "abstract": "A significant roadblock to the development of principled multi-agent reinforcement learning (MARL) algorithms is the fact that desired solution concepts like Nash equilibria may be intractable to compute. We show how one can overcome this obstacle by introducing concepts from behavioral economics into MARL. To do so, we imbue agents with two key features of human decision-making: risk aversion and bounded rationality. We show that introducing these two properties into games gives rise to a class of equilibria---risk-averse quantal response equilibria (RQE)---which are tractable to compute in \\emph{all} $n$-player matrix and finite-horizon Markov games.  In particular, we show that they emerge as the endpoint of no-regret learning in suitably adjusted versions of the games. Crucially, the class of computationally tractable RQE is independent of the underlying game structure and only depends on agents' degrees of risk-aversion and bounded rationality.  To validate the expressivity of this class of solution concepts we show that it captures peoples' patterns of play in a number of 2-player matrix games previously studied in experimental economics. Furthermore, we give a first analysis of the sample complexity of computing these equilibria in finite-horizon Markov games when one has access to a generative model. We validate our findings on a simple multi-agent reinforcement learning benchmark. Our results open the doors for to the principled development of new decentralized multi-agent reinforcement learning algorithms.",
      "keywords": [
        "behavioral economics",
        "risk-aversion",
        "multi-agent reinforcement learning",
        "quantal response",
        "bounded rationality"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "00SnKBGTsz",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "5EuAMDMPRK",
      "title": "POROver: Improving Safety and Reducing Overrefusal in Large Language Models with Overgeneration and Preference Optimization",
      "abstract": "Balancing safety and usefulness in large language models has become a critical challenge in recent years. \nModels often exhibit unsafe behavior or adopt an overly cautious approach, leading to frequent overrefusal of benign prompts, which reduces their usefulness. \nAddressing these issues requires methods that maintain safety while avoiding overrefusal. \nIn this work, we examine how the overgeneration of training data using advanced teacher models (e.g., GPT-4o), including responses to both general-purpose and toxic prompts, influences the safety and usefulness in instruction-following language models.\nAdditionally, we present POROver, a strategy to use preference optimization methods in order to reduce overrefusal, via employing a superior teacher model's completions.\nOur results show that overgenerating completions for general-purpose prompts significantly enhances the model's safety and usefulness balance.\nSpecifically, the F1 score calculated between safety and usefulness increases from 74.4\\% to 91.8\\% due to a substantial increase in safety. \nMoreover, overgeneration for toxic prompts substantially increases the usefulness from 11.1\\% to 57.6\\% while maintaining safety.\nFurthermore, preference optimization algorithms, when applied with carefully curated preference data, can effectively increase a model's usefulness from 57.6\\% to 82.1\\% while maintaining comparable safety levels.",
      "keywords": [
        "LLM safety",
        "LLM usefulness",
        "Overrefusal in LLMs",
        "responsible AI"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YUYJsHOf3c",
      "title": "ReGenesis: LLMs can Grow into Reasoning Generalists via Self-Improvement",
      "abstract": "Post-training Large Language Models (LLMs) with explicit reasoning trajectories can enhance their reasoning abilities. However, acquiring such high-quality trajectory data typically demands meticulous supervision from humans or superior models, which can be either expensive or license-constrained. In this paper, we explore how far an LLM can improve its reasoning by self-synthesizing reasoning paths as training data without any additional supervision. Existing self-synthesizing methods, such as STaR, suffer from poor generalization to out-of-domain (OOD) reasoning tasks. We hypothesize it is due to that their self-synthesized reasoning paths are too task-specific, lacking general task-agnostic reasoning guidance. To address this, we propose **Reasoning Generalist via Self-Improvement (ReGenesis)**, a method to *self-synthesize reasoning paths as post-training data by progressing from abstract to concrete*. More specifically, ReGenesis self-synthesizes reasoning paths by converting general reasoning guidelines into task-specific ones, generating reasoning structures, and subsequently transforming these structures into reasoning paths, without the need for human-designed task-specific examples used in existing methods. We show that ReGenesis achieves superior performance on all in-domain and OOD settings tested compared to existing methods. For six OOD tasks specifically, while previous methods exhibited an average performance decrease of approximately 4.6% after post training, ReGenesis delivers around 6.1% performance improvement. We also conduct an in-depth analysis of our framework and show ReGenesis is effective across various language models and design choices.",
      "keywords": [
        "LLM",
        "reasoning",
        "generalization",
        "self-improvement"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "aPHHhnZktB",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "f4gF6AIHRy",
      "title": "Combatting Dimensional Collapse in LLM Pre-Training Data via Submodular File Selection",
      "abstract": "Selecting high-quality pre-training data for large language models (LLMs) is crucial for enhancing their overall performance under limited computation budget, improving both training and sample efficiency. Recent advancements in file selection primarily rely on using an existing or trained proxy model to assess the similarity of samples to a target domain, such as high quality sources BookCorpus and Wikipedia. However, upon revisiting these methods, the domain-similarity selection criteria demonstrates a diversity dilemma, i.e. dimensional collapse in the feature space, improving performance on the domain-related tasks but causing severe degradation on generic performance.To prevent collapse and enhance diversity, we propose a DiverSified File selection algorithm (DiSF), which selects the most decorrelated text files in the feature space. We approach this with a classical greedy algorithm to achieve more uniform eigenvalues in the feature covariance matrix of the selected texts, analyzing its approximation to the optimal solution under a formulation of $\\gamma$-weakly submodular optimization problem. Empirically, we establish a benchmark and conduct extensive experiments on the TinyLlama architecture with models from 120M to 1.1B parameters. Evaluating across nine tasks from the Harness framework, DiSF demonstrates a significant improvement on overall performance. Specifically, DiSF saves 98.5\\% of 590M training files in SlimPajama, outperforming the full-data pre-training within a 50B training budget, and achieving about 1.5x training efficiency and 5x data efficiency. Source code\nis available at: https://github.com/MediaBrain-SJTU/DiSF.git.",
      "keywords": [
        "file selection",
        "large language model",
        "pre-training",
        "submodular optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "nphsoKxlFs",
      "title": "Dynamic Contrastive Learning for Time Series Representation",
      "abstract": "Understanding events in time series is an important task in a variety of contexts. However, human analysis and labeling are expensive and time-consuming. Therefore, it is advantageous to learn embeddings for moments in time series in an unsupervised way, which allows for good performance in classification or detection tasks after later minimal human labeling. In this paper, we propose dynamic contrastive learning (DynaCL), an unsupervised representation learning framework for time series that uses temporal adjacent steps to define positive pairs. DynaCL adopts N-pair loss to dynamically treat all samples in a batch as positive or negative pairs, enabling efficient training and addressing the challenges of complicated sampling of positives. We demonstrate that DynaCL embeds instances from time series into well-defined, semantically meaningful clusters, which allows superior performance on downstream tasks on a variety of public time series datasets. Our findings also reveal that high scores on unsupervised clustering metrics do not guarantee that the representations are useful in downstream tasks.",
      "keywords": [
        "contrastive learning",
        "self-supervised learning",
        "time series analysis",
        "representation learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "NgLFQTBPRR",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "dCWSVAbWXM",
      "title": "Fence off Anomaly Interference: Cross-Domain Distillation for Fully Unsupervised Anomaly Detection",
      "abstract": "Fully Unsupervised Anomaly Detection (FUAD) is a practical extension of Unsupervised Anomaly Detection (UAD), aiming to detect anomalies without any labels even when the training set may contain anomalous samples. To achieve FUAD,  we pioneer the introduction of Knowledge Distillation (KD) paradigm based on teacher–student framework into the FUAD setting. However, due to the presence of anomalies in the training data, traditional KD methods risk enabling the student to learn the teacher’s representation of anomalies under FUAD setting, thereby resulting in poor anomaly detection performance. To address this issue, we propose a novel Cross-Domain Distillation (CDD) framework based on the widely studied reverse distillation (RD) paradigm. Specifically, we design a Domain-Specific Training, which divides the training set into multiple domains with lower anomaly ratios and train a domain-specific student for each. Cross-Domain Knowledge Aggregation is then performed, where pseudo-normal features generated by domain-specific students collaboratively guide a global student to learn generalized normal representations across all samples. Experimental results on noisy versions of the MVTec AD and VisA datasets demonstrate that our method achieves significant performance improvements over the baseline, validating its effectiveness under FUAD setting.",
      "keywords": [
        "Anomaly Detection",
        "Unsupervised Learning",
        "Knowledge Distillation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "koEALFNBj1",
      "title": "Representation Entanglement for Generation: Training Diffusion Transformers Is Much Easier Than You Think",
      "abstract": "REPA and its variants effectively mitigate training challenges in diffusion models by incorporating external visual representations from pretrained models, through alignment between the noisy hidden projections of denoising networks and foundational clean image representations. We argue that the external alignment, which is absent during the entire denoising inference process, falls short of fully harnessing the potential of discriminative representations. In this work, we propose a straightforward method called \\textit{\\textbf{R}epresentation \\textbf{E}ntanglement for \\textbf{G}eneration} (\\textbf{REG}), which entangles low-level image latents with a single high-level class token from pretrained foundation models for denoising. \nREG acquires the capability to produce coherent image-class pairs directly from pure noise, substantially improving both generation quality and training efficiency.\nThis is accomplished with negligible additional inference overhead, requiring only one single additional token for denoising (<0.5\\% increase in FLOPs and latency).\nThe inference process concurrently reconstructs both image latents and their corresponding global semantics, where the acquired semantic knowledge actively guides and enhances the image generation process.\nOn ImageNet 256$\\times$256, SiT-XL/2 + REG demonstrates remarkable convergence acceleration, achieving $\\textbf{63}\\times$ and $\\textbf{23}\\times$ faster training than SiT-XL/2 and SiT-XL/2 + REPA, respectively. \nMore impressively, SiT-L/2 + REG trained for merely 400K iterations outperforms SiT-XL/2 + REPA trained for 4M iterations ($\\textbf{10}\\times$ longer). Code is available at: https://github.com/Martinser/REG.",
      "keywords": [
        "Diffusion Model Acceleration; Representation Entanglement; Diffusion Transformers"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "hq2CkcEY7h",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "ImpizBSKcu",
      "title": "Dynamical Decoupling of Generalization and Overfitting in Large Two-Layer Networks",
      "abstract": "Understanding the inductive bias and generalization properties of large overparametrized machine learning models requires to characterize the dynamics of the training algorithm.  We study the learning dynamics of large two-layer neural networks via dynamical mean field theory, a well established technique of non-equilibrium statistical physics. We show that, for large network width $m$,\nand large number of samples per input dimension $n/d$, the training dynamics exhibits a separation of timescales which implies:\n$(i)$ The emergence of a slow time scale associated with the growth in Gaussian/Rademacher complexity of the network;\n$(ii)$ Inductive bias towards small complexity if the initialization has small enough complexity;\n$(iii)$ A dynamical decoupling between feature learning and overfitting regimes; $(iv)$ A non-monotone behavior of the test error, associated  `feature unlearning' regime at large times.",
      "keywords": [
        "Overfitting; feature learning; dynamical mean field theory; generalization;"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xEzQCFSfPG",
      "title": "Grokking and Generalization Collapse: Insights from HTSR theory",
      "abstract": "Grokking is a surprising phenomenon in neural network training where test accuracy remains low for an extended period despite near-perfect training accuracy, only to suddenly leap to strong generalization. In this work, we study grokking using a depth-3, width-200 ReLU MLP trained on a subset of MNIST. We investigate it's long-term dynamics under both weight-decay and, critically, no-decay regimes—the latter often characterized by increasing $l^2$ weight norms. Our primary tool is the theory of Heavy-Tailed Self-Regularization **HTSR**, where we track the heavy-tailed exponent $\\alpha$. We find that $\\alpha$ reliably predicts both the initial grokking transition and subsequent anti-grokking. We benchmark these insights against four prior approaches: progress measures---Activation Sparsity, Absolute Weight Entropy, and Approximate Local Circuit Complexity ---and weight norm ($l^2$) analysis.\nOur experiments show that while comparative approaches register significant changes, **in this regime of increasing $l^2$ norm, the heavy-tailed exponent $\\alpha$ demonstrates a unique correlation with the ensuing large, long-term dip in test accuracy, a signal not reliably captured by most other measures.**\n\n\n\nExtending our zero weight decay experiment significantly beyond typical timescales ($10^{5}$ to approximately $10^{7}$ optimization steps), **we reveal a late-stage catastrophic generalization collapse (``anti-grokking''), characterized by a dramatic drop in test accuracy (over 25 percentage points) while training accuracy remains perfect**; notably, the heavy-tail metric $\\alpha$ uniquely provides an early warning of this impending collapse. Our results underscore the utility of Heavy-Tailed Self-Regularization theory for tracking generalization dynamics, even in the challenging regimes without explicit weight decay regularization.",
      "keywords": [
        "Grokking",
        "Heavy-Tailed Self-Regularization",
        "Random Matrix Theory",
        "Heavy-Tail Exponent",
        "Spectral Analysis",
        "Generalization Dynamics",
        "Catastrophic Generalization Collapse",
        "Implicit Regularization"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "sPafJfwI2I",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "aUAG1WS7J2",
      "title": "Class-wise Balancing Data Replay for Federated Class-Incremental Learning",
      "abstract": "Federated Class Incremental Learning (FCIL) aims to collaboratively process continuously increasing incoming tasks across multiple clients. Among various approaches, data replay has become a promising solution, which can alleviate forgetting by reintroducing representative samples from previous tasks. However, their performance is typically limited by class imbalance, both within the replay buffer due to limited global awareness and between replayed and newly arrived classes. To address this issue, we propose a class-wise balancing data replay method for FCIL (FedCBDR), which employs a global coordination mechanism for class-level memory construction and reweights the learning objective to alleviate the aforementioned imbalances. Specifically, FedCBDR has two key components: 1) the global-perspective data replay module reconstructs global representations of prior task knowledge in a privacy-preserving manner, which then guides a class-aware and importance-sensitive sampling strategy to achieve balanced replay; 2) Subsequently, to handle class imbalance across tasks, the task-aware temperature scaling module adaptively adjusts the temperature of logits at both class and instance levels based on task dynamics, which reduces the model’s overconfidence in majority classes while enhancing its sensitivity to minority classes. Experimental results verified that FedCBDR achieves balanced class-wise sampling under heterogeneous data distributions and improves generalization under task imbalance between earlier and recent tasks, yielding a 2%-15% Top-1 accuracy improvement over six state-of-the-art methods.",
      "keywords": [
        "Federated Learning;Federated Class-Incremental Learning; Data Replay"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "sPafJfwI2I",
      "title": "Reshape-then-Factorize: Communication-Efficient FL via Model-Agnostic Projection Optimization",
      "abstract": "Federated learning (FL) enables collaborative model training across distributed clients without sharing sensitive data. However, communication overhead remains a significant bottleneck, particularly for large-scale models. Low-rank decomposition techniques address this by approximating each layer’s weights or gradients with a product of low-rank matrices, thereby reducing the communication cost in FL. While effective, these methods are constrained by the layer's architecture and shapes, limiting their flexibility and performance.\nWe propose *Model-Agnostic Projection Optimization* (MAPO), a novel method that reshapes and factorizes the full model gradient into a *fixed reconstruction matrix* and a *trainable projection vector*, avoiding layer-wise decomposition and architecture constraints. MAPO directly optimizes the projection in a randomly sampled subspace, with all clients generating the reconstruction matrix via a shared random seed, incurring no additional communication overhead for synchronization.\nBy decoupling the gradient from architectural constraints through reshaping and enabling communication-free exploration of dynamic subspaces via seed sharing, MAPO provides a more flexible and efficient low-rank representation.\nEmpirical results demonstrate the effectiveness of MAPO in various FL settings.",
      "keywords": [
        "Federated Learning",
        "Low-Rank Adaptation",
        "Communication Efficiency",
        "Subspace Optimization"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "1KXST1ksJ2",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "nMUpDatZBh",
      "title": "VICON: Vision In-Context Operator Networks for Multi-Physics Fluid Dynamics",
      "abstract": "In-Context Operator Networks (ICONs) have demonstrated the ability to learn operators across diverse partial differential equations using few-shot, in-context learning. However, existing ICONs process each spatial point as an individual token, severely limiting computational efficiency when handling dense data in higher spatial dimensions. We propose \\textit{Vision In-Context Operator Networks} (VICON), which integrates vision transformer architectures to efficiently process 2D data through patch-wise operations while preserving ICON's adaptability to multiphysics systems and varying timesteps. Evaluated across three fluid dynamics benchmarks, VICON significantly outperforms state-of-the-art baselines: DPOT and MPP, reducing the averaged last-step rollout error by 37.9\\% compared to DPOT and 44.7\\% compared to MPP, while requiring only 72.5\\% and 34.8\\% of their respective inference times. VICON naturally supports flexible rollout strategies with varying timestep strides, enabling immediate deployment in \\textit{imperfect measurement systems} where sampling frequencies may differ or frames might be dropped—common challenges in real-world settings—without requiring retraining or interpolation. In these realistic scenarios, VICON exhibits remarkable robustness, experiencing only 24.41\\% relative performance degradation compared to 71.37\\%-74.49\\% degradation in baseline methods, demonstrating its versatility for depolying in realistic applications.",
      "keywords": [
        "AI4Science",
        "Learning PDE",
        "Fluid Dynamics",
        "In-Context Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "s0JVsx3bx1",
      "title": "1000 Layer Networks for Self-Supervised RL: Scaling Depth Can Enable New Goal-Reaching Capabilities",
      "abstract": "Scaling up self-supervised learning has driven breakthroughs in language and vision, yet comparable progress has remained elusive in reinforcement learning (RL). In this paper, we study building blocks for self-supervised RL that unlock substantial improvements in scalability, with network depth serving as a critical factor. Whereas most RL papers in recent years have relied on shallow architectures (around 2 -- 5 layers), we demonstrate that increasing the depth up to 1024 layers can significantly boost performance.\nOur experiments are conducted in an unsupervised goal-conditioned setting, where no demonstrations or rewards are provided, so an agent must explore (from scratch) and learn how to maximize the likelihood of reaching commanded goals.\nEvaluated on simulated locomotion and manipulation tasks, our approach increases performance on the self-supervised contrastive RL algorithm by $2\\times$ -- $50\\times$, outperforming other goal-conditioned baselines.\nIncreasing the model depth not only increases success rates but also qualitatively changes the behaviors learned.",
      "keywords": [
        "Reinforcement Learning",
        "Self-Supervised Learning",
        "Contrastive RL",
        "Goal-conditioned RL",
        "Scaling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "H8fscnm6Xx",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "6JlzhISecd",
      "title": "A Stochastic Approximation Approach for Efficient Decentralized Optimization on Random Networks",
      "abstract": "A challenging problem in decentralized optimization is to develop algorithms with fast convergence on random and time varying topologies under unreliable and bandwidth-constrained communication network. This paper studies a stochastic approximation approach with a Fully Stochastic Primal Dual Algorithm (FSPDA) framework. Our framework relies on a novel observation that the randomness in time varying topology can be incorporated in a stochastic augmented Lagrangian formulation, whose expected value admits saddle points that coincide with stationary solutions of the decentralized optimization problem. With the FSPDA framework, we develop two new algorithms supporting efficient sparsified communication on random time varying topologies --- FSPDA-SA allows agents to execute multiple local gradient steps depending on the time varying topology to accelerate convergence, and FSPDA-STORM further incorporates a variance reduction step to improve sample complexity. For problems with smooth (possibly non-convex) objective function, within $T$ iterations, we show that FSPDA-SA (resp. FSPDA-STORM) finds an $\\mathcal{O}( 1/\\sqrt{T} )$-stationary (resp. $\\mathcal{O}( 1/T^{2/3} )$) solution. Numerical experiments show the benefits of the FSPDA algorithms.",
      "keywords": [
        "Decentralized Optimization",
        "Time Varying Graph",
        "Random Network",
        "Primal-dual Lagrangian",
        "Stochastic Approximation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "7AwFJzgIUW",
      "title": "Dynamical Low-Rank Compression of Neural Networks with Robustness under Adversarial Attacks",
      "abstract": "Deployment of neural networks on resource-constrained devices demands models that are both compact and robust to adversarial inputs. However, compression and adversarial robustness often conflict. In this work, we introduce a dynamical low-rank training scheme enhanced with a novel spectral regularizer that controls the condition number of the low-rank core in each layer. This approach mitigates the sensitivity of compressed models to adversarial perturbations without sacrificing clean accuracy. The method is model- and data-agnostic, computationally efficient, and supports rank adaptivity to automatically compress the network at hand. Extensive experiments across standard architectures, datasets, and adversarial attacks show the regularized networks can achieve over 94 compression while recovering or improving adversarial accuracy relative to uncompressed baselines.",
      "keywords": [
        "Low Rank",
        "Adversarial Robustenss",
        "Adversarial Attacks",
        "Rank Adaptive",
        "Computer Vision",
        "Compression"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "heJ7NRInjs",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "0Y4gjqdvC6",
      "title": "Fundamental Limits of Game-Theoretic LLM Alignment: Smith Consistency and Preference Matching",
      "abstract": "Nash Learning from Human Feedback (NLHF) is a game-theoretic framework for aligning large language models (LLMs) with human preferences by modeling learning as a two-player zero-sum game. However, using raw preference as the payoff in the game highly limits the potential of the game-theoretic LLM alignment framework.In this paper, we systematically study using what choices of payoff based on the pairwise human preferences can yield desirable alignment properties. We establish necessary and sufficient conditions for Condorcet consistency, diversity through mixed strategies, and Smith consistency. These results provide a theoretical foundation for the robustness of game-theoretic LLM alignment. Further, we show the impossibility of preference matching—i.e., no smooth and learnable mappings of pairwise preferences can guarantee a unique Nash equilibrium that matches a target policy, even under standard assumptions like the Bradley-Terry-Luce (BTL) model. This result highlight the fundamental limitation of game-theoretic LLM alignment.",
      "keywords": [
        "Large Language Models",
        "Preference Alignment",
        "Nash Equilibrium",
        "Nash Learning from Human Feedback"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "NM8Apk61NA",
      "title": "HyperET: Efficient Training in Hyperbolic Space for Multi-modal Large Language Models",
      "abstract": "Multi-modal large language models (MLLMs) have emerged as a transformative approach for aligning visual and textual understanding. They typically require extremely high computational resources (e.g., thousands of GPUs) for training to achieve cross-modal alignment at multi-granularity levels. We argue that a key source of this inefficiency lies in the vision encoders they widely equip with, e.g., CLIP and SAM, which lack the alignment with language at multi-granularity levels. To address this issue, in this paper, we leverage hyperbolic space, which inherently models hierarchical levels and thus provides a principled framework for bridging the granularity gap between visual and textual modalities at an arbitrary granularity level. Concretely, we propose an efficient training paradigm for MLLMs, dubbed as \\blg, which can optimize visual representations to align with their textual counterparts at an arbitrary granularity level through dynamic hyperbolic radius adjustment in hyperbolic space. \\alg employs learnable matrices with M\\\"{o}bius multiplication operations, implemented via three effective configurations: diagonal scaling matrices, block-diagonal matrices, and banded matrices, providing a flexible yet efficient parametrization strategy. Comprehensive experiments across multiple MLLM benchmarks demonstrate that \\alg consistently improves both existing pre-training and fine-tuning MLLMs clearly with less than 1\\% additional parameters. Code is available at \\url{https://github.com/godlin-sjtu/HyperET}.",
      "keywords": [
        "Efficient Training",
        "Multi-modal Large Language Models",
        "Granularity Levels",
        "Hyperbolic Space"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "M4Laq0Y5WG",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "NM8Apk61NA",
      "title": "HyperET: Efficient Training in Hyperbolic Space for Multi-modal Large Language Models",
      "abstract": "Multi-modal large language models (MLLMs) have emerged as a transformative approach for aligning visual and textual understanding. They typically require extremely high computational resources (e.g., thousands of GPUs) for training to achieve cross-modal alignment at multi-granularity levels. We argue that a key source of this inefficiency lies in the vision encoders they widely equip with, e.g., CLIP and SAM, which lack the alignment with language at multi-granularity levels. To address this issue, in this paper, we leverage hyperbolic space, which inherently models hierarchical levels and thus provides a principled framework for bridging the granularity gap between visual and textual modalities at an arbitrary granularity level. Concretely, we propose an efficient training paradigm for MLLMs, dubbed as \\blg, which can optimize visual representations to align with their textual counterparts at an arbitrary granularity level through dynamic hyperbolic radius adjustment in hyperbolic space. \\alg employs learnable matrices with M\\\"{o}bius multiplication operations, implemented via three effective configurations: diagonal scaling matrices, block-diagonal matrices, and banded matrices, providing a flexible yet efficient parametrization strategy. Comprehensive experiments across multiple MLLM benchmarks demonstrate that \\alg consistently improves both existing pre-training and fine-tuning MLLMs clearly with less than 1\\% additional parameters. Code is available at \\url{https://github.com/godlin-sjtu/HyperET}.",
      "keywords": [
        "Efficient Training",
        "Multi-modal Large Language Models",
        "Granularity Levels",
        "Hyperbolic Space"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "csjryswwao",
      "title": "SyntheOcc: Synthesize Occupancy-Controlled Street View Images through 3D Semantic MPIs",
      "abstract": "The advancement of autonomous driving is increasingly reliant on high-quality annotated datasets, especially in the task of 3D occupancy prediction, where the occupancy labels require dense 3D annotation with significant human effort. In this paper, we propose SyntheOcc, which denotes a diffusion model that Synthesize photorealistic and geometric controllable images by conditioning Occupancy labels in driving scenarios. This yields an unlimited amount of diverse, annotated, and controllable datasets for applications like training perception models and simulation. SyntheOcc addresses the critical challenge of how to efficiently encode 3D geometric information as conditional input to a 2D diffusion model. Our approach innovatively incorporates 3D semantic multi-plane images (MPIs) to provide comprehensive and spatially aligned 3D scene descriptions for conditioning. By doing so, SyntheOcc can generate photorealistic multi-view images and videos that faithfully align with the given geometric labels (semantics in 3D voxel space). Extensive qualitative and quantitative evaluations of SyntheOcc on the nuScenes dataset prove its effectiveness in generating controllable occupancy datasets that serve as an effective data augmentation to perception models.",
      "keywords": [
        "Autonomous Driving",
        "Generative Model",
        "Image and Video Generation",
        "Data-centric AI",
        "3D Vision"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "gkcU26BOml",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "dZRCZUvKPj",
      "title": "Get RICH or Die Scaling: Profitably Trading Inference Compute for Robustness",
      "abstract": "Recent work shows that increasing inference-time compute through generation of long reasoning traces improves not just capability scores, but robustness to various text jailbreaks designed to control models or lower their guardrails. However, multimodal reasoning offers comparatively little defense against vision jailbreaks, which typically succeed by creating noise-like perturbations. When attacking a robust model, vision attacks are also capable of and often must resort to producing human-interpretable perturbations. Rather than operating in a model's blind-spot or out of its training distribution, such interpretable attacks construct familiar concepts connected to the attacker's goal. Inspired by the ability of robust models to force attacks into this space that appears more in-distribution for reasoning tasks, we posit the Robustness from Inference Compute Hypothesis (RICH): defending against attacks with inference compute (like reasoning) profits as those attacks become more in-distribution. To test this, we adversarially attack models of varying robustness with black-box-transfer and white-box attacks. RICH predicts a rich-get-richer dynamic: models that start with higher initial robustness gain more robustness benefits from increases in inference-time compute. Consistent with RICH, we find that robust models benefit more from increased compute, whereas non-robust models show little to no improvement. Our work suggests that inference-time compute can be an effective defense against adversarial attacks, provided the base model has some degree of robustness. In particular, layering disparate train-time and test-time defenses aids robustness not additively, but synergistically.",
      "keywords": [
        "VLMs",
        "robustness",
        "adversarial attacks",
        "reasoning",
        "scaling",
        "efficiency"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "qYkhCah8OZ",
      "title": "Boosting Knowledge Utilization in Multimodal Large Language Models via Adaptive Logits Fusion and Attention Reallocation",
      "abstract": "Despite their recent progress, Multimodal Large Language Models (MLLMs) often struggle in knowledge-intensive tasks due to the limited and outdated parametric knowledge acquired during training. Multimodal Retrieval Augmented Generation addresses this issue by retrieving contextual knowledge from external databases, thereby enhancing MLLMs with expanded knowledge sources. \nHowever, existing MLLMs often fail to fully leverage the retrieved contextual knowledge for response generation. We examine representative MLLMs and identify two major causes, namely, attention bias toward different tokens and knowledge conflicts between parametric and contextual knowledge. To this end, we design Adaptive Logits Fusion and Attention Reallocation (ALFAR), a training-free and plug-and-play approach that improves MLLM responses by maximizing the utility of the retrieved knowledge. Specifically, ALFAR tackles the challenges from two perspectives. First, it alleviates attention bias by adaptively shifting attention from visual tokens to relevant context tokens according to query-context relevance. Second, it decouples and weights parametric and contextual knowledge at output logits, mitigating conflicts between the two types of knowledge. As a plug-and-play method, ALFAR achieves superior performance across diverse datasets without requiring additional training or external tools. Extensive experiments over multiple MLLMs and benchmarks show that ALFAR consistently outperforms the state-of-the-art by large margins. Our code and data are available at https://github.com/Lackel/ALFAR.",
      "keywords": [
        "Multimodal Large Language Models",
        "Multimodal Retrieval Augmented Generation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "WOzffPgVjF",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "WOzffPgVjF",
      "title": "Knowing Your Target: Target-Aware Transformer Makes Better Spatio-Temporal Video Grounding",
      "abstract": "Transformer has attracted increasing interest in spatio-temporal video grounding, or STVG, owing to its end-to-end pipeline and promising result. Existing Transformer-based STVG approaches often leverage a set of object queries, which are initialized simply using zeros and then gradually learn target position information via iterative interactions with multimodal features, for spatial and temporal localization. Despite simplicity, these zero object queries, due to lacking target-specific cues, are hard to learn discriminative target information from interactions with multimodal features in complicated scenarios (e.g., with distractors or occlusion), resulting in degradation. Addressing this, we introduce a novel $\\textbf{T}$arget-$\\textbf{A}$ware Transformer for $\\textbf{STVG}$ ($\\textbf{TA-STVG}$), which seeks to adaptively generate object queries via exploring target-specific cues from the given video-text pair, for improving STVG. The key lies in two simple yet effective modules, comprising text-guided temporal sampling (TTS) and attribute-aware spatial activation (ASA), working in a cascade. The former focuses on selecting target-relevant temporal cues from a video utilizing holistic text information, while the latter aims at further exploiting the fine-grained visual attribute information of the object from previous target-aware temporal cues, which is applied for object query initialization. Compared to existing methods leveraging zero-initialized queries, object queries in our TA-STVG, directly generated from a given video-text pair, naturally carry target-specific cues, making them adaptive and better interact with multimodal features for learning more discriminative information to improve STVG. In our experiments on three benchmarks, including HCSTVG-v1/-v2 and VidSTG, TA-STVG achieves state-of-the-art performance and significantly outperforms the baseline, validating its efficacy. Moreover, TTS and ASA are designed for general purpose. When applied to existing methods such as TubeDETR and STCAT, we show substantial performance gains, verifying its generality. Code is released at https://github.com/HengLan/TA-STVG.",
      "keywords": [
        "Spatio-Temporal Video Grounding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xSOl0s1u77",
      "title": "TC-Bench: Benchmarking Temporal Compositionality in Conditional Video Generation",
      "abstract": "Video generation has many unique challenges beyond those of image generation. The temporal dimension introduces extensive possible variations across frames, over which consistency and continuity may be violated. In this study, we move beyond evaluating simple actions and argue that generated videos should incorporate the emergence of new concepts and their relation transitions like in real-world videos as time progresses. To assess the \\textbf{T}emporal \\textbf{C}ompositionality of video generation models, we propose TC-Bench, a benchmark of meticulously crafted text prompts, corresponding ground truth videos, and robust evaluation metrics. The prompts articulate the initial and final states of scenes, effectively reducing ambiguities for frame development and simplifying the assessment of transition completion. In addition, by collecting aligned real-world videos corresponding to the prompts, we expand TC-Bench's applicability from text-conditional models to image-conditional ones that can perform generative frame interpolation. We also develop new metrics to measure the completeness of component transitions in generated videos, which demonstrate significantly higher correlations with human judgments than existing metrics. Our comprehensive experimental results reveal that most video generators achieve less than ～20% of the compositional changes, highlighting enormous space for future improvement. Our analysis indicates that current video generation models struggle to interpret descriptions of compositional changes and dynamically map varied semantics across different time steps.",
      "keywords": [
        "Video Generation Benchmark; Text-to-Video Generation; Compositional Video Generation"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "HD6bWcj87Y",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "BQgAToASdX",
      "title": "Generalized Group Data Attribution",
      "abstract": "Data Attribution (DA) methods quantify the influence of individual training data points on model outputs and have broad applications such as explainability, data selection, and noisy label identification. However, existing DA methods are often computationally intensive, limiting their applicability to large-scale machine learning models. To address this challenge, we introduce the Generalized Group Data Attribution (GGDA) framework, which computationally simplifies DA by attributing to groups of training points instead of individual ones. GGDA is a general framework that subsumes existing attribution methods and can be applied to new DA techniques as they emerge. It allows users to optimize the trade-off between efficiency and fidelity based on their needs. Our empirical results demonstrate that GGDA applied to popular DA methods such as Influence Functions, TracIn, and TRAK results in upto 10x-50x speedups over standard DA methods while gracefully trading off attribution fidelity. For downstream applications such as dataset pruning and noisy label identification, \nwe demonstrate that GGDA significantly improves computational efficiency and maintains effectiveness, enabling practical applications in large-scale machine learning scenarios that were previously infeasible.",
      "keywords": [
        "generalized",
        "group",
        "data attribution",
        "efficiency",
        "training data",
        "influence",
        "tracin",
        "trak"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "HD6bWcj87Y",
      "title": "Data Shapley in One Training Run",
      "abstract": "Data Shapley offers a principled framework for attributing the contribution of data within machine learning contexts. However, the traditional notion of Data Shapley requires re-training models on various data subsets, which becomes computationally infeasible for large-scale models. Additionally, this retraining-based definition cannot evaluate the contribution of data for a specific model training run, which may often be of interest in practice. This paper introduces a novel concept, In-Run Data Shapley, which eliminates the need for model retraining and is specifically designed for assessing data contribution for a particular model of interest. In-Run Data Shapley calculates the Shapley value for each gradient update iteration and accumulates these values throughout the training process. We present several techniques that allow the efficient scaling of In-Run Data Shapley to the size of foundation models. In its most optimized implementation, our method adds negligible runtime overhead compared to standard model training. This dramatic efficiency improvement makes it possible to perform data attribution for the foundation model pretraining stage. We present several case studies that offer fresh insights into pretraining data's contribution and discuss their implications for copyright in generative AI and pretraining data curation.",
      "keywords": [
        "Shapley value",
        "data valuation."
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "FDnZFpHmU4",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "f4gF6AIHRy",
      "title": "Combatting Dimensional Collapse in LLM Pre-Training Data via Submodular File Selection",
      "abstract": "Selecting high-quality pre-training data for large language models (LLMs) is crucial for enhancing their overall performance under limited computation budget, improving both training and sample efficiency. Recent advancements in file selection primarily rely on using an existing or trained proxy model to assess the similarity of samples to a target domain, such as high quality sources BookCorpus and Wikipedia. However, upon revisiting these methods, the domain-similarity selection criteria demonstrates a diversity dilemma, i.e. dimensional collapse in the feature space, improving performance on the domain-related tasks but causing severe degradation on generic performance.To prevent collapse and enhance diversity, we propose a DiverSified File selection algorithm (DiSF), which selects the most decorrelated text files in the feature space. We approach this with a classical greedy algorithm to achieve more uniform eigenvalues in the feature covariance matrix of the selected texts, analyzing its approximation to the optimal solution under a formulation of $\\gamma$-weakly submodular optimization problem. Empirically, we establish a benchmark and conduct extensive experiments on the TinyLlama architecture with models from 120M to 1.1B parameters. Evaluating across nine tasks from the Harness framework, DiSF demonstrates a significant improvement on overall performance. Specifically, DiSF saves 98.5\\% of 590M training files in SlimPajama, outperforming the full-data pre-training within a 50B training budget, and achieving about 1.5x training efficiency and 5x data efficiency. Source code\nis available at: https://github.com/MediaBrain-SJTU/DiSF.git.",
      "keywords": [
        "file selection",
        "large language model",
        "pre-training",
        "submodular optimization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "qLxkXgmWwx",
      "title": "Investigating Factuality in Long-Form Text Generation: The Roles of Self-Known and Self-Unknown",
      "abstract": "Large language models (LLMs) have demonstrated strong capabilities in text understanding and generation. However, they often lack factuality, producing a mixture of true and false information, especially in long-form generation. In this work, we investigates the factuality of long-form text generation across various large language models (LLMs), including GPT-4, Gemini-1.5-Pro, Claude-3-Opus, Llama-3-70B, and Mistral. Our analysis reveals that factuality scores tend to decline in later sentences of the generated text, accompanied by a rise in the number of unsupported claims.\nFurthermore, we explore the effectiveness of different evaluation settings to assess whether LLMs can accurately judge the correctness of their own outputs: Self-Known (the percentage of supported atomic claims, decomposed from LLM outputs, that the corresponding LLMs judge as correct) and Self-Unknown (the percentage of unsupported atomic claims that the corresponding LLMs judge as incorrect). The results indicate that even advanced models like GPT-4 and Gemini-1.5-Pro fail to achieve perfect Self-Known scores, while their Self-Unknown scores remain notably above zero, reflecting ongoing uncertainty in their self-assessments.\nMoreover, we find a correlation between higher Self-Known scores and improved factuality, while higher Self-Unknown scores are associated with lower factuality. Interestingly, even without significant changes in the models' self-judgment (Self-Known and Self-Unknown), the number of unsupported claims can increases, likely as an artifact of long-form generation. These findings show the limitations of current LLMs in long-form generation, and provide valuable insights for improving factuality in long-form text generation.",
      "keywords": [
        "long-form generation",
        "Factuality"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "hUb2At2DsQ",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "NPDnRLFhc0",
      "title": "EvidenceBench: A Benchmark for Extracting Evidence from Biomedical Papers",
      "abstract": "We study the task of automatically finding evidence relevant to hypotheses in biomedical papers. Finding relevant evidence is an important stage when humans write systematic reviews about certain scientific hypotheses. We introduce EvidenceBench to measure models performance on this task, which is created by a novel pipeline that consists of hypothesis generation and sentence-by-sentence annotation of biomedical papers for relevant evidence, completely guided by and faithfully following existing human experts judgment. Our pipeline's value and accuracy is validated by teams of human experts. We evaluate a diverse set of language models and retrieval systems on the benchmark and find the performance of the best models still falls significantly short of expert-level on this task. To show the scalability of our proposed pipeline, we create a larger EvidenceBench-100k with 107,461 fully annotated papers with hypotheses to faciliate model training and development. Both datasets are available at https://github.com/EvidenceBench/EvidenceBench",
      "keywords": [
        "Biomedical Benchmark",
        "Scientific Information Retrieval",
        "Scientific Information Extraction",
        "Large Language Models",
        "BioNLP"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WJaUkwci9o",
      "title": "Self-Improvement in Language Models: The Sharpening Mechanism",
      "abstract": "Recent work in language modeling has raised the possibility of “self-improvement,” where an LLM evaluates and refines its own generations to achieve higher performance without external feedback. It is impossible for this self-improvement to create information that is not already in the model, so why should we expect that this will lead to improved capabilities? We offer a new theoretical perspective on the capabilities of self-improvement through a lens we refer to as “sharpening.” Motivated by the observation that language models are often better at verifying response quality than they are at generating correct responses, we formalize self-improvement as using the model itself as a verifier during post-training in order to ‘sharpen’ the model to one placing large mass on high-quality sequences, thereby amortizing the expensive inference-time computation of generating good sequences. We begin by introducing a new statistical framework for sharpening in which the learner has sample access to a pre-trained base policy. Then, we analyze two natural families of self improvement algorithms based on SFT and RLHF. We find that (i) the SFT-based approach is minimax optimal whenever the initial model has sufficient coverage, but (ii) the RLHF-based approach can improve over SFT-based self- improvement by leveraging online exploration, bypassing the need for coverage. We view these findings as a starting point toward a foundational understanding that can guide the design and evaluation of self-improvement algorithms.",
      "keywords": [
        "Learning theory",
        "Sample complexity",
        "Self-Improvement",
        "Language Models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "UWdPsY7agk",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "en3NwykrHW",
      "title": "Minimax Optimal Regret Bound for Reinforcement Learning with Trajectory Feedback",
      "abstract": "We study the reinforcement learning (RL) problem with trajectory feedback. The trajectory feedback based reinforcement learning problem, where the learner can only observe the accumulative noised reward along the trajectory, is particularly suitable for the practical scenarios where the agent suffers extensively from querying the reward in each single step. For a finite-horizon Markov Decision Process (MDP) with $S$ states, $A$ actions and a horizon length of $H$, we develop an algorithm that enjoys an optimal regret of $\\tilde{O}\\left(\\sqrt{SAH^3K}\\right)$ in $K$ episodes for sufficiently large $K$. To achieve this, our technical contributions are two-fold: (1) we incorporate reinforcement learning with linear bandits problem to construct a tighter confidence region for the reward function; (2) we construct a reference transition model to better guide the exploration process.",
      "keywords": [
        "Reinforcement learning theory",
        "regret analysis",
        "trajectory feedback"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "stUKwWBuBm",
      "title": "Tractable Multi-Agent Reinforcement Learning through Behavioral Economics",
      "abstract": "A significant roadblock to the development of principled multi-agent reinforcement learning (MARL) algorithms is the fact that desired solution concepts like Nash equilibria may be intractable to compute. We show how one can overcome this obstacle by introducing concepts from behavioral economics into MARL. To do so, we imbue agents with two key features of human decision-making: risk aversion and bounded rationality. We show that introducing these two properties into games gives rise to a class of equilibria---risk-averse quantal response equilibria (RQE)---which are tractable to compute in \\emph{all} $n$-player matrix and finite-horizon Markov games.  In particular, we show that they emerge as the endpoint of no-regret learning in suitably adjusted versions of the games. Crucially, the class of computationally tractable RQE is independent of the underlying game structure and only depends on agents' degrees of risk-aversion and bounded rationality.  To validate the expressivity of this class of solution concepts we show that it captures peoples' patterns of play in a number of 2-player matrix games previously studied in experimental economics. Furthermore, we give a first analysis of the sample complexity of computing these equilibria in finite-horizon Markov games when one has access to a generative model. We validate our findings on a simple multi-agent reinforcement learning benchmark. Our results open the doors for to the principled development of new decentralized multi-agent reinforcement learning algorithms.",
      "keywords": [
        "behavioral economics",
        "risk-aversion",
        "multi-agent reinforcement learning",
        "quantal response",
        "bounded rationality"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "vyflgpwfJW",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "lXFGpwtkRl",
      "title": "Improving Model Alignment Through Collective Intelligence of Open-Source Models",
      "abstract": "Building helpful and harmless large language models (LLMs) requires effective model alignment approach based on human instructions and feedback; this necessitates high-quality human-labeled data. Constructing such datasets is often expensive and not scalable, and may face potential bottleneck on diversity. To address these challenges, we introduce Mixture-of-Agent Alignment (MoAA), an effective approach that leverages the collective strengths of various language models to provide high-quality data for model alignment. By employing MoAA, we enhance both supervised fine-tuning (SFT) and preference optimization, leading to improved performance compared to using a single model alone, including the state-of-ther-art commercial model. This approach leads to an intriguing direction of model alignment through an scalable and diverse instruction data recipe based on open-sourced models.",
      "keywords": [
        "Model Alignment",
        "Multi-Agent Inference",
        "Large Language Model"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mtSSFiqW6y",
      "title": "Judge Decoding: Faster Speculative Sampling Requires Going Beyond Model Alignment",
      "abstract": "The performance of large language models (LLMs) is closely linked to their underlying size, leading to ever-growing networks and hence slower inference. Speculative decoding has been proposed as a technique to accelerate autoregressive generation, leveraging a fast draft model to propose candidate tokens, which are then verified in parallel based on their likelihood under the target model. While this approach guarantees to reproduce the target output, it incurs a substantial penalty: many high-quality draft tokens are rejected, even when they represent objectively valid continuations. Indeed, we show that even powerful draft models such as GPT-4o, as well as human text cannot achieve high acceptance rates under the standard verification scheme. This severely limits the speedup potential of current speculative decoding methods, as an early rejection becomes overwhelmingly likely when solely relying on alignment of draft and target.\nWe thus ask the following question: Can we adapt verification to recognize correct, but non-aligned replies? To this end, we draw inspiration from the LLM-as-a-judge framework, which demonstrated that LLMs are able to rate answers in a versatile way. We carefully design a dataset coined TokenCourt to elicit the same capability in the target model by training a compact module on top of the embeddings to produce ``judgements\" of the current continuation. We showcase our strategy on the Llama-3.1 family, where our 8B/405B-Judge achieves a speedup of $9\\times$ over Llama-405B, while maintaining its quality on a large range of benchmarks. These benefits remain present even in optimized inference frameworks, where our method reaches up to $141$ tokens/s for 8B/70B-Judge and $129$ tokens/s for 8B/405B on $2$ and $8$ H100s respectively.",
      "keywords": [
        "LLM inference",
        "speculative decoding"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "zi8YBcmXqA",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "UHPnqSTBPO",
      "title": "Trust or Escalate: LLM Judges with Provable Guarantees for Human Agreement",
      "abstract": "We present a principled approach to provide LLM-based evaluation with a rigorous guarantee of human agreement. We first propose that a reliable evaluation method should not uncritically rely on model preferences for pairwise evaluation, but rather assess the confidence of judge models and selectively decide when to trust its judgement. We then show that under this *selective evaluation* framework, human agreement can be provably guaranteed---such that the model evaluation aligns with that of humans to a user-specified agreement level. As part of our framework, we also introduce *Simulated Annotators*, a novel confidence estimation method that significantly improves judge calibration and thus enables high coverage of evaluated instances. Finally, we propose *Cascaded Selective Evaluation*, where we use cheaper models as initial judges and escalate to stronger models only when necessary---again, while still providing a provable guarantee of human agreement. Experimental results show that Cascaded Selective Evaluation guarantees strong alignment with humans, far beyond what LLM judges could achieve without selective evaluation. For example, on a subset of Chatbot Arena where GPT-4 almost never achieves 80% human agreement, our method, even while employing substantially cost-effective models such as Mistral-7B, *guarantees* over 80% human agreement with almost 80% test coverage.",
      "keywords": [
        "Large Language Model",
        "LLM",
        "LLM Judge",
        "Evaluation",
        "Alignment"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zi8YBcmXqA",
      "title": "PokeChamp: an Expert-level Minimax Language Agent for Competitive Pokemon",
      "abstract": "We introduce \\texttt{Pok\\'eChamp}, a Large Language Model (LLM) powered game-theoretic aware agent for two-player competitive Pok\\'emon battles, that uses an LLM prior and collected high-Elo human data to model minimax search without any additional training. \\texttt{Pok\\'eChamp} uses a depth-limited minimax search online where the LLM replaces three key components: 1) action sampling from the LLM guided by prompts (including from a damage calculation tool), 2) opponent-modeling via the historical likelihood of actions from our dataset to model the effect of LLM-predicted opponent actions, and 3) state value calculation for the LLM to reflect on each intrinsic state. \\texttt{Pok\\'eChamp} outperforms all existing AIs (76\\%) and heuristic bots (84\\%) by an enormous margin, including winning consistently (>50\\%) against prior human-parity work run with a frontier model, GPT 4-o, while using an open-source 8 billion parameter Llama 3.1 model. \\texttt{Pok\\'eChamp} achieves expert performance in the top 10\\% of players on the online ladder against competitive human players at an Elo of 1500. Finally, we collect the largest Pok\\'emon battling dataset, including 1 million+ games with 150k+ high Elo games, prepare a series of battling benchmarks based on real player data and puzzles to analyze specific battling abilities, and provide crucial updates to the local game engine. Our code is available \\href{https://sites.google.com/view/pokechamp-llm}{online}.",
      "keywords": [
        "multiagent",
        "LLM agents",
        "competitive games",
        "game theory",
        "reinforcement learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "vhPE3PtTgC",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "asA7vvsgcI",
      "title": "Detecting Training Data of Large Language Models via Expectation Maximization",
      "abstract": "The widespread deployment of large language models (LLMs) has led to impressive advancements, yet information about their training data, a critical factor in their performance, remains undisclosed. Membership inference attacks (MIAs) aim to determine whether a specific instance was part of a target model's training data. MIAs can offer insights into LLM outputs and help detect and address concerns such as data contamination and compliance with privacy and copyright standards. However, applying MIAs to LLMs presents unique challenges due to the massive scale of pre-training data and the ambiguous nature of membership. Additionally, creating appropriate benchmarks to evaluate MIA methods is not straightforward, as training and test data distributions are often unknown. In this paper, we introduce EM-MIA, a novel MIA method for LLMs that iteratively refines membership scores and prefix scores via an expectation-maximization algorithm, leveraging the duality that the estimates of these scores can be improved by each other. Membership scores and prefix scores assess how each instance is likely to be a member and discriminative as a prefix, respectively. Our method achieves state-of-the-art results on the WikiMIA dataset. To further evaluate EM-MIA, we present OLMoMIA, a benchmark built from OLMo resources, which allows us to control the difficulty of MIA tasks with varying degrees of overlap between training and test data distributions. We believe that EM-MIA serves as a robust MIA method for LLMs and that OLMoMIA provides a valuable resource for comprehensively evaluating MIA approaches, thereby driving future research in this critical area.",
      "keywords": [
        "large language models",
        "membership inference attack",
        "data contamination",
        "memorization"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vf5aUZT0Fz",
      "title": "DEPT: Decoupled Embeddings for Pre-training Language Models",
      "abstract": "Language Model pre-training uses broad data mixtures to enhance performance across domains and languages. However, training on such heterogeneous text corpora requires extensive and expensive efforts. Since these data sources vary significantly in lexical, syntactic, and semantic aspects, they cause negative interference or the ``curse of multilinguality''. To address these challenges we propose a communication-efficient pre-training framework, DEPT. Our method decouples embeddings from the transformer body while simultaneously training the latter on multiple data sources without requiring a shared vocabulary. DEPT can: (1) train robustly and effectively under significant data heterogeneity, (2) minimize token embedding parameters to only what the data source vocabulary requires, while cutting communication costs in direct proportion to both the communication frequency and the reduction in parameters, (3) enhance transformer body plasticity and generalization, improving both average perplexity (up to 20%) and downstream task performance, and (4) enable training with custom optimized vocabularies per data source. We demonstrate DEPT's potential via the first vocabulary-agnostic federated pre-training of billion-scale models, reducing communication costs by orders of magnitude and embedding memory by 4-5x.",
      "keywords": [
        "Decentralized Training",
        "Federated Learning",
        "Multi-domain Training",
        "Multilingual Training"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "lydPkW4lfz",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "mscnV6JZkT",
      "title": "Distributed Gradient Descent with Many Local Steps in Overparameterized Models",
      "abstract": "In distributed training of machine learning models, gradient descent with local iterative steps is a very popular method, variants of which are commonly known as Local-SGD or the Federated Averaging (FedAvg). In this method, gradient steps based on local datasets are taken independently in distributed compute nodes to update  the local models, which are then aggregated intermittently. Although the existing convergence analysis suggests that with heterogeneous data, FedAvg encounters quick performance degradation as the number of local steps increases, it is shown to work quite well in practice, especially in the distributed training of large language models. In this work we try to explain this good performance from a viewpoint of implicit bias in Local Gradient Descent (Local-GD) with a large number of local steps. In overparameterized regime, the gradient descent at each compute node would lead the model to a specific direction locally. We characterize the dynamics of the aggregated global model and compare it to the centralized model trained with all of the data in one place. In particular, we analyze the implicit bias of gradient descent on linear models, for both regression and classification tasks. Our analysis shows that the aggregated global model  converges exactly to the centralized model for regression tasks, and converges (in direction) to the same feasible set as centralized model  for classification tasks. We further propose a Modified Local-GD with a refined aggregation and theoretically show it converges to the centralized model in direction for linear classification. We empirically verified our theoretical findings in linear models and also conducted experiments on distributed fine-tuning of pretrained neural networks to further apply our theory.",
      "keywords": [
        "Distributed Learning",
        "Overparameterization",
        "Optimization",
        "Federated Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "sbG8qhMjkZ",
      "title": "Improved Finite-Particle Convergence Rates for Stein Variational Gradient Descent",
      "abstract": "We provide finite-particle convergence rates for the Stein Variational Gradient Descent (SVGD) algorithm in the Kernelized Stein Discrepancy ($\\KSD$) and Wasserstein-2 metrics. Our key insight is that the time derivative of the relative entropy between the joint density of $N$ particle locations and the $N$-fold product target measure, starting from a regular initial distribution, splits into a dominant 'negative part' proportional to $N$ times the expected $\\KSD^2$ and a smaller 'positive part'. This observation leads to $\\KSD$ rates of order $1/\\sqrt{N}$, in both continuous and discrete time, providing a near optimal (in the sense of matching the corresponding i.i.d. rates) double exponential improvement over the recent result by~\\cite{shi2024finite}. Under mild assumptions on the kernel and potential, these bounds also grow polynomially in the dimension $d$. By adding a bilinear component to the kernel, the above approach is used to further obtain Wasserstein-2 convergence in continuous time. For the case of `bilinear + Mat\\'ern' kernels, we derive Wasserstein-2 rates that exhibit a curse-of-dimensionality similar to the i.i.d. setting. We also obtain marginal convergence and long-time propagation of chaos results for the time-averaged particle laws.",
      "keywords": [
        "Stein Variational Gradient Descent",
        "Non-asymptotic Rates",
        "Variational Inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "GEBkyKZOc4",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "jwGPmIqE99",
      "title": "STRIDE: A Tool-Assisted LLM Agent Framework for Strategic and Interactive Decision-Making",
      "abstract": "Large Language Models (LLMs) have revolutionized natural language processing, showing remarkable linguistic proficiency and reasoning capabilities. However, their application in strategic multi-agent decision-making environments is hampered by significant limitations including poor mathematical reasoning, difficulty in following instructions, and a tendency to generate incorrect information. These deficiencies hinder their performance in strategic and interactive tasks that demand adherence to nuanced game rules, long-term planning, exploration in unknown environments, and anticipation of opponents' moves. To overcome these obstacles, this paper presents a novel LLM agent framework equipped with memory and specialized tools to enhance their strategic decision-making capabilities. We deploy the tools in a number of economically important environments, in particular bilateral bargaining and multi-agent and dynamic mechanism design. We employ quantitative metrics to assess the framework's performance in various strategic decision-making problems. Our findings establish that our enhanced framework significantly improves the strategic decision-making capability of LLMs. While we highlight the inherent limitations of current LLM models, we demonstrate the improvements through targeted enhancements, suggesting a promising direction for future developments in LLM applications for interactive environments.",
      "keywords": [
        "LLM Agent",
        "Strategic Decision Making",
        "Markov Decision Making Process"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "mMPMHWOdOy",
      "title": "WizardMath: Empowering Mathematical Reasoning for Large Language Models via Reinforced Evol-Instruct",
      "abstract": "Large language models (LLMs), such as GPT-4, have shown remarkable performance in natural language processing (NLP) tasks, including challenging mathematical reasoning. However, most existing open-source models are only pre-trained on large-scale internet data and without math-related optimization. In this paper, we present WizardMath, which enhances the mathematical reasoning abilities of LLMs, by applying our proposed Reinforcement Learning from Evol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive experiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we reveal the extraordinary capabilities of our model. Remarkably, WizardMath-Mistral 7B surpasses all other open-source LLMs by a substantial margin. Furthermore, WizardMath 70B even outperforms ChatGPT-3.5, Claude Instant, Gemini Pro and Mistral Medium. Additionally, our preliminary exploration highlights the pivotal role of instruction evolution and process supervision in achieving exceptional math performance.",
      "keywords": [
        "Mathematical Reasoning",
        "Evol-Instruct",
        "Reinforcement Learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "6HcnC3pPkp",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "4FWAwZtd2n",
      "title": "Scaling LLM Test-Time Compute Optimally Can be More Effective than Scaling Parameters for Reasoning",
      "abstract": "Enabling LLMs to improve their outputs by using more test-time compute is a critical step towards building self-improving agents that can operate on open-ended natural language. In this paper, we scale up inference-time computation in LLMs, with a focus on answering: if an LLM is allowed to use a fixed but non-trivial amount of inference-time compute, how much can it improve its performance on a challenging prompt? Answering this question has implications not only on performance, but also on the future of LLM pretraining and how to tradeoff inference-time and pre-training compute. Little research has attempted to understand the scaling behaviors of test-time inference methods, with current work largely providing negative results for a number of these strategies. In this work, we analyze two primary mechanisms to scale test-time computation: (1) searching against dense, process-based verifier reward models (PRMs); and (2) updating the model's distribution over a response adaptively, given the prompt at test time. We find that in both cases, the effectiveness of different approaches to scaling test-time compute critically varies depending on the difficulty of the prompt. This observation motivates applying a \"compute-optimal\" scaling strategy, which acts to, as effectively as possible, allocate test-time compute per prompt in an adaptive manner. Using this compute-optimal strategy, we can improve the efficiency of test-time compute scaling for math reasoning problems by more than 4x compared to a best-of-N baseline. Additionally, in a FLOPs-matched evaluation, we find that on problems where a smaller base model attains somewhat non-trivial success rates, test-time compute  can be used to outperform a 14x larger model.",
      "keywords": [
        "test-time compute",
        "LLMs",
        "scaling",
        "language models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "7tOc6h8bea",
      "title": "Adaptive Inference-Time Compute: LLMs Can Predict if They Can Do Better, Even Mid-Generation",
      "abstract": "Inference-time computation is a powerful paradigm to enhance the performance of large language models (LLMs), with Best-of-N sampling being a widely used technique. However, this method is computationally expensive, requiring both (1) an external reward model and (2) the generation of multiple samples. In this work, we introduce a new generative self-evaluation scheme designed to adaptively reduce the number of generated samples while maintaining or even improving performance. We use a generative reward model formulation, allowing the LLM to predict mid-generation the probability that restarting the generation will yield a better response. These predictions are obtained without an external reward model and can be used to decide whether or not to generate more samples, prune unpromising samples early on, or to pick the best sample. This capability is very inexpensive as it involves generating a single predefined token. Trained using a dataset constructed with real unfiltered LMSYS user prompts, Llama 3.1 8B's win rate against GPT-4 on AlpacaEval increases from 21\\% to 34\\% with 16 samples and math performance on GSM8K improves from 84\\% to 91\\%. By sampling only when the LLM determines that it is beneficial to do so and adaptively adjusting temperature annealing, we demonstrate that 74\\% of the improvement from using 16 samples can be achieved with only 1.2 samples on average. We further demonstrate that 50–75\\% of samples can be pruned early in generation with minimal degradation in performance. Overall, our methods enable more efficient and scalable compute utilization during inference for LLMs.",
      "keywords": [
        "LLMs",
        "inference-time",
        "inference-time efficiency",
        "Best-of-N",
        "self-evaluation"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "s0JVsx3bx1",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "s0JVsx3bx1",
      "title": "1000 Layer Networks for Self-Supervised RL: Scaling Depth Can Enable New Goal-Reaching Capabilities",
      "abstract": "Scaling up self-supervised learning has driven breakthroughs in language and vision, yet comparable progress has remained elusive in reinforcement learning (RL). In this paper, we study building blocks for self-supervised RL that unlock substantial improvements in scalability, with network depth serving as a critical factor. Whereas most RL papers in recent years have relied on shallow architectures (around 2 -- 5 layers), we demonstrate that increasing the depth up to 1024 layers can significantly boost performance.\nOur experiments are conducted in an unsupervised goal-conditioned setting, where no demonstrations or rewards are provided, so an agent must explore (from scratch) and learn how to maximize the likelihood of reaching commanded goals.\nEvaluated on simulated locomotion and manipulation tasks, our approach increases performance on the self-supervised contrastive RL algorithm by $2\\times$ -- $50\\times$, outperforming other goal-conditioned baselines.\nIncreasing the model depth not only increases success rates but also qualitatively changes the behaviors learned.",
      "keywords": [
        "Reinforcement Learning",
        "Self-Supervised Learning",
        "Contrastive RL",
        "Goal-conditioned RL",
        "Scaling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "tESKKiKhVp",
      "title": "Evolutionary Distributed Training",
      "abstract": "We introduce Evolutionary Distributed Training (EDT), a nature-inspired approach to distributed model training. EDT replaces centralized gradient synchronization with evaluation, pairwise model crossover, and mutation, enabling communication-efficient training across loosely connected devices. While early investigations show limited effectiveness in language model pretraining, EDT demonstrates strong potential in reinforcement learning (RL). In complex multi-agent environments, EDT facilitates diverse reward exploration and emergent strategies by evolving both policy and reward functions, outperforming traditional training in adaptability and strategic diversity. We also hypothesize EDT as a promising framework for post-training and alignment, offering optimization towards multi-objective, non-differentiable goals. This work positions EDT as a scalable, evolutionary recipe for distributed learning, offering early insights into where it may best fit within the deep learning landscape.",
      "keywords": [
        "Large Language Models",
        "Distributed Training",
        "Evolutionary Algorithms",
        "Reinforcement Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "eafIjoZAHm",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "7pEVq8yN3U",
      "title": "PropMEND: Hypernetworks for Knowledge Propagation in LLMs",
      "abstract": "Knowledge editing techniques for large language models (LLMs) can inject knowledge that is later reproducible verbatim, but they fall short on *propagating* that knowledge: models cannot answer questions that require them to reason with the injected knowledge. We present a hypernetwork-based approach for knowledge propagation, where we meta-learn how to modify gradients of a language modeling loss to encourage injected information to propagate. Our approach, PropMEND, extends the meta-objective of MEND so that gradient updates on a piece of knowledge are transformed to allow answering of multi-hop questions involving that knowledge.\nOn the RippleEdit dataset, our method significantly improves performance on propagation questions whose answers are not explicitly stated in the injected fact, in contrast to existing methods that only improve on propagation questions where the answer can be copied verbatim.\nTo study the extent of generalization that our propagation achieves, we construct StoryPropagation, a controlled dataset focusing on entities and relations that the model already understands well. We find that PropMEND generalizes effectively to partially unseen entity-relation pairs, indicating the effectiveness of our meta-trained hypernetwork for knowledge propagation.",
      "keywords": [
        "Knowledge Editing",
        "Knowledge Propagation",
        "Entity",
        "Large Language Model"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eafIjoZAHm",
      "title": "GnnXemplar: Exemplars to Explanations - Natural Language Rules for Global GNN Interpretability",
      "abstract": "Graph Neural Networks (GNNs) are widely used for node classification, yet their opaque decision-making limits trust and adoption. While local explanations offer insights into individual predictions, global explanation methods—those that characterize an entire class—remain underdeveloped. Existing global explainers rely on motif discovery in small graphs, an approach that breaks down in large, real-world settings where subgraph repetition is rare, node attributes are high-dimensional, and predictions arise from complex structure-attribute interactions. We propose GnnXemplar, a novel global explainer inspired from Exemplar Theory from cognitive science. GnnXemplar identifies representative nodes in the GNN embedding space—exemplars—and explains predictions using natural language rules derived from their neighborhoods. Exemplar selection is framed as a coverage maximization problem over reverse $k$-nearest neighbors, for which we provide an efficient greedy approximation. To derive interpretable rules, we employ a self-refining prompt strategy using large language models (LLMs). Experiments across diverse benchmarks show that GnnXemplar significantly outperforms existing methods in fidelity, scalability, and human interpretability, as validated by a user study with 60 participants.",
      "keywords": [
        "graph neural network",
        "graph machine learning",
        "explainability",
        "xai",
        "global explanation",
        "text-based explanation",
        "exemplar",
        "exemplar theory"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "4xvE6Iy77Y",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "4xvE6Iy77Y",
      "title": "PRIMT: Preference-based Reinforcement Learning with Multimodal Feedback and Trajectory Synthesis from Foundation Models",
      "abstract": "Preference-based reinforcement learning (PbRL) has emerged as a promising paradigm for teaching robots complex behaviors without reward engineering. However, its effectiveness is often limited by two critical challenges: the reliance on extensive human input and the inherent difficulties in resolving query ambiguity and credit assignment during reward learning. In this paper, we introduce PRIMT, a PbRL framework designed to overcome these challenges by leveraging foundation models (FMs) for multimodal synthetic feedback and trajectory synthesis. Unlike prior approaches that rely on single-modality FM evaluations, PRIMT employs a hierarchical neuro-symbolic fusion strategy, integrating the complementary strengths of vision-language models (VLMs) and large language models (LLMs) in evaluating robot behaviors for more reliable and comprehensive feedback. PRIMT also incorporates foresight trajectory generation to warm-start the trajectory buffer with bootstrapped samples, reducing early-stage query ambiguity, and hindsight trajectory augmentation for counterfactual reasoning with a causal auxiliary loss to improve credit assignment. We evaluate PRIMT on 2 locomotion and 6 manipulation tasks on various benchmarks, demonstrating superior performance over FM-based and scripted baselines. Website at https://primt25.github.io/.",
      "keywords": [
        "Preference-based Reinforcement Learning",
        "Foundation Models for Robotics",
        "Neuro-Symbolic Fusion",
        "Multimodal Feedback",
        "Causal Inference",
        "Trajectory Synthesis",
        "Robot Manipulation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vMfJM9oBYL",
      "title": "Learning from Preferences and Mixed Demonstrations in General Settings",
      "abstract": "Reinforcement learning is a general method for learning in sequential settings, but it can often be difficult to specify a good reward function when the task is complex.\nIn these cases, preference feedback or expert demonstrations can be used instead.\nHowever, existing approaches utilising both together are either ad-hoc or rely on domain-specific properties.\nBuilding upon previous work, we develop a mathematical framework for learning from human data and based on this we introduce LEOPARD: Learning Estimated Objectives from Preferences And Ranked Demonstrations.\nLEOPARD can simultaneously learn from a broad range of data, including negative/failed demonstrations, to effectively learn reward functions in general domains.\nIt does this by modelling the human feedback as reward-rational partial orderings over available trajectories.\nWe find that when a limited amount of preference and demonstration feedback is available, LEOPARD outperforms baselines by a significant margin.\nFurthermore, we use LEOPARD to investigate learning from many types of feedback compared to just a single one, and find that a combination of feedback types is often beneficial.",
      "keywords": [
        "reinforcement learning",
        "rl",
        "human feedback",
        "rlhf",
        "modelling",
        "preferences",
        "demonstrations",
        "rankings",
        "machine learning",
        "reward learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "F0JzotXYgC",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "F0JzotXYgC",
      "title": "Spectral Perturbation Bounds for Low-Rank Approximation with Applications to Privacy",
      "abstract": "A central challenge in machine learning is to understand how noise or measurement errors affect low-rank approximations, particularly in the spectral norm. This question is especially important in differentially private low-rank approximation, where one aims to preserve the top-$p$ structure of a data-derived matrix while ensuring privacy. Prior work often analyzes Frobenius norm error or changes in reconstruction quality, but these metrics can over- or under-estimate true subspace distortion. The spectral norm, by contrast, captures worst-case directional error and provides the strongest utility guarantees. We establish new high-probability spectral-norm perturbation bounds for symmetric matrices that refine the classical Eckart--Young--Mirsky theorem and explicitly capture interactions between a matrix $A \\in \\mathbb{R}^{n \\times n}$ and an arbitrary symmetric perturbation $E$. Under mild eigengap and norm conditions, our bounds yield sharp estimates for $\\| (A + E)_p - A_p \\|$, where $A_p$ is the best rank-$p$ approximation of $A$, with improvements of up to a factor of $\\sqrt{n}$. As an application, we derive improved utility guarantees for differentially private PCA, resolving an open problem in the literature. Our analysis relies on a novel contour bootstrapping method from complex analysis and extends it to a broad class of spectral functionals, including polynomials and matrix exponentials. Empirical results on real-world datasets confirm that our bounds closely track the actual spectral error under diverse perturbation regimes.",
      "keywords": [
        "Spectral norm",
        "low-rank approximation",
        "differentially private PCA",
        "contour integration",
        "matrix analysis"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "kXdW2KySK5",
      "title": "Variance-Dependent Regret Lower Bounds for Contextual Bandits",
      "abstract": "Variance-dependent regret bounds for linear contextual bandits, which improve upon the classical $\\tilde{O}(d\\sqrt{K})$ regret bound to $\\tilde{O}(d\\sqrt{\\sum_{k=1}^K\\sigma_k^2})$, where $d$ is the context dimension, $K$ is the number of rounds, and $\\sigma^2_k$ is the noise variance in round $k$, has been widely studied in recent years. However, most existing works focus on the regret upper bounds instead of lower bounds. To our knowledge, the only lower bound is from Jia et al. (2024), which proved that for any eluder dimension $d_{\\textbf{elu}}$ and total variance budget $\\Lambda$, there exists an instance with $\\sum_{k=1}^K\\sigma_k^2\\leq \\Lambda$ for which  any algorithm incurs a variance-dependent lower bound of $\\Omega(\\sqrt{d_{\\textbf{elu}}\\Lambda})$. However, this lower bound has a $\\sqrt{d}$ gap with existing upper bounds. Moreover, it only considers a fixed total variance budget $\\Lambda$ and does not apply to a general variance sequence $\\{\\sigma_1^2,\\ldots,\\sigma_K^2\\}$.\nIn this paper, to overcome the limitations of Jia et al. (2024), we consider the general variance sequence under two settings. For a prefixed sequence, where the entire variance sequence is revealed to the learner at the beginning of the learning process, we establish a variance-dependent lower bound of $\\Omega(d \\sqrt{\\sum_{k=1}^K\\sigma_k^2 }/\\log K)$ for linear contextual bandits. For an adaptive sequence, where an adversary can generate the variance $\\sigma_k^2$ in each round $k$ based on historical observations, we show that when the adversary must generate $\\sigma_k^2$ before observing the decision set, a similar lower bound of $\\Omega(d\\sqrt{ \\sum_{k=1}^K\\sigma_k^2} /\\log^6(dK))$ holds. In both settings, our results match the upper bounds of the SAVE algorithm (Zhao et al. 2023) up to logarithmic factors. Furthermore, if the adversary can generate the variance $\\sigma_k$ after observing the decision set $\\mathcal{D}_k$, we construct a counter-example showing that it is impossible to construct a variance-dependent lower bound if the adversary properly selects variances in collaboration with the learner.\nOur lower bound proofs use a novel peeling technique that groups rounds by variance magnitude. For each group, we construct separate instances and assign the learner distinct decision sets. We believe this proof technique may be of independent interest.",
      "keywords": [
        "Bandit",
        "Reinforcement Learning"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "QN0E0KX2LM",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "QN0E0KX2LM",
      "title": "Learning Linear Attention in Polynomial Time",
      "abstract": "Previous research has explored the expressivity of Transformer models in simulating Boolean circuits or Turing machines. However, the efficient learnability of Transformers from data has remained an open question.  Our study addresses this gap by providing the first polynomial-time learnability results (specifically strong, agnostic PAC learning) for single-layer Transformers with linear attention.  We show that learning the optimal multi head linear attention can be recast as finding the optimal kernel predictor in a suitably defined RKHS.  Moving to generalization, we construct an algorithm that, given a dataset, checks in polynomial time whether the set of best fit multi head linear attention networks on this data all perform an identical computation--a powerful notion for out of distribution generalization.  We empirically validate our theoretical findings on several canonical tasks: learning random linear attention networks, key--value associations, and learning to execute finite automata. Our findings bridge a critical gap between theoretical expressivity and learnability of Transformer models.",
      "keywords": [
        "Transformers",
        "Learning Theory",
        "PAC learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "wh3p37VYm2",
      "title": "Mechanistic Insights into Grokking from the Embedding Layer",
      "abstract": "Grokking, a delayed generalization in neural networks after perfect training performance, has been observed in Transformers and MLPs, but the components driving it remain underexplored. We show that embeddings are central to grokking: introducing them into MLPs induces delayed generalization in modular arithmetic tasks, whereas MLPs without embeddings can generalize immediately. Our analysis identifies two key mechanisms: (1) Embedding update dynamics, where rare tokens stagnate due to sparse gradient updates and weight decay, and (2) Bilinear coupling, where the interaction between embeddings and downstream weights introduces saddle points and increases sensitivity to initialization.  \nTo confirm these mechanisms, we investigate frequency-aware sampling, which balances token updates by minimizing gradient variance, and embedding-specific learning rates, derived from the asymmetric curvature of the bilinear loss landscape. We prove that an adaptive learning rate ratio, \\(\\frac{\\eta_E}{\\eta_W} \\propto \\frac{\\sigma_{\\max}(E)}{\\sigma_{\\max}(W)} \\cdot \\frac{f_W}{f_E}\\), mitigates bilinear coupling effects, accelerating convergence. Our methods not only improve grokking dynamics but also extend to broader challenges in Transformer optimization, where bilinear interactions hinder efficient training.",
      "keywords": [
        "Embedding learning",
        "Token frequencey",
        "Coupled system"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "JbJVWljk7r",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "1b7whO4SfY",
      "title": "Gated Attention for Large Language Models: Non-linearity, Sparsity, and Attention-Sink-Free",
      "abstract": "Gating mechanisms have been widely utilized, from early models like LSTMs and Highway Networks to recent state space models, linear attention, and also softmax attention.\nYet, existing literature rarely examines the specific effects of gating.\nIn this work, we conduct comprehensive experiments to systematically investigate gating-augmented softmax attention variants.\nSpecifically, we perform a comprehensive comparison over 30 variants of 15B Mixture-of-Experts (MoE) models and 1.7B dense models trained on a 3.5 trillion token dataset.\nOur central finding is that a simple modification—applying a head-specific sigmoid gate after the Scaled Dot-Product Attention (SDPA)—consistently improves performance.\nThis modification also enhances training stability, tolerates larger learning rates, and improves scaling properties.\nBy comparing various gating positions and computational variants, we attribute this effectiveness to two key factors: (1) introducing non-linearity upon the low-rank mapping in the softmax attention, and (2) applying query-dependent sparse gating scores to modulate the SDPA output.\nNotably, we find this sparse gating mechanism mitigates `massive activation`, `attention sink` and enhances long-context extrapolation performance. \nWe also release related codes (https://github.com/qiuzh20/gated_attention}) and models (https://huggingface.co/QwQZh/gated_attention) to facilitate future research.\nFurthermore, the most effective SDPA output gating is used in the Qwen3-Next models (https://huggingface.co/collections/Qwen/qwen3-next).",
      "keywords": [
        "Attention",
        "Large Language Model",
        "Gating"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "YtsX7irxbq",
      "title": "When recalling in-context, Transformers are not SSMs",
      "abstract": "Despite the advantageous subquadratic complexity of modern recurrent deep learning models -- such as state-space models (SSMs) -- recent studies have highlighted their potential shortcomings compared to transformers on reasoning and memorization tasks. In this paper, we dive deeper into one of such benchmarks: associative recall (AR), which has been shown to correlate well with language modeling performance, and inspect in detail the effects of scaling and optimization issues in recently proposed token mixing strategies. We first demonstrate that, unlike standard transformers, the choice of learning rate plays a critical role in the performance of modern recurrent models: an issue that can severely affect reported performance in previous works and suggests further research is needed to stabilize training. Next, we show that recurrent and attention-based models exhibit contrasting benefits when scaling in width as opposed to depth, with attention being notably unable to solve AR when limited to a single layer. We then further inspect 1-layer transformers, revealing that despite their poor performance, their training dynamics surprisingly resemble the formation of induction heads, a phenomenon previously observed only in their 2-layer counterparts. Finally, through architectural ablations, we study how components affects Transformer and Mamba’s performance and optimization stability.",
      "keywords": [
        "SSMs",
        "Attention",
        "In-Context Learning",
        "Language Modeling",
        "Mamba"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "sAFottNlra",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "GmKpnuMUFC",
      "title": "Flipping Against All Odds: Reducing LLM Coin Flip Bias via Verbalized Rejection Sampling",
      "abstract": "Large language models (LLMs) can often accurately describe probability distributions using natural language, yet they still struggle to generate faithful samples from them. This mismatch limits their use in tasks requiring reliable stochasticity, such as Monte Carlo methods, agent-based simulations, and randomized decision-making. We investigate this gap between knowledge and sampling in the context of Bernoulli distributions. We introduce Verbalized Rejection Sampling (VRS), a natural-language adaptation of classical rejection sampling that prompts the LLM to reason about and accept or reject proposed samples. Despite relying on the same Bernoulli mechanism internally, VRS substantially reduces sampling bias across models. We provide theoretical analysis showing that, under mild assumptions, VRS improves over direct sampling, with gains attributable to both the algorithm and prompt design. More broadly, our results show how classical probabilistic tools can be verbalized and embedded into LLM workflows to improve reliability, without requiring access to model internals or heavy prompt engineering.",
      "keywords": [
        "rejection sampling",
        "large language models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "Q3qAsZAEZw",
      "title": "Understanding and Mitigating Numerical Sources of Nondeterminism in LLM Inference",
      "abstract": "Large Language Models (LLMs) are now integral across various domains and have demonstrated impressive performance. Progress, however, rests on the premise that benchmark scores are both accurate and reproducible. We demonstrate that the reproducibility of LLM performance is fragile: changing system configuration, such as evaluation batch size, GPU count, and GPU version, can introduce significant differences in the generated responses. \nThis issue is especially pronounced in reasoning models, where minor rounding differences in early tokens can cascade into divergent chains of thought, ultimately affecting accuracy. For instance, under bfloat16 precision with greedy decoding, a reasoning model like DeepSeek-R1-Distill-Qwen-7B can exhibit up to 9\\% variation in accuracy and 9,000 tokens difference in response length due to differences in GPU count, type, and evaluation batch size.\nWe trace the root cause of this variability to the non-associative nature of floating-point arithmetic under limited numerical precision. \nThis work presents the first systematic investigation into how numerical precision affects reproducibility in LLM inference. Through carefully controlled experiments across various hardware, software, and precision settings, we quantify when and how model outputs diverge.\nOur analysis reveals that floating-point precision—while critical for reproducibility—is often neglected in evaluation practices.\nInspired by this, we develop a lightweight inference pipeline, dubbed LayerCast, that stores weights in 16-bit precision but performs all computations in FP32, balancing memory efficiency with numerical stability. Code is available at https://github.com/nanomaoli/llm_reproducibility.",
      "keywords": [
        "Large Language Models (LLMs)",
        "Reproducibility",
        "Numerical precision",
        "Deterministic inference"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "qXAABCxYQ2",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "ImpizBSKcu",
      "title": "Dynamical Decoupling of Generalization and Overfitting in Large Two-Layer Networks",
      "abstract": "Understanding the inductive bias and generalization properties of large overparametrized machine learning models requires to characterize the dynamics of the training algorithm.  We study the learning dynamics of large two-layer neural networks via dynamical mean field theory, a well established technique of non-equilibrium statistical physics. We show that, for large network width $m$,\nand large number of samples per input dimension $n/d$, the training dynamics exhibits a separation of timescales which implies:\n$(i)$ The emergence of a slow time scale associated with the growth in Gaussian/Rademacher complexity of the network;\n$(ii)$ Inductive bias towards small complexity if the initialization has small enough complexity;\n$(iii)$ A dynamical decoupling between feature learning and overfitting regimes; $(iv)$ A non-monotone behavior of the test error, associated  `feature unlearning' regime at large times.",
      "keywords": [
        "Overfitting; feature learning; dynamical mean field theory; generalization;"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "vWaMUMrBpF",
      "title": "Inconsistency-Aware Minimization: Improving Generalization with Unlabeled Data",
      "abstract": "Accurately estimating the generalization gap and devising optimization methods that generalize better are crucial for deep learning models, particularly in both theoretical understanding and practical applications. The ability to leverage unlabeled data for these purposes offers significant advantages in real-world scenarios. This paper introduces a novel generalization measure, termed $\\textit{local inconsistency}$, developed from an information-geometric perspective of the neural network's parameter space; a key feature is its computability from unlabeled data. We establish its theoretical underpinnings by connecting local inconsistency to the Fisher Information Matrix (FIM) and the loss Hessian. Empirically, we demonstrate that local inconsistency not only correlates with the generalization gap but also exhibits characteristics comparable to $\\textit{sharpness}$. Based on these findings, we propose Inconsistency-Aware Minimization (IAM), a regularization strategy that incorporates local inconsistency. We demonstrate that in standard supervised learning settings, IAM enhances generalization, achieving performance comparable to existing methods such as Sharpness-Aware Minimization (SAM). Furthermore, IAM exhibits notable efficacy in semi-supervised learning scenarios, where the local inconsistency regularizer is computed from the unlabeled data portion to further improve model performance.",
      "keywords": [
        "Generalization",
        "Regularization",
        "Training Method",
        "Deep Learning",
        "Inconsistency"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "2rgYVFiWPL",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "2rgYVFiWPL",
      "title": "Sample complexity of Schrödinger potential estimation",
      "abstract": "We address the problem of Schrödinger potential estimation, which plays a crucial role in modern generative modelling approaches based on Schrödinger bridges and stochastic optimal control for SDEs. Given a simple prior diffusion process, these methods search for a path between two given distributions $\\rho_0$  and $\\rho_T$ requiring minimal efforts. The optimal drift in this case can be expressed through a Schrödinger potential. In the present paper, we study generalization ability of an empirical Kullback-Leibler (KL) risk minimizer over a class of admissible log-potentials aimed at fitting the marginal distribution at time $T$. Under reasonable assumptions on the target distribution $\\rho_T$ and the prior process, we derive a non-asymptotic high-probability upper bound on the KL-divergence between $\\rho_T$ and the terminal density corresponding to the estimated log-potential. In particular, we show that the excess KL-risk may decrease as fast as $\\mathcal O(\\log n / n)$ when the sample size $n$ tends to infinity even if both $\\rho_0$  and $\\rho_T$ have unbounded supports.",
      "keywords": [
        "Schrödinger bridge",
        "stochastic optimal control",
        "Schrödinger potential",
        "high-probability bounds",
        "excess risk"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "rMhQBlhh4c",
      "title": "Adjoint Schrödinger Bridge Sampler",
      "abstract": "Computational methods for learning to sample from the Boltzmann distribution—where the target distribution is known only up to an unnormalized energy function—have advanced significantly recently. Due to the lack of explicit target samples, however, prior diffusion-based methods, known as _diffusion samplers_, often require importance-weighted estimation or complicated learning processes. Both trade off scalability with extensive evaluations of the energy and model, thereby limiting their practical usage. In this work, we propose **Adjoint Schrödinger Bridge Sampler (ASBS)**, a new diffusion sampler that employs simple and scalable matching-based objectives yet without the need to estimate target samples during training. ASBS is grounded on a mathematical model—the Schrödinger Bridge—which enhances sampling efficiency via kinetic-optimal transportation. Through a new lens of stochastic optimal control theory, we demonstrate how SB-based diffusion samplers can be learned at scale via Adjoint Matching and prove convergence to the global solution. Notably, ASBS generalizes the recent Adjoint Sampling (Havens et al., 2025) to arbitrary source distributions by relaxing the so-called memoryless condition that largely restricts the design space. Through extensive experiments, we demonstrate the effectiveness of ASBS on sampling from classical energy functions, amortized conformer generation, and molecular Boltzmann distributions. Codes are available at https://github.com/facebookresearch/adjoint_samplers",
      "keywords": [
        "Boltzmann distribution",
        "diffusion sampler",
        "Schrödinger bridge"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "xZnjIkIzST",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "aLhA7AYLLR",
      "title": "ControlFusion: A Controllable Image Fusion Network with Language-Vision Degradation Prompts",
      "abstract": "Current image fusion methods struggle with real-world composite degradations and lack the flexibility to accommodate user-specific needs. To address this, we propose ControlFusion, a controllable fusion network guided by language-vision prompts that adaptively mitigates composite degradations. On the one hand, we construct a degraded imaging model based on physical mechanisms, such as the Retinex theory and atmospheric scattering principle, to simulate composite degradations and provide a data foundation for addressing realistic degradations. On the other hand, we devise a prompt-modulated restoration and fusion network that dynamically enhances features according to degradation prompts, enabling adaptability to varying degradation levels. To support user-specific preferences in visual quality, a text encoder is incorporated to embed user-defined degradation types and levels as degradation prompts. Moreover, a spatial-frequency collaborative visual adapter is designed to autonomously perceive degradations from source images, thereby reducing complete reliance on user instructions. Extensive experiments demonstrate that ControlFusion outperforms SOTA fusion methods in fusion quality and degradation handling, particularly under real-world and compound degradations.",
      "keywords": [
        "Image fusion",
        "multimodal images",
        "degradation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "xZnjIkIzST",
      "title": "Restore3D: Breathing Life into Broken Objects with Shape and Texture Restoration",
      "abstract": "Restoring incomplete or damaged 3D objects is crucial for cultural heritage preservation, occluded object reconstruction, and artistic design.\nExisting methods primarily focus on geometric completion, often neglecting texture restoration and struggling with relatively complex and diverse objects.\nWe introduce Restore3D, a novel framework that simultaneously restores both the shape and texture of broken objects using multi-view images. To address limited training data, we develop an automated data generation pipeline that synthesizes paired incomplete-complete samples from large-scale 3D datasets. \nCentral to Restore3D is a multi-view model, enhanced by a carefully designed Mask Self-Perceiver module with a Depth-Aware Mask Rectifier.\nThe rectified masks, learned through the self-perceiver, facilitate an image integration and enhancement phase that preserves shape and texture patterns of incomplete objects and mitigates the low-resolution limitations of the base model, yielding high-resolution, semantically coherent, and view-consistent multi-view images. \nA coarse-to-fine reconstruction strategy is then employed to recover detailed textured 3D meshes from refined multi-view images. Comprehensive experiments show that Restore3D produces visually and geometrically faithful 3D textured meshes, outperforming existing methods and paving the way for more robust 3D object restoration. Project page: https://nip-ss.github.io/NIPS-anonymous/ .",
      "keywords": [
        "Diffusion Models",
        "3D object completion",
        "Multi-view Image Generation",
        "Multi-view Image Inpainting"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "AYcKh0oT3h",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "AYcKh0oT3h",
      "title": "Online Convex Optimization with Heavy Tails: Old Algorithms, New Regrets, and Applications",
      "abstract": "In Online Convex Optimization (OCO), when the stochastic gradient has a finite variance, many algorithms provably work and guarantee a sublinear regret. However, limited results are known if the gradient estimate has a heavy tail, i.e., the stochastic gradient only admits a finite $\\mathsf{p}$-th central moment for some $\\mathsf{p}\\in\\left(1,2\\right]$. Motivated by it, this work examines different old algorithms for OCO (e.g., Online Gradient Descent) in the more challenging heavy-tailed setting. Under the standard bounded domain assumption, we establish new regrets for these classical methods without any algorithmic modification. Remarkably, these regret bounds are fully optimal in all parameters (can be achieved even without knowing $\\mathsf{p}$), suggesting that OCO with heavy tails can be solved effectively without any extra operation (e.g., gradient clipping). Our new results have several applications. A particularly interesting one is the first provable convergence result for nonsmooth nonconvex optimization under heavy-tailed noise without gradient clipping.",
      "keywords": [
        "Online Learning",
        "Online Convex Optimization",
        "Heavy Tails"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "UVDihUz0iT",
      "title": "High-Dimensional Calibration from Swap Regret",
      "abstract": "We study the online calibration of multi-dimensional forecasts over an arbitrary convex set $\\mathcal{P} \\subset \\mathbb{R}^d$ relative to an arbitrary norm $\\Vert\\cdot\\Vert$. We connect this with the problem of external regret minimization for online linear optimization, showing that if it is possible to guarantee $O(\\sqrt{\\rho T})$ worst-case regret after $T$ rounds when actions are drawn from $\\mathcal{P}$ and losses are drawn from the dual $\\Vert \\cdot \\Vert_*$ unit norm ball, then it is also possible to obtain $\\epsilon$-calibrated forecasts after $T = \\exp(O(\\rho /\\epsilon^2))$ rounds. When $\\mathcal{P}$ is the $d$-dimensional simplex and $\\Vert \\cdot \\Vert$ is the $\\ell_1$-norm, the existence of $O(\\sqrt{T\\log d})$ algorithms for learning with experts implies that it is possible to obtain $\\epsilon$-calibrated forecasts after $T = \\exp(O(\\log{d}/\\epsilon^2)) = d^{O(1/\\epsilon^2)}$ rounds, recovering a recent result of Peng 2025.\n\nInterestingly, our algorithm obtains this guarantee without requiring access to any online linear optimization subroutine or knowledge of the optimal rate $\\rho$ -- in fact, our algorithm is identical for every setting of $\\mathcal{P}$ and $\\Vert \\cdot \\Vert$. Instead, we show that the optimal regularizer for the above OLO problem can be used to upper bound the above calibration error by a swap regret, which we then minimize by running the recent TreeSwap algorithm with Follow-The-Leader as a subroutine. The resulting algorithm is highly efficient and plays a distribution over simple averages of past observations in each round.\n\nFinally, we prove that any online calibration algorithm that guarantees $\\epsilon T$ $\\ell_1$-calibration error over the $d$-dimensional simplex requires $T \\geq \\exp(\\mathrm{poly}(1/\\epsilon))$ (assuming $d \\geq \\mathrm{poly}(1/\\epsilon)$). This strengthens the corresponding $d^{\\Omega(\\log{1/\\epsilon})}$ lower bound of Peng 2025, and shows that an exponential dependence on $1/\\epsilon$ is necessary.",
      "keywords": [
        "Calibration",
        "Swap Regret",
        "Online Learning"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "YqzAsStE6n",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "YqzAsStE6n",
      "title": "Linear Bandits with Non-i.i.d. Noise",
      "abstract": "We study the linear stochastic bandit problem, relaxing the standard i.i.d. assumption on the observation noise. \nAs an alternative to this restrictive assumption, we allow the noise terms across rounds to be sub-Gaussian but \ninterdependent, with dependencies that decay over time. To address this setting, we develop new confidence sequences \nusing a recently introduced reduction scheme to sequential probability assignment, and use these to derive a bandit \nalgorithm based on the principle of optimism in the face of uncertainty. We provide regret bounds for the \nresulting algorithm, expressed in terms of the decay rate of the strength of dependence between observations. Among \nother results, we show that our bounds recover the standard rates up to a factor of the mixing time for geometrically \nmixing observation noise.",
      "keywords": [
        "linear bandits",
        "non-i.i.d",
        "online learning"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "gL4muAFwsh",
      "title": "Does Stochastic Gradient really succeed for bandits?",
      "abstract": "Recent works of Mei et al. (2023, 2024) have deepened the theoretical understanding of the *Stochastic Gradient Bandit* (SGB) policy, showing that using a constant learning rate guarantees asymptotic convergence to the optimal policy, and that sufficiently *small* learning rates can yield logarithmic regret. However, whether logarithmic regret holds beyond small learning rates remains unclear. In this work, we take a step towards characterizing the regret *regimes* of SGB as a function of its learning rate. For two--armed bandits, we identify a sharp threshold, scaling with the sub-optimality gap $\\Delta$, below which SGB achieves *logarithmic* regret on all instances, and above which it can incur *polynomial* regret on some instances. \nThis result highlights the necessity of knowing (or estimating) $\\Delta$ to ensure logarithmic regret with a constant learning rate.\nFor general $K$-armed bandits, we further show the learning rate must scale inversely with $K$ to avoid polynomial regret. We introduce novel techniques to derive regret upper bounds for SGB, laying the groundwork for future advances in the theory of gradient-based bandit algorithms.",
      "keywords": [
        "bandits",
        "policy gradient"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "jTaxGFy34h",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "VYLdKb5dzO",
      "title": "Tighter CMI-Based Generalization Bounds via Stochastic Projection and Quantization",
      "abstract": "In this paper, we leverage stochastic projection and lossy compression to establish new conditional mutual information (CMI) bounds on the generalization error of statistical learning algorithms. It is shown that these bounds are generally tighter than the existing ones. In particular, we prove that for certain problem instances for which existing MI and CMI bounds were recently shown in Attias et al. [2024] and Livni [2023] to become vacuous or fail to describe the right generalization behavior, our bounds yield suitable generalization guarantees of the order of $\\mathcal{O}(1/\\sqrt{n})$, where $n$ is the size of the training dataset. Furthermore, we use our bounds to investigate the problem of data \"memorization\" raised in those works, and which asserts that there are learning problem instances for which any learning algorithm that has good prediction there exist distributions under which the algorithm must \"memorize'' a big fraction of the training dataset. We show that for every learning algorithm, there exists an auxiliary algorithm that does not memorize and which yields comparable generalization error for any data distribution. In part, this shows that memorization is not necessary for good generalization.",
      "keywords": [
        "generalization error",
        "information theory",
        "conditional mutual information",
        "CMI",
        "learning theory",
        "projection",
        "memorization"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "jTaxGFy34h",
      "title": "Robust Wasserstein  $k$-center Clustering: Algorithms and Acceleration",
      "abstract": "The classical metric $k$-center problem is widely used in data representation tasks. However, real-world datasets often contain noise and exhibit complex structures, making the traditional metric $k$-center problem insufficient for such scenarios. To address these challenges, we present the \\textbf{R}obust \\textbf{W}asserstein \\textbf{C}enter clustering (RWC-clustering)  problem.\nCompared to the classical setting, the main challenge in designing an algorithm for the RWC-clustering problem lies in effectively handling noise in the cluster centers. To this end, we introduce a dedicated purification step to eliminate noise, based on which we develop our clustering algorithm.\nFurthermore, when dealing with large-scale datasets, both storage and computation become highly resource-intensive. To alleviate this, we adopt the \\textit{coreset} technique to improve the computational and storage efficiency by compressing the dataset.  \nRoughly speaking, this coreset method enables us to calculate the objective value on a small-size coreset, while ensuring a close approximation to the value on the original dataset in theory; thus, it substantially saves the storage and computation resources.  \nFinally, experimental results show the effectiveness of our RWC-clustering  problem and the efficiency of the coreset method.",
      "keywords": [
        "clustering; coreset; Wasserstein distance"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "F20AfNqMq9",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "F20AfNqMq9",
      "title": "Deep Active Inference Agents for Delayed and Long-Horizon Environments",
      "abstract": "With the recent success of world-model agents—which extend the core idea of model-based reinforcement learning by learning a differentiable model for sample-efficient control across diverse tasks—active inference (AIF) offers a complementary, neuroscience-grounded paradigm that unifies perception, learning, and action within a single probabilistic framework powered by a generative model. Despite this promise, practical AIF agents still rely on accurate immediate predictions and exhaustive planning, a limitation that is exacerbated in delayed environments requiring plans over long horizons—tens to hundreds of steps. Moreover, most existing agents are evaluated on robotic or vision benchmarks which, while natural for biological agents, fall short of real-world industrial complexity. We address these limitations with a generative–policy architecture featuring (i) a multi-step latent transition that lets the generative model predict an entire horizon in a single look-ahead, (ii) an integrated policy network that enables the transition and receives gradients of the expected free energy, (iii) an alternating optimization scheme that updates model and policy from a replay buffer, and (iv)  a single gradient step that plans over long horizons, eliminating exhaustive planning from the control loop. We evaluate our agent in an environment that mimics a realistic industrial scenario with delayed and long-horizon settings. The empirical results confirm the effectiveness of the proposed approach, demonstrating the coupled world-model with the AIF formalism yields an end-to-end probabilistic controller capable of effective decision making in delayed, long-horizon settings without handcrafted rewards or expensive planning.",
      "keywords": [
        "Active Inference - Deep Probabilistic Models - Model-based Reinforcement Learning - World Models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "s0JVsx3bx1",
      "title": "1000 Layer Networks for Self-Supervised RL: Scaling Depth Can Enable New Goal-Reaching Capabilities",
      "abstract": "Scaling up self-supervised learning has driven breakthroughs in language and vision, yet comparable progress has remained elusive in reinforcement learning (RL). In this paper, we study building blocks for self-supervised RL that unlock substantial improvements in scalability, with network depth serving as a critical factor. Whereas most RL papers in recent years have relied on shallow architectures (around 2 -- 5 layers), we demonstrate that increasing the depth up to 1024 layers can significantly boost performance.\nOur experiments are conducted in an unsupervised goal-conditioned setting, where no demonstrations or rewards are provided, so an agent must explore (from scratch) and learn how to maximize the likelihood of reaching commanded goals.\nEvaluated on simulated locomotion and manipulation tasks, our approach increases performance on the self-supervised contrastive RL algorithm by $2\\times$ -- $50\\times$, outperforming other goal-conditioned baselines.\nIncreasing the model depth not only increases success rates but also qualitatively changes the behaviors learned.",
      "keywords": [
        "Reinforcement Learning",
        "Self-Supervised Learning",
        "Contrastive RL",
        "Goal-conditioned RL",
        "Scaling"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "z5KTxW5sJd",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "jMhRbV47pS",
      "title": "The emergence of sparse attention: impact of data distribution and benefits of repetition",
      "abstract": "Emergence is a fascinating property of large language models and neural networks more broadly: as models scale and train for longer, they sometimes develop new abilities in sudden ways. Despite initial studies, we still lack a comprehensive understanding of how and when these abilities emerge. To address this gap, we study the emergence over training of sparse attention, a critical and frequently observed attention pattern in Transformers. By combining theoretical analysis of a toy model with empirical observations on small Transformers trained on a linear regression variant, we uncover the mechanics driving sparse attention emergence and reveal that emergence timing follows power laws based on task structure, architecture, and optimizer choice. We additionally find that repetition can greatly speed up emergence. Finally, we confirm these results on a well-studied in-context associative recall task. Our findings provide a simple, theoretically grounded framework for understanding how data distributions and model design influence the learning dynamics behind one form of emergence.",
      "keywords": [
        "emergence",
        "sparse attention",
        "in-context learning",
        "induction head"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "obXGSmmG70",
      "title": "AdaCoT: Pareto-Optimal Adaptive Chain-of-Thought Triggering via Reinforcement Learning",
      "abstract": "Large Language Models (LLMs) have demonstrated remarkable capabilities but often face challenges with tasks requiring sophisticated reasoning. While Chain-of-Thought (CoT) prompting significantly enhances reasoning, it indiscriminately generates lengthy reasoning steps for all queries, leading to substantial computational costs and inefficiency, especially for simpler inputs. To address this critical issue, we introduce AdaCoT (Adaptive Chain-of-Thought), a novel framework enabling LLMs to adaptively decide when to invoke CoT. AdaCoT framed adaptive reasoning as a Pareto optimization problem that seeks to balance model performance with the costs associated with CoT invocation (both frequency and computational overhead). We propose a reinforcement learning (RL) based method, specifically utilizing Proximal Policy Optimization (PPO), to dynamically control the CoT triggering decision boundary by adjusting penalty coefficients, thereby allowing the model to determine CoT necessity based on implicit query complexity. A key technical contribution is Selective Loss Masking (SLM), designed to counteract decision boundary collapse during multi-stage RL training, ensuring robust and stable adaptive triggering. Experimental results demonstrate that AdaCoT successfully navigates the Pareto frontier, achieving substantial reductions in CoT usage for queries not requiring elaborate reasoning. For instance, on our production traffic testset, AdaCoT reduced CoT triggering rates to as low as 3.18% and decreased average response tokens by 69.06% on APP, while maintaining high performance on complex tasks. This substantial token decrease directly translates to a significant reduction in inference computational load. AdaCoT pioneers adaptive CoT triggering, offering a practical and principled solution for developing more efficient, responsive, and cost-effective LLMs, particularly crucial for interactive and resource-sensitive applications.",
      "keywords": [
        "Adaptive Reasoning",
        "Chain-of-Thought",
        "Large Language Models"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "24wDPGiDzA",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "LPUr2CexmX",
      "title": "DO-EM: Density Operator Expectation Maximization",
      "abstract": "Density operators, quantum generalizations of probability distributions, are gaining prominence in machine learning due to their foundational role in quantum computing. Generative modeling based on density operator models (**DOMs**) is an emerging field, but existing training algorithms - such as those for the Quantum Boltzmann Machine - do not scale to real-world data, such as the MNIST dataset. The Expectation-Maximization algorithm has played a fundamental role in enabling scalable training of probabilistic latent variable models on real-world datasets. *In this paper, we develop an Expectation-Maximization framework to learn latent variable models defined through **DOMs** on classical hardware, with resources comparable to those used for probabilistic models, while scaling to real-world data.* However, designing such an algorithm is nontrivial due to the absence of a well-defined quantum analogue to conditional probability, which complicates the Expectation step. To overcome this, we reformulate the Expectation step as a quantum information projection (QIP) problem and show that the Petz Recovery Map provides a solution under sufficient conditions. Using this formulation, we introduce the Density Operator Expectation Maximization (DO-EM) algorithm - an iterative Minorant-Maximization procedure that optimizes a quantum evidence lower bound. We show that the **DO-EM** algorithm ensures non-decreasing log-likelihood across iterations for a broad class of models. Finally, we present Quantum Interleaved Deep Boltzmann Machines (**QiDBMs**), a **DOM** that can be trained with the same resources as a DBM. When trained with **DO-EM** under Contrastive Divergence, a **QiDBM** outperforms larger classical DBMs in image generation on the MNIST dataset, achieving a 40–60% reduction in the Fréchet Inception Distance.",
      "keywords": [
        "Density Operators",
        "Expectation-Maximization",
        "Quantum Unsupervised Learning",
        "Latent Variable Models"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "WCRPgBpbcA",
      "title": "A multiscale analysis of mean-field transformers in the moderate interaction regime",
      "abstract": "In this paper, we study the evolution of tokens through the depth of encoder-only transformer models at inference time by modeling them as a system of particles interacting in a mean-field way and studying the corresponding dynamics. More specifically, we consider this problem in the moderate interaction regime, where the number $N$ of tokens is large and the inverse temperature parameter $\\beta$ of the model scales together with $N$. In this regime, the dynamics of the system displays a multiscale behavior: a fast phase, where the token empirical measure collapses on a low-dimensional space, an intermediate phase, where the measure further collapses into clusters, and a slow one, where such clusters sequentially merge into a single one. We provide a rigorous characterization of the limiting dynamics in each of these phases and prove convergence in the above mentioned limit, exemplifying our results with some simulations.",
      "keywords": [
        "mean-field limits",
        "moderate interaction",
        "mean-field transformers",
        "self-attention models",
        "clustering",
        "multiscale"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "6VoDizmIoY",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "FQbkBcpcvA",
      "title": "Rethinking cross entropy for continual fine-tuning: policy gradient with entropy annealing",
      "abstract": "While large pretrained vision models have achieved widespread success, their post-training adaptation in continual learning remains vulnerable to catastrophic forgetting. We challenge the conventional use of cross-entropy (CE) loss, a surrogate for 0-1 loss, by reformulating classification through reinforcement learning. Our approach frames classification as a one-step Markov Decision Process (MDP), where input samples serve as states, class labels as actions, and a fully observable reward model is derived from ground-truth labels.  From this formulation, we derive Expected Policy Gradient (EPG), a gradient-based method that directly minimizes the 0-1 loss (i.e., misclassification error). Theoretical and empirical analyses reveal a critical distinction between EPG and CE: while CE encourages exploration via high-entropy outputs, EPG adopts an exploitation-centric approach, prioritizing high-confidence samples through implicit sample weighting. Building on this insight, we propose an adaptive entropy annealing strategy (aEPG) that transitions from exploratory to exploitative learning during continual adaptation of a pre-trained model. Our method outperforms CE-based optimization across diverse benchmarks (Split-ImageNet-R, Split-Food101, Split-CUB100, CLRS) and parameter-efficient modules (LoRA, Adapter, Prefix). More broadly, we evaluate various entropy regularization methods and demonstrate that lower entropy of the output prediction distribution enhances adaptation in pretrained vision models. These findings suggest that excessive exploration may disrupt pretrained knowledge and establish exploitative learning as a crucial principle for adapting foundation vision models to evolving classification tasks.",
      "keywords": [
        "Continual learning",
        "reinforcement learning",
        "cross-entropy",
        "class-incremental learning"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "zIzZxDsNNP",
      "title": "PhySense: Sensor Placement Optimization for Accurate Physics Sensing",
      "abstract": "Physics sensing plays a central role in many scientific and engineering domains, which inherently involves two coupled tasks: reconstructing dense physical fields from sparse observations and optimizing scattered sensor placements to observe maximum information. While deep learning has made rapid advances in sparse-data reconstruction, existing methods generally omit optimization of sensor placements, leaving the mutual enhancement between reconstruction and placement on the shelf. To change this suboptimal practice, we propose PhySense, a synergistic two-stage framework that learns to jointly reconstruct physical fields and to optimize sensor placements, both aiming for accurate physics sensing. The first stage involves a flow-based generative model enhanced by cross-attention to adaptively fuse sparse observations. Leveraging the reconstruction feedback, the second stage performs sensor placement via projected gradient descent to satisfy spatial constraints. We further prove that the learning objectives of the two stages are consistent with classical variance-minimization principles, providing theoretical guarantees. Extensive experiments across three challenging benchmarks, especially a 3D geometry dataset, indicate PhySense achieves state-of-the-art physics sensing accuracy and discovers informative sensor placements previously unconsidered. Code is available at this repository: https://github.com/thuml/PhySense.",
      "keywords": [
        "Physics sensing",
        "sensor placement",
        "flow models"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "jSeWBdH0Xx",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "DwFDfrPsm8",
      "title": "NOVA: A Benchmark for Rare Anomaly Localization and Clinical Reasoning in Brain MRI",
      "abstract": "In many real-world applications, deployed models encounter inputs that differ from the data seen during training. Open-world recognition ensures that such systems remain robust as ever-emerging, previously _unknown_ categories appear and must be addressed without retraining.\nFoundation and vision-language models are pre-trained on large and diverse datasets with the expectation of broad generalization across domains, including medical imaging.\nHowever, benchmarking these models on test sets with only a few common outlier types silently collapses the evaluation back to a closed-set problem, masking failures on rare or truly novel conditions encountered in clinical use.\n\nWe therefore present NOVA, a challenging, real-life _evaluation-only_ benchmark of $\\sim$900 brain MRI scans that span 281 rare pathologies and heterogeneous acquisition protocols. Each case includes rich clinical narratives and double-blinded expert bounding-box annotations. Together, these enable joint assessment of anomaly localisation, visual captioning, and diagnostic reasoning. \nBecause NOVA is never used for training, it serves as an _extreme_ stress-test of out-of-distribution generalisation: models must bridge a distribution gap both in sample appearance and in semantic space.  \nBaseline results with leading vision-language models (GPT-4o, Gemini 2.0 Flash, and Qwen2.5-VL-72B) reveal substantial performance drops, with approximately a 65\\% gap in localisation compared to natural-image benchmarks and 40\\% and 20\\% gaps in captioning and reasoning, respectively, compared to resident radiologists. Therefore, NOVA establishes a testbed for advancing models that can detect, localize, and reason about truly unknown anomalies.",
      "keywords": [
        "Vision-Language Models",
        "Zero-shot Learning",
        "Anomaly Detection",
        "Dataset Benchmarking",
        "Medical Imaging",
        "Brain MRI",
        "Multi-modal Data",
        "Rare Diseases"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eR8raBLZW7",
      "title": "BriLLM: Brain-inspired Large Language Model",
      "abstract": "This paper reports the brain-inspired large language model (BriLLM). This is a non-Transformer, non-GPT, non-traditional machine learning input-output controlled generative language model. The model is based on the Signal Fully-connected flowing (SiFu) definition on the directed graph in terms of the neural network, and has the interpretability of all nodes on the graph of the whole model, instead of the traditional machine learning model that only has limited interpretability at the input and output ends. In the language model scenario, the token is defined as a node in the graph. A randomly shaped or user-defined signal flow flows between nodes on the principle of \"least resistance\" along paths. The next token or node to be predicted or generated is the target of the signal flow. As a language model, BriLLM theoretically supports infinitely long $n$-gram models when the model size is independent of the input and predicted length of the model. The model's working signal flow provides the possibility of recall activation and innate multi-modal support similar to the cognitive patterns of the human brain. At present, we released the first BriLLM versions in Chinese and English, with 4000 tokens, 32-dimensional node size, 32-token sequence prediction ability, model sizes around 2B and 1B respectively, bringing language model prediction performance comparable to GPT-1.",
      "keywords": [
        "LLM"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "IoSLbwZkal",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "AhyjbXSmUN",
      "title": "Probabilistic Temporal Sampling for Anomaly Detection in Ethereum Networks",
      "abstract": "The rapid growth of the Ethereum network necessitates advanced anomaly detection techniques to enhance security, transparency, and resilience against evolving malicious activities. While there have been significant strides in anomaly detection, they often fall short in capturing the intricate spatial-temporal patterns inherent in blockchain transactional data. This study presents a scalable framework that integrates Graph Convolutional Networks (GCNs) with Temporal Random Walks (TRW) specifically designed to adapt to the complexities and temporal dynamics of the Ethereum transaction network. Unlike traditional methods that focus on detecting specific attack types, such as front-running or flash loan exploits, our approach targets time-sensitive anomalies more broadly—detecting irregularities such as rapid transaction bursts, anomalous token swaps, and sudden volume spikes. This broader focus reduces reliance on pre-defined attack categories, making the method more adaptable to emerging and evolving malicious strategies. To ground our contributions, we establish three theoretical results: (1) the effectiveness of TRW in enhancing GCN-based anomaly detection by capturing temporal dependencies, (2) the identification of weight cancellation conditions in the anomaly detection process, and (3) the scalability and efficiency improvements of GCNs achieved through probabilistic sampling. Empirical evaluations demonstrate that the TRW-GCN framework outperforms state-of-the-art Temporal Graph Attention Networks (TGAT) in detecting time-sensitive anomalies. Furthermore, as part of our ablation study, we evaluated various anomaly detection techniques on the TRW-GCN embeddings and found that our proposed scoring classifier consistently achieves higher accuracy and precision compared to baseline methods such as Isolation Forest, One-Class SVM, and DBSCAN, thereby validating the robustness and adaptability of our framework.",
      "keywords": [
        "Probabilistic sampling",
        "Temporal random walk",
        "Graph convolutional networks",
        "Transaction anomaly detection",
        "Ethereum networks"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "eafIjoZAHm",
      "title": "GnnXemplar: Exemplars to Explanations - Natural Language Rules for Global GNN Interpretability",
      "abstract": "Graph Neural Networks (GNNs) are widely used for node classification, yet their opaque decision-making limits trust and adoption. While local explanations offer insights into individual predictions, global explanation methods—those that characterize an entire class—remain underdeveloped. Existing global explainers rely on motif discovery in small graphs, an approach that breaks down in large, real-world settings where subgraph repetition is rare, node attributes are high-dimensional, and predictions arise from complex structure-attribute interactions. We propose GnnXemplar, a novel global explainer inspired from Exemplar Theory from cognitive science. GnnXemplar identifies representative nodes in the GNN embedding space—exemplars—and explains predictions using natural language rules derived from their neighborhoods. Exemplar selection is framed as a coverage maximization problem over reverse $k$-nearest neighbors, for which we provide an efficient greedy approximation. To derive interpretable rules, we employ a self-refining prompt strategy using large language models (LLMs). Experiments across diverse benchmarks show that GnnXemplar significantly outperforms existing methods in fidelity, scalability, and human interpretability, as validated by a user study with 60 participants.",
      "keywords": [
        "graph neural network",
        "graph machine learning",
        "explainability",
        "xai",
        "global explanation",
        "text-based explanation",
        "exemplar",
        "exemplar theory"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "yrrU5YChQr",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "7AwFJzgIUW",
      "title": "Dynamical Low-Rank Compression of Neural Networks with Robustness under Adversarial Attacks",
      "abstract": "Deployment of neural networks on resource-constrained devices demands models that are both compact and robust to adversarial inputs. However, compression and adversarial robustness often conflict. In this work, we introduce a dynamical low-rank training scheme enhanced with a novel spectral regularizer that controls the condition number of the low-rank core in each layer. This approach mitigates the sensitivity of compressed models to adversarial perturbations without sacrificing clean accuracy. The method is model- and data-agnostic, computationally efficient, and supports rank adaptivity to automatically compress the network at hand. Extensive experiments across standard architectures, datasets, and adversarial attacks show the regularized networks can achieve over 94 compression while recovering or improving adversarial accuracy relative to uncompressed baselines.",
      "keywords": [
        "Low Rank",
        "Adversarial Robustenss",
        "Adversarial Attacks",
        "Rank Adaptive",
        "Computer Vision",
        "Compression"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "QJtanJS4T9",
      "title": "Irreducible Loss Floors in Gradient Descent Convergence and Energy Footprint",
      "abstract": "Despite their central role, convergence analyses of the dynamics of loss functions\nduring training require strong assumptions (e.g convexity and smoothness) which\nare non-trivial to prove. In this work, we introduce a framework for deriving\nnecessary convergence conditions that hold without restrictive assumptions on\nthe dataset or the model architecture. By linking microscopic properties such as\nindividual sample losses and their gradient to macroscopic training dynamics, we\nderive tight lower bounds for loss functions, applicable to both full-batch and mini-\nbatch gradient systems. These bounds reveal the presence of irreducible floors\nthat optimizers cannot surpass and beyond theoretical guarantees, this framework offers a practical tool for anticipating convergence speed, and estimating\nminimum training time and energy requirements. Thus, this framework can be\nused to ensure the sustainability and feasibility of large-scale training regimes.",
      "keywords": [
        "gradient descent",
        "convergence",
        "loss bounds",
        "optimization",
        "training dynamics",
        "sustainability",
        "efficiency",
        "feasibility",
        "computational cost",
        "irreducible loss",
        "non-convex optimization",
        "lower bounds"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "I4fBSpDOha",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "RxkCwOKVKa",
      "title": "Breaking the Performance Ceiling in Reinforcement Learning requires Inference Strategies",
      "abstract": "Reinforcement learning (RL) systems have countless applications, from energy-grid management to protein design. However, such real-world scenarios are often extremely difficult, combinatorial in nature, and require complex coordination between multiple agents. This level of complexity can cause even state-of-the-art RL systems, trained until convergence, to hit a performance ceiling which they are unable to break out of with zero-shot inference. Meanwhile, many digital or simulation-based applications allow for an inference phase that utilises a specific time and compute budget to explore multiple attempts before outputting a final solution. In this work, we show that such an inference phase employed at execution time, and the choice of a corresponding inference strategy, are key to breaking the performance ceiling observed in complex multi-agent RL problems. Our main result is striking: we can obtain up to a 126% and, on average, a 45% improvement over the previous state-of-the-art across 17 tasks, using only a couple seconds of extra wall-clock time during execution. We also demonstrate promising compute scaling properties, supported by over 60k experiments, making it the largest study on inference strategies for complex RL to date. We make all of our experimental data and code available.",
      "keywords": [
        "reinforcement learning",
        "inference strategies",
        "complex decision-making"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "cEd00CXWE5",
      "title": "Beyond Two-Stage Training: Integrating SFT and RL for Improved Reasoning in LLMs",
      "abstract": "Reinforcement learning (RL) has proven effective in incentiving the reasoning abilities of large language models (LLMs), but faces significant efficiency challenges due to its extensive trial-and-error nature. A common practice is to employ supervised fine-tuning (SFT) as a warm-up stage; however, this decoupled two-stage approach limits interaction between SFT and RL, thereby constraining overall effectiveness. This study introduces a novel method for learning reasoning models that employs bilevel optimization to facilitate better cooperation between these training paradigms. Specifically, the SFT objective is explicitly conditioned on the optimal solution of the RL objective. During training, lower-level updates enable the model to receive SFT supervision concurrently with RL-based exploration, while upper-level updates are optimized to ensure that the joint training yields higher rewards than RL alone. Empirical evaluations on five reasoning benchmarks demonstrate that our method consistently outperforms baselines and achieves a better balance between effectiveness and efficiency.",
      "keywords": [
        "LLM",
        "Reasoning",
        "RL",
        "SFT"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "IL1wvzOgqD",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "b6s1jIHj6o",
      "title": "Hear you are: Teaching LLMs Spatial Reasoning with Vision and Spatial Sound",
      "abstract": "Many audio-visual learning methods have focused on aligning audio and visual information, either through semantic or temporal correspondence. However, most of these works have utilized monaural audio, which does not contain information about the spatial location of the sound source. In contrast, humans and other animals utilize binaural hearing to perceive this spatial information. Combining spatial sound and visual perception enables powerful high-level reasoning: for example, a person looking for their phone may hear the ringing sound coming from a backpack sitting on a table, and quickly infer that the missing phone is inside the backpack. In this paper, we investigate the problem of Audio-Visual Spatial Reasoning. We design a spatial audio-visual question answering dataset to cover scenarios where semantic correspondence between audio and visual signals is absent but spatial alignment exists, as well as cases with multiple audio-visual semantic correspondences that require spatial reasoning to disambiguate. We propose a model that learns spatial comprehension across the audio and vision modalities by connecting them with a large language model and experimentally demonstrate that spatial sound perception is an essential part of our task.",
      "keywords": [
        "Audio-visual Spatial Reasoning",
        "Spatial audio",
        "Multi-modal LLMs"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "jzPQRbGkAq",
      "title": "Deep Compositional Phase Diffusion for Long Motion Sequence Generation",
      "abstract": "Recent research on motion generation has shown significant progress in generating semantically aligned motion with singular semantics. However, when employing these models to create composite sequences containing multiple semantically generated motion clips, they often struggle to preserve the continuity of motion dynamics at the transition boundaries between clips, resulting in awkward transitions and abrupt artifacts. To address these challenges, we present Compositional Phase Diffusion, which leverages the Semantic Phase Diffusion Module (SPDM) and Transitional Phase Diffusion Module (TPDM) to progressively incorporate semantic guidance and phase details from adjacent motion clips into the diffusion process. Specifically, SPDM and TPDM operate within the latent motion frequency domain established by the pre-trained Action-Centric Motion Phase Autoencoder (ACT-PAE). This allows them to learn semantically important and transition-aware phase information from variable-length motion clips during training. Experimental results demonstrate the competitive performance of our proposed framework in generating compositional motion sequences that align semantically with the input conditions, while preserving phase transitional continuity between preceding and succeeding motion clips. Additionally, motion inbetweening task is made possible by keeping the phase parameter of the input motion sequences fixed throughout the diffusion process, showcasing the potential for extending the proposed framework to accommodate various application scenarios. Codes are available at\nhttps://github.com/asdryau/TransPhase.",
      "keywords": [
        "Motion Generation",
        "Phase Autoencoder",
        "Long Term Motion Sequence Generation",
        "Motion Inbetweening"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "uUWb5eawL9",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "7AwFJzgIUW",
      "title": "Dynamical Low-Rank Compression of Neural Networks with Robustness under Adversarial Attacks",
      "abstract": "Deployment of neural networks on resource-constrained devices demands models that are both compact and robust to adversarial inputs. However, compression and adversarial robustness often conflict. In this work, we introduce a dynamical low-rank training scheme enhanced with a novel spectral regularizer that controls the condition number of the low-rank core in each layer. This approach mitigates the sensitivity of compressed models to adversarial perturbations without sacrificing clean accuracy. The method is model- and data-agnostic, computationally efficient, and supports rank adaptivity to automatically compress the network at hand. Extensive experiments across standard architectures, datasets, and adversarial attacks show the regularized networks can achieve over 94 compression while recovering or improving adversarial accuracy relative to uncompressed baselines.",
      "keywords": [
        "Low Rank",
        "Adversarial Robustenss",
        "Adversarial Attacks",
        "Rank Adaptive",
        "Computer Vision",
        "Compression"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "QJtanJS4T9",
      "title": "Irreducible Loss Floors in Gradient Descent Convergence and Energy Footprint",
      "abstract": "Despite their central role, convergence analyses of the dynamics of loss functions\nduring training require strong assumptions (e.g convexity and smoothness) which\nare non-trivial to prove. In this work, we introduce a framework for deriving\nnecessary convergence conditions that hold without restrictive assumptions on\nthe dataset or the model architecture. By linking microscopic properties such as\nindividual sample losses and their gradient to macroscopic training dynamics, we\nderive tight lower bounds for loss functions, applicable to both full-batch and mini-\nbatch gradient systems. These bounds reveal the presence of irreducible floors\nthat optimizers cannot surpass and beyond theoretical guarantees, this framework offers a practical tool for anticipating convergence speed, and estimating\nminimum training time and energy requirements. Thus, this framework can be\nused to ensure the sustainability and feasibility of large-scale training regimes.",
      "keywords": [
        "gradient descent",
        "convergence",
        "loss bounds",
        "optimization",
        "training dynamics",
        "sustainability",
        "efficiency",
        "feasibility",
        "computational cost",
        "irreducible loss",
        "non-convex optimization",
        "lower bounds"
      ],
      "decision": "Reject",
      "year": "2025"
    }
  },
  {
    "group_id": "7ieS4EYKnB",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "InyYuWLWHD",
      "title": "LayerGuard: Poisoning-Resilient Federated Learning via Layer-Wise Similarity Analysis",
      "abstract": "In recent years, model poisoning attacks have gradually evolved from conventional global parameter manipulations to more stealthy and strategic Targeted Layer Poisoning (TLP) attacks.These attacks achieve high attack success rates by selectively poisoning only a subset of layers. However, most existing defenses rely on evaluation of the entire network and are thus ineffective against TLP attacks, posing new challenges to the security of Federated Learning (FL).In this paper, we propose \\textbf{LayerGuard}, a comprehensive defense framework featuring dynamic detection and adaptive aggregation to protect FL against advanced model poisoning attacks. Diverging from traditional methods that analyze the entire network collectively, \\textbf{LayerGuard} performs layer-wise similarity analysis to detect anomalous clients and adaptively identifies layers under attack based on the clustering behavior of malicious updates, facilitating more precise threat detection. Building on this, we introduce a joint weighting mechanism in the aggregation process, which evaluates each client's credibility at the layer level from two complementary informational dimensions: inter-layer and intra-layer, balancing attack mitigation and benign contribution retention. Extensive experiments across various datasets and model architectures demonstrate that \\textbf{LayerGuard} successfully reduces the average attack success rate of TLP attacks to around 5\\%. Moreover, when confronted with other advanced model poisoning attacks, \\textbf{LayerGuard} consistently maintains global model accuracy—even under high poisoning rates and severe non-IID conditions—comparable to that of FedAvg under no-attack settings, marking a significant improvement over existing defenses.",
      "keywords": [
        "Federated Learning; Security; Model Poisoning Attacks; Robust Aggregation"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "aUAG1WS7J2",
      "title": "Class-wise Balancing Data Replay for Federated Class-Incremental Learning",
      "abstract": "Federated Class Incremental Learning (FCIL) aims to collaboratively process continuously increasing incoming tasks across multiple clients. Among various approaches, data replay has become a promising solution, which can alleviate forgetting by reintroducing representative samples from previous tasks. However, their performance is typically limited by class imbalance, both within the replay buffer due to limited global awareness and between replayed and newly arrived classes. To address this issue, we propose a class-wise balancing data replay method for FCIL (FedCBDR), which employs a global coordination mechanism for class-level memory construction and reweights the learning objective to alleviate the aforementioned imbalances. Specifically, FedCBDR has two key components: 1) the global-perspective data replay module reconstructs global representations of prior task knowledge in a privacy-preserving manner, which then guides a class-aware and importance-sensitive sampling strategy to achieve balanced replay; 2) Subsequently, to handle class imbalance across tasks, the task-aware temperature scaling module adaptively adjusts the temperature of logits at both class and instance levels based on task dynamics, which reduces the model’s overconfidence in majority classes while enhancing its sensitivity to minority classes. Experimental results verified that FedCBDR achieves balanced class-wise sampling under heterogeneous data distributions and improves generalization under task imbalance between earlier and recent tasks, yielding a 2%-15% Top-1 accuracy improvement over six state-of-the-art methods.",
      "keywords": [
        "Federated Learning;Federated Class-Incremental Learning; Data Replay"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  },
  {
    "group_id": "pISLZG7ktL",
    "difficulty": "easy",
    "pair_type": "reject-oral",
    "paper_a": {
      "paper_id": "JaRihIHbZm",
      "title": "VideoAgent: Self-Improving Video Generation",
      "abstract": "Video generation has been used to generate visual plans for controlling robotic systems. Given an image observation and a language instruction, previous work has generated video plans which are then converted to robot controls to be executed. However, a major bottleneck in leveraging video generation for control lies in the quality of the generated videos, which often suffer from hallucinatory content and unrealistic physics, resulting in low task success when control actions are extracted from the generated videos. While scaling up dataset and model size provides a partial solution, integrating external feedback is both natural and essential for grounding video generation in the real world. With this observation, we propose VideoAgent for self-improving generated video plans based on external feedback. Instead of directly executing the generated video plan, VideoAgent first refines the generated video plans using a novel procedure which we call self-conditioning consistency, utilizing feedback from a pretrained vision-language model (VLM). As the refined video plan is being executed, VideoAgent collects additional data from the environment to further improve video plan generation. Experiments in simulated robotic manipulation from MetaWorld and iTHOR show that VideoAgent drastically reduces hallucination, thereby boosting success rate of downstream manipulation tasks. We further illustrate that VideoAgent can effectively refine real-robot videos, providing an early indicator that robotics can be an effective tool in grounding video generation in the physical world.",
      "keywords": [
        "sequential decision making",
        "video generation",
        "self improvement"
      ],
      "decision": "Reject",
      "year": "2025"
    },
    "paper_b": {
      "paper_id": "pISLZG7ktL",
      "title": "Data Scaling Laws in Imitation Learning for Robotic Manipulation",
      "abstract": "Data scaling has revolutionized fields like natural language processing and computer vision, providing models with remarkable generalization capabilities. In this paper, we investigate whether similar data scaling laws exist in robotics, particularly in robotic manipulation, and whether appropriate data scaling can yield single-task robot policies that can be deployed zero-shot for any object within the same category in any environment. To this end, we conduct a comprehensive empirical study on data scaling in imitation learning. By collecting data across numerous environments and objects, we study how a policy’s generalization performance changes with the number of training environments, objects, and demonstrations. Throughout our research, we collect over 40,000 demonstrations and execute more than 15,000 real-world robot rollouts under a rigorous evaluation protocol. Our findings reveal several intriguing results: the generalization performance of the policy follows a roughly power-law relationship with the number of environments and objects. The diversity of environments and objects is far more important than the absolute number of demonstrations; once the number of demonstrations per environment or object reaches a certain threshold, additional demonstrations have minimal effect. Based on these insights, we propose an efficient data collection strategy. With four data collectors working for one afternoon, we collect sufficient data to enable the policies for two tasks to achieve approximately 90\\% success rates in novel environments with unseen objects.",
      "keywords": [
        "Data Scaling Laws",
        "Imitation Learning",
        "Robotic Manipulation"
      ],
      "decision": "Accept (Oral)",
      "year": "2025"
    }
  }
]