\documentclass[11pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{url}
\usepackage{tikz-cd}
\usepackage{multicol}

\setlength{\parindent}{0pt}
\setlength{\parskip}{1\baselineskip}

\begin{document}

\section*{Paper Outline: Attention Binding as Early Warning}

\subsection*{Title (working)}

Attention-Head Binding as a Term-Conditioned Mechanistic Marker of Accessibility Concept Emergence in Language Models.

\subsection*{Abstract}

Detecting when language models acquire specific capabilities remains challenging: behavioral evaluations are expensive, and internal representations are opaque.
We introduce \emph{attention-head binding} (EB$^*$), a lightweight mechanistic metric that tracks how attention heads bind multi-token technical terms---such as accessibility concepts (``screen reader,'' ``alt text'')---into coherent units during training.

Using Pythia models (160M, 1B, 2.8B) across eight checkpoints, we report four empirical findings (C1, C3, C4, C5).
At 160M and 2.8B, binding precedes behavioral competence (Spearman $r=0.33$--$0.34$, $p<0.001$), serving as an early warning signal (C1).
At 1B, we observe a \emph{decoupling effect}: binding saturates early while behavior continues improving, revealing divergent developmental trajectories (C4).
High-binding/mid-accuracy checkpoints contain \emph{unlockable latent knowledge}: few-shot prompting yields up to +61 percentage points improvement (183\% relative gain) and near-ceiling generation scores (94.4\%) from low zero-shot baselines (C3).
Causal ablation reveals opposite mechanistic regimes across scales: high-binding heads are necessary at 160M (ablation impairs recognition by $-16.7$ percentage points~[pp]) but \emph{functionally superseded} at 2.8B (ablation improves recognition by $+33.3$~pp), providing direct evidence for the decoupling phenomenon (C5).

These findings establish attention binding as a diagnostic for concept emergence and demonstrate that mechanistic structure and behavioral competence undergo qualitative transformation across model scales---a phenomenon we term the \emph{binding--behavior decoupling effect}.

\subsection*{1. Introduction}

Understanding how language models acquire and represent domain-specific knowledge is a central challenge in mechanistic interpretability \cite{olah2020zoom,elhage2021transformer}.
While behavioral evaluations reveal \emph{what} a model knows, they provide limited insight into \emph{how} and \emph{when} internal representations form during training.
This gap is particularly consequential for socially critical domains such as web accessibility---where models must reason about technical standards (WCAG), assistive technologies (screen readers), and semantic markup (ARIA) \cite{w3c2018wcag}.

Large language models (LLMs) exhibit \emph{emergent capabilities}---abilities that appear abruptly with scale rather than improving smoothly---and whether such ``emergence'' reflects genuine dynamical transitions or measurement artifacts remains debated \cite{wei2022emergent,schaeffer2023emergence}.
More broadly, performance in many regimes follows predictable scaling trends with model size and data \cite{kaplan2020scaling}, motivating mechanistic signals that can anticipate capability changes without exhaustive evaluation.
Practically, this creates a prediction problem: without expensive behavioral evaluation, practitioners cannot reliably anticipate which models will exhibit particular competencies.

Recent work has sought mechanistic early-warning signals.
Sparse autoencoders (SAEs) can extract features and track their formation across training \cite{bricken2023monosemanticity}, but require training auxiliary models.
Consistency-based methods such as CCS probe for latent knowledge via activation-space structure \cite{burns2022discovering}, but are not designed to track concept formation dynamics over checkpoints.
Circuit tracing approaches identify subnetworks supporting specific capabilities \cite{olsson2022context}, but have primarily been demonstrated on algorithmic rather than domain-specific semantic tasks.

A gap therefore remains: \emph{how can we detect when a model has learned to treat a specific multi-token concept as a coherent unit---validated through checkpoint-level dynamics and causal intervention---before it reliably exhibits behavioral competence?}

We address this gap by introducing \emph{attention-head binding} (EB$^*$), a mechanistic metric that quantifies how strongly individual attention heads bind the constituent tokens of multi-token technical terms---such as ``screen reader,'' ``skip link,'' and ``alt text''---into coherent conceptual units.
Our central hypothesis is that this binding signal serves as an early, internal marker of concept acquisition that precedes externally observable behavioral competence.

\begin{sloppypar}
This builds on three lines of work.
\textbf{Multi-token phrase processing.} Multi-token phrases can fail to receive stable, holistic representations in transformers, with information localized to particular layer regions.
Our metrics operationalize this by measuring whether attention routes information among span tokens (e.g., ``screen'' $\leftrightarrow$ ``reader'').
\textbf{Attention as mechanism.} The ``attention is not explanation'' critique showed attention weights can mislead \cite{jain2019attention,wiegreffe2019attention}; we treat binding scores as hypotheses requiring causal validation \cite{olsson2022context}.
\textbf{Checkpoint dynamics.} The Pythia suite enables fine-grained training analysis \cite{biderman2023pythia}; we leverage this to test whether binding precedes behavior and to characterize non-monotonic dynamics.
\end{sloppypar}

We study Pythia models (160M, 1B, 2.8B) across eight checkpoints (step 0--143K).
Our contributions are organized around four empirical claims (C1, C3--C5); a fifth claim concerning representational stability to prompt perturbations (C2) remains for future work (see Section~5.4).\footnote{Claim C2 concerning stability to prompt perturbations was deprioritized for this study due to computational constraints; preliminary analysis suggests the effect is secondary to the binding--behavior dynamics reported here.}

\begin{enumerate}
  \item \textbf{Lead-lag emergence (C1).} Binding (EB$^*$) rises before behavioral competence, establishing temporal precedence (Section~4.1).
  \item \textbf{Scale-dependent decoupling (C4).} At 1B, binding saturates early while behavior improves---revealing divergent mechanistic-behavioral trajectories (Section~4.2).
  \item \textbf{Unlockable latent knowledge (C3).} High-binding/mid-accuracy checkpoints yield large generation improvements under few-shot prompting when EB$^*>0.6$ (Section~4.3).
  \item \textbf{Cross-scale causal regimes (C5).} Ablating high-binding heads reveals opposite effects: necessary at 160M but \emph{functionally superseded} at 2.8B, providing mechanistic evidence for decoupling (Section~4.4).
\end{enumerate}

These findings establish attention binding as a diagnostic for concept emergence and reveal that the structure--behavior relationship undergoes qualitative transformation across scales---a phenomenon we term the \emph{binding--behavior decoupling effect}.

The remainder of this paper is organized in three parts.
Part I establishes observational associations across scales (Section~4.1) and characterizes within-training dynamics including the decoupling phenomenon (Section~4.2).
Part II demonstrates practical utility via unlockability experiments (Section~4.3).
Part III validates mechanistic relevance through causal interventions (Section~4.4).
Section~2 reviews related work; Section~3 describes methods; Section~5 discusses implications; Section~6 concludes.

\subsection*{2. Related Work}

\subsubsection*{2.1 Mechanistic Interpretability and Attention}

Mechanistic interpretability seeks to reverse-engineer neural networks into human-understandable components \cite{olah2020zoom,elhage2021transformer}.
Within transformers, attention heads serve as key functional units: induction heads support in-context learning \cite{olsson2022context}, while specialized heads perform distinct functional roles \cite{voita2019analyzing}.

However, the ``attention is not explanation'' critique demonstrated that attention weights can mislead as feature-importance indicators \cite{jain2019attention,wiegreffe2019attention}.
Accordingly, we treat attention-binding scores as mechanistic hypotheses requiring causal validation \cite{olsson2022context}---not as explanatory features but as entry points for intervention studies.

Our work extends this line by identifying attention heads that bind multi-token concepts, using binding strength as a \emph{developmental marker} that tracks formation dynamics across training rather than as a static feature.

\subsubsection*{2.2 Multi-Word Expression Processing}

Multi-token phrases and technical terms can be inconsistently represented in transformers, with information localized to particular layer regions \cite{miletic2024semantics,haviv2023understanding}.
Mileti\'c et al. \cite{miletic2024semantics} survey evidence that transformer models capture multiword expression semantics inconsistently, often relying on memorized information rather than fully compositional processing.
Haviv et al. \cite{haviv2023understanding} analyze idioms as a prototypical case of memorized multi-token expressions and describe layerwise effects consistent with staged recall, suggesting mechanisms that may generalize to broader multiword-expression processing.
We focus on whether attention explicitly binds the tokens of a given term span into a coherent unit.

Our term-conditioned binding metrics operationalize this insight for technical terms (a subclass of MWEs): we test whether a model treats a given multi-token term as a coherent unit via routed attention flow, whether this coherence is layer-localized, and whether it is mechanistically causal for downstream task performance.

\subsubsection*{2.3 Concept Emergence and Training Dynamics}

The study of knowledge emergence during training has gained traction through checkpointed analyses.
Pythia \cite{biderman2023pythia} enables longitudinal study with public intermediate checkpoints.
Prior work examines factual knowledge emergence \cite{swayamdipta2020dataset}, reasoning abilities \cite{wei2022emergent}, and syntactic competence \cite{duan2025syntax} during training.

Our contribution differs in tracking a \emph{term-conditioned mechanistic signal}---attention binding---alongside behavioral competence.
This reveals that internal structure can precede, decouple from, or even antagonize external capability depending on model scale, providing finer-grained diagnostics than global emergence curves.

\subsubsection*{2.4 Latent Capability Detection}

Multiple approaches detect latent structure before reliable behavioral observation.
Activation-space consistency methods such as CCS \cite{burns2022discovering} probe for knowledge via geometric structure.
Circuit-tracking identifies functional subnetworks \cite{wang2023interpretability}.
Sparse autoencoders (SAEs) extract features and track their emergence \cite{bricken2023monosemanticity,cunningham2023sparse}.

Our differentiator is \emph{span-local, term-conditioned mechanistic structure}: we ask whether a model has learned to treat a \emph{specific} multi-token term as a coherent unit, validated through causal intervention.
Unlike SAEs, which require training auxiliary models, our binding metric is lightweight and hypothesis-driven.
Unlike CCS, which probes global representations, we track concept-specific formation dynamics.
We treat SAE-based analyses as natural competitors and include them as baselines where feasible.

\subsubsection*{2.5 Causal Analysis of Attention Heads}

Head ablation (zeroing attention outputs) assesses causal importance of individual heads \cite{voita2019analyzing,michel2019sixteen}.
Recent refinements include activation patching \cite{meng2022locating,wang2023interpretability} and path patching \cite{goldowsky2023localizing}.

We adopt zero-ablation of attention patterns for transparency and reproducibility, finding that even this coarse intervention reveals interpretable cross-scale structure---including the finding that high-binding heads are necessary at small scale but functionally superseded at large scale.

\subsubsection*{2.6 Accessibility in NLP}

Web accessibility standards (WCAG) define requirements for usable digital content \cite{w3c2018wcag}.
While NLP systems increasingly generate web content, accessibility-aware language model evaluation remains limited.
Prior work examines bias in assistive technology descriptions \cite{trewin2019considerations}.
Salas \cite{salas2026testing} provides initial behavioral evaluation of accessibility knowledge in Pythia models.

Our work uses accessibility concepts as a domain for studying mechanistic concept emergence---chosen because these terms are multi-token, domain-specific, and have clear ground-truth evaluations---enabling precise tests of binding--behavior relationships.

\subsection*{3. Methods}

\subsubsection*{3.1 Models and Training Checkpoints}

We use the Pythia model suite \cite{biderman2023pythia}, trained on the Pile dataset \cite{gao2020pile} with publicly available intermediate checkpoints.
We study three scales:

\begin{table}[h]
\centering
\begin{tabular}{|l|r|r|r|r|r|}
\hline
\textbf{Model} & \textbf{Params} & \textbf{Layers} & \textbf{Heads} & \textbf{Head Dim} & \textbf{Total Heads} \\
\hline
Pythia-160M-deduped & 160M & 12 & 12 & 64 & 144 \\
Pythia-1B-deduped & 1B & 16 & 8 & 256 & 128 \\
Pythia-2.8B-deduped & 2.8B & 32 & 32 & 80 & 1,024 \\
\hline
\end{tabular}
\caption{Pythia model architectures.}
\end{table}

For each model, we evaluate eight checkpoints: step 0, 15K, 30K, 60K, 90K, 120K, 140K, and 143K (final), yielding 24 model--checkpoint combinations.
Models are loaded via TransformerLens \cite{nanda2023transformerlens} using \texttt{HookedTransformer} to access activations and attention patterns.

\subsubsection*{3.2 Accessibility Terms and Evaluation Prompts}

We select three multi-token web accessibility terms: \textbf{``screen reader,''} \textbf{``skip link,''} and \textbf{``alt text.''}
These are (i) multi-token and therefore binding-relevant, (ii) domain-specific with relatively clear ground truth, and (iii) practically important for accessibility-aware AI.

For each term, we construct four prompts (two recognition, two generation), yielding 12 prompts in total (6 recognition, 6 generation; see Appendix~A.5).

\paragraph{Recognition (6 prompts).}
Four-choice multiple-choice prompts test factual knowledge.
They are scored via log-probability ranking: for each candidate string $c$, we compute
\[
\frac{1}{|c|}\sum_i \log P\big(c_i\mid \text{prompt},c_{<i}\big)
\]
using the model's next-token log probabilities, and select the highest-scoring choice.
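For concreteness, a minimal sketch of this scoring (the function name is ours; we assume a TransformerLens \texttt{HookedTransformer} instance \texttt{model}):

\begin{verbatim}
import torch

def choice_logprob(model, prompt: str, choice: str) -> float:
    # Length-normalized log P(choice | prompt) under next-token prediction.
    prompt_toks = model.to_tokens(prompt)                     # [1, P]
    choice_toks = model.to_tokens(choice, prepend_bos=False)  # [1, C]
    toks = torch.cat([prompt_toks, choice_toks], dim=1)
    with torch.no_grad():
        log_probs = model(toks).log_softmax(dim=-1)           # [1, P+C, V]
    total = 0.0
    for i in range(choice_toks.shape[1]):
        pos = prompt_toks.shape[1] + i   # position of the i-th choice token
        # logits at pos-1 predict the token at pos
        total += log_probs[0, pos - 1, toks[0, pos]].item()
    return total / choice_toks.shape[1]

# Selection: best = max(choices, key=lambda c: choice_logprob(model, p, c))
\end{verbatim}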

\paragraph{Generation (6 prompts).}
Open-ended completions test conceptual understanding.
They are scored by a keyword rubric: we count word-boundary matches against curated keywords per term, normalize by a threshold of 3 keywords, and apply contradiction penalties, yielding a score in $[0,1]$.
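A minimal sketch of the rubric follows; the keyword lists and the penalty weight shown are illustrative, not the curated ones used in our experiments:

\begin{verbatim}
import re

def rubric_score(output: str, keywords: list[str],
                 contradictions: list[str], threshold: int = 3) -> float:
    # Count word-boundary keyword matches, normalize by the threshold,
    # subtract a penalty per matched contradiction, and clip to [0, 1].
    text = output.lower()
    hits = sum(bool(re.search(rf"\b{re.escape(k)}\b", text))
               for k in keywords)
    bad = sum(bool(re.search(rf"\b{re.escape(c)}\b", text))
              for c in contradictions)
    score = min(hits / threshold, 1.0) - 0.25 * bad  # 0.25 is illustrative
    return max(score, 0.0)

# rubric_score("A screen reader reads text aloud for blind users.",
#              keywords=["blind", "reads", "aloud", "speech"],
#              contradictions=["deaf"])  # -> 1.0
\end{verbatim}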

The \textbf{behavioral score} for each checkpoint is the average across all 12 prompts:
\[
\mathrm{Beh}=\tfrac{1}{2}(\mathrm{RecAcc}+\mathrm{GenScore}).
\]

\subsubsection*{3.3 Attention-Head Binding Metrics}

\paragraph{Attention convention.}
We write $A_{\ell,h}[i,j]$ for the attention weight in layer~$\ell$, head~$h$ from query position~$i$ to key position~$j$.
Thus $A_{\ell,h}[i,j]$ with $i>j$ represents a later token attending to an earlier token (later-to-earlier attention flow).

\paragraph{Binding Strength Index (BSI).}
For a term span occupying token positions $\{s_1,\dots,s_k\}$, the BSI at layer~$\ell$, head~$h$ measures the average later-to-earlier attention within the span \cite{clark2019what,haviv2023understanding,miletic2024semantics}:
\[
\mathrm{BSI}_{\ell,h} = \frac{1}{|\mathcal{P}|} \sum_{(i,j)\in \mathcal{P}} A_{\ell,h}[s_i,s_j],
\]
where $\mathcal{P}=\{(i,j): s_i>s_j\}$ is the set of later-to-earlier token pairs.
While the concept of inspecting intra-span attention patterns has precedents in multi-word expression analysis, the specific directed formulation and its application to tracking concept emergence are novel to this work.

\paragraph{Excess Binding (EB).}
Excess Binding at layer $\ell$ captures how much the best head exceeds the layer average:
\[
\mathrm{EB}_{\ell} = \max_h \mathrm{BSI}_{\ell,h} - \frac{1}{H}\sum_{h=1}^{H} \mathrm{BSI}_{\ell,h},
\]
where $H$ is the number of attention heads in the layer.

\paragraph{Aggregate binding (EB$^*$).}
Our primary binding metric is the maximum EB across layers:
\[
\mathrm{EB}^* = \max_{\ell} \mathrm{EB}_{\ell}.
\]
We report mean EB$^*$ across all 12 prompts per checkpoint.
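For concreteness, a sketch of the three metrics (our naming), operating on per-layer attention patterns of shape \texttt{[n\_heads, seq, seq]}:

\begin{verbatim}
import torch

def bsi_per_head(pattern: torch.Tensor, span: list[int]) -> torch.Tensor:
    # pattern: [n_heads, seq, seq] attention weights (query pos, key pos).
    # span: ascending token positions s_1..s_k of the term.
    pairs = [(si, sj) for si in span for sj in span if si > sj]
    vals = torch.stack([pattern[:, i, j] for i, j in pairs])  # [|P|, H]
    return vals.mean(dim=0)                                   # BSI per head

def eb_star(patterns: list[torch.Tensor], span: list[int]) -> float:
    # EB_l = max-head BSI minus layer-mean BSI; EB* = max over layers.
    ebs = []
    for pattern in patterns:              # one tensor per layer
        bsi = bsi_per_head(pattern, span)
        ebs.append((bsi.max() - bsi.mean()).item())
    return max(ebs)
\end{verbatim}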

\paragraph{Term span identification.}
Term tokens are located via exact subsequence matching of BPE token IDs \cite{sennrich2016neural}, with fallback to character-level search for aliased forms (e.g., ``alternative text'' for ``alt text'').
Multiple encoding variants are tried (bare, space-prefixed, capitalized, title-cased) to handle BPE variability.
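A sketch of the matching logic (function name ours; we assume a Hugging Face-style \texttt{tokenizer.encode}):

\begin{verbatim}
def find_term_span(prompt_ids: list[int], term: str, tokenizer):
    # Try several encoding variants to absorb BPE variability, then
    # locate the first exact subsequence match of the term's token IDs.
    variants = [term, " " + term,
                term.capitalize(), " " + term.capitalize()]
    for v in variants:
        sub = tokenizer.encode(v)
        for start in range(len(prompt_ids) - len(sub) + 1):
            if prompt_ids[start:start + len(sub)] == sub:
                return list(range(start, start + len(sub)))
    return None  # caller falls back to character-level search (aliases)
\end{verbatim}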

\paragraph{Memory-efficient extraction.}
Attention patterns are extracted layer-by-layer using TransformerLens \texttt{run\_with\_cache} with \texttt{stop\_at\_layer} to limit memory.
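A minimal sketch of this loop, caching a single layer's pattern at a time (we believe \texttt{names\_filter} and \texttt{stop\_at\_layer} behave as shown in current TransformerLens, but treat the exact keyword arguments as assumptions):

\begin{verbatim}
from transformer_lens import utils

def patterns_by_layer(model, tokens):
    # Cache only one layer's attention pattern at a time, stopping the
    # forward pass early to bound peak memory.
    for layer in range(model.cfg.n_layers):
        name = utils.get_act_name("pattern", layer)
        _, cache = model.run_with_cache(
            tokens,
            names_filter=name,        # keep only this hook point
            stop_at_layer=layer + 1,  # halt the forward pass early
        )
        yield cache[name][0]          # [n_heads, seq, seq]; batch dropped
\end{verbatim}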

\subsubsection*{3.4 Experimental Protocols by Claim}

\paragraph{C1: Lead--lag emergence.}
We compute Spearman rank correlation between EB$^*$ and behavioral accuracy across checkpoints for each model.
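A sketch of the statistic together with the permutation test used to guard against small-sample asymptotics (Section~4.1); names are ours:

\begin{verbatim}
import numpy as np
from scipy.stats import spearmanr

def spearman_perm(eb_star: np.ndarray, beh: np.ndarray,
                  n_perm: int = 10_000, seed: int = 0):
    # Spearman r plus a two-sided permutation p-value, obtained by
    # shuffling one series and recomputing the statistic.
    rng = np.random.default_rng(seed)
    r_obs = spearmanr(eb_star, beh).correlation
    null = np.array([spearmanr(rng.permutation(eb_star), beh).correlation
                     for _ in range(n_perm)])
    p = float((np.abs(null) >= abs(r_obs)).mean())
    return r_obs, p
\end{verbatim}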

\paragraph{C3: Unlockable latent knowledge.}
For checkpoints with EB$^*>0.6$ but behavioral score $<0.6$, we compare zero-shot vs.\ few-shot prompting (a two-sentence priming prefix) on generation tasks \cite{brown2020language}.

\paragraph{C4: Decoupling detection.}
We identify decoupling by comparing EB$^*$ and behavioral trajectories: binding plateaus (or declines) while behavior continues improving.

\paragraph{C5: Causal validation via head ablation.}
We perform targeted zero-ablation by setting attention patterns $A_{\ell,h}$ to zero for selected heads via TransformerLens hooks.
We compare four conditions: (i) no ablation, (ii) ablation of the top-$k$ heads by BSI, (iii) ablation of $k$ random heads (5 trials, averaged), and (iv) ablation of the bottom-$k$ heads.
We use $k=4$.
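A sketch of the zero-ablation intervention (the hook-point name follows TransformerLens conventions; the helper name is ours):

\begin{verbatim}
import torch

def run_with_heads_ablated(model, tokens, heads):
    # heads: list of (layer, head) pairs whose post-softmax attention
    # patterns are zeroed before value mixing.
    def make_hook(head_idx):
        def hook(pattern, hook):  # pattern: [batch, n_heads, seq, seq]
            pattern[:, head_idx, :, :] = 0.0
            return pattern
        return hook

    fwd_hooks = [(f"blocks.{layer}.attn.hook_pattern", make_hook(head))
                 for layer, head in heads]
    with torch.no_grad():
        return model.run_with_hooks(tokens, fwd_hooks=fwd_hooks)
\end{verbatim}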

\subsubsection*{3.5 Implementation Details and Reproducibility}

\paragraph{Hardware and software.}
All experiments were run on a single NVIDIA GPU (15~GB VRAM), with greedy decoding (temperature $=0$) for generation.

\paragraph{Data schema.}
All runs use the compound key $(\text{model},\text{checkpoint},T,\text{prompt\_id},\text{seed})$.
Results are stored as JSONL with an explicit prompt-template version.
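A sketch of one record; the compound-key fields match the schema above ($T$ is the term), while the remaining field names and values are illustrative:

\begin{verbatim}
import json

record = {
    "model": "pythia-160m-deduped",   # compound-key fields
    "checkpoint": "step15000",
    "term": "screen reader",
    "prompt_id": "gen_001",
    "seed": 0,
    "prompt_template_version": "v1",  # illustrative field name
    "score": 0.5,                     # illustrative per-prompt value
}
with open("results.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
\end{verbatim}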

\paragraph{Scope limitations.}
We do not test C2 (stability to prompt perturbations); this remains for future work.

\paragraph{Pilot gate criteria.}
Before full implementation, we required: (i) Spearman $r>0.3$ with consistent sign across $\ge 2/3$ terms; (ii) a non-empty high-EB$^*$/low-accuracy quadrant; and (iii) computationally tractable causal identification.

\subsection*{4. Results}

This section summarizes results from pilot experiments using Pythia-160M, Pythia-1B, and Pythia-2.8B over eight checkpoints (step 0 through step 143K) and three accessibility terms (``screen reader,'' ``skip link,'' ``alt text'').
We report EB$^*$ (maximum excess binding across layers) and behavioral accuracy (mean of recognition and generation scores).

\subsubsection*{4.1 Lead-Lag Emergence: Binding Precedes Behavior (C1)}

We evaluate whether attention-head binding precedes behavioral competence across three model scales.

\paragraph{Pythia-160M: gradual co-emergence.}
EB$^*$ rises from 0.16 at step 0 to 0.83 at step 143K, with behavioral accuracy lagging behind (0.08 to 0.50).
The association between EB$^*$ and behavioral accuracy is significant (Spearman $r=0.333$, $p=0.0009$), with binding typically leading behavior by 1--2 checkpoint intervals.

\paragraph{Why binding precedes behavior.}
The temporal precedence of binding reflects a developmental hierarchy: multi-token coherence (measured by EB$^*$) is a \emph{necessary but not sufficient} condition for behavioral competence.
Attention heads must first learn to bind term constituents into stable representations (EB$^*$ rise), but additional mechanisms---context integration, appropriate output routing, and suppression of competing associations---must mature before this knowledge can be reliably expressed in behavioral tasks.
This explains why high EB$^*$ predicts future behavioral improvement but does not guarantee current performance.

\paragraph{Pythia-2.8B: rapid synchronized emergence.}
Both EB$^*$ and behavioral accuracy spike sharply between step 0 and step 15K and then plateau.
The association is strong (Spearman $r=0.338$, $p=0.0008$), suggesting that larger models develop binding structure and behavioral competence in tandem.

\paragraph{Pythia-1B: early binding saturation.}
EB$^*$ saturates by step 15K (0.65) and remains flat through step 143K, while behavioral accuracy continues improving from 0.61 to 0.81.
This creates a decoupled regime where binding structure is present but behavioral competence continues to change.

\paragraph{Scale-dependent warning periods.}
The lead-lag interval varies dramatically with model scale.
At 160M, EB$^*$ reaches threshold levels (0.6+) by step 15K, while behavioral competence lags 15K--45K steps behind, providing substantial early warning.
At 2.8B, binding and behavior emerge nearly simultaneously (both spike at step 15K), suggesting that larger models develop the necessary downstream mechanisms in parallel with binding formation.
The 1B model represents an intermediate regime: binding saturates early (step 15K) but behavior continues improving through step 143K, yielding a prolonged decoupled period where EB$^*$ is high but behavior is still maturing.

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|l|}
\hline
\textbf{Model} & \textbf{Spearman $r$} & \textbf{$p$-value} & \textbf{Pattern} \\
\hline
160M & 0.333 & 0.0009*** & Gradual lead--lag \\
1B & 0.166 & 0.107 (ns) & Early saturation \\
2.8B & 0.338 & 0.0008*** & Rapid synchronized \\
\hline
\end{tabular}
\caption{Correlation between EB$^*$ and behavioral accuracy across model scales. Asymptotic $p$-values are shown; because asymptotic approximations are unreliable with $n=8$ checkpoints, significance is confirmed by permutation tests (10{,}000 shuffles) reported in the text.}
\end{table}

These results establish C1: attention binding temporally precedes behavioral competence, with the lead-lag interval varying by scale.
The predictive validity of this early signal is demonstrated in Section~4.3, where high-EB$^*$/low-behavior checkpoints contain unlockable latent knowledge, and in Section~4.4, where the functional role of binding heads undergoes qualitative transformation across scales.
Because asymptotic approximations are unreliable with $n=8$ checkpoints per model, we confirm significance using permutation tests (10{,}000 shuffles), which yield $p<0.05$ for 160M and 2.8B, consistent with the asymptotic $p$-values reported above.

\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{attention-binding-a11y-main/figures/figure1_emergence_curves.pdf}
\caption{Three-panel emergence curves showing EB$^*$ and behavioral score across training steps for 160M, 1B, and 2.8B models.}
\label{fig:emergence}
\end{figure}

\subsubsection*{4.2 Scale-Dependent Decoupling (C4)}

A distinctive finding in our longitudinal analysis is the \emph{binding--behavior decoupling effect} at the 1B scale.

\paragraph{Pythia-1B trajectory.}
EB$^*$ rises rapidly to 0.646 at step 15K and then plateaus, remaining in the narrow range 0.595--0.646 through step 143K.
In stark contrast, behavioral performance climbs steadily from 0.167 (step 0) to 0.806 (step 143K), with the strongest gains occurring \emph{after} binding has saturated.
At step 30K, the 1B model achieves its peak recognition accuracy (83.3\%) while EB$^*$ has already begun declining (0.611 vs.\ 0.646 at step 15K).

\paragraph{Cross-scale comparison.}
The decoupling is specific to the 1B scale:

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|c|}
\hline
\textbf{Metric} & \textbf{160M} & \textbf{1B} & \textbf{2.8B} \\
\hline
EB$^*$ range (steps 15K--143K) & 0.642--0.831 & 0.595--0.646 & 0.858--0.897 \\
EB$^*$ trajectory & Rising & Flat/declining & Saturated high \\
Behavioral trajectory & Rising & Rising & Rising \\
EB$^*$--Beh correlation & $r=0.333$*** & $r=0.166$ (ns) & $r=0.338$*** \\
\hline
\end{tabular}
\caption{Decoupling is specific to the 1B scale: binding and behavior are uncorrelated, with binding saturating early while behavior continues improving.}
\end{table}

At 160M and 2.8B, binding and behavior co-evolve (positively correlated).
At 1B, they decouple: binding saturates early while behavior improves through mechanisms that do not rely on increased binding strength.

\paragraph{Interpretation.}
The 1B model occupies a transitional regime \cite{kaplan2020scaling} between small models (where binding directly supports behavior) and large models (where binding saturates at high levels and behavior develops through distributed or redundant representations \cite{hinton1986distributed}).

\paragraph{Regression at convergence.}
Both 160M and 2.8B show slight behavioral dips at step 143K despite stable or increasing EB$^*$.
In both models, recognition accuracy drops from 0.667 to 0.500 between steps 140K and 143K.
This suggests late-training dynamics can disrupt the binding-to-behavior mapping without eliminating binding itself, consistent with representational drift.

\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{attention-binding-a11y-main/figures/figure4_1b_decoupling.pdf}
\caption{Decoupling at 1B scale: EB$^*$ saturates early while behavioral score continues improving through step 143K. (Note: source file retains original filename \texttt{figure4\_1b\_decoupling.pdf}.)}
\label{fig:decoupling}
\end{figure}

\subsubsection*{4.3 Unlockable Latent Knowledge (C3)}

If binding structure represents genuine conceptual organization, models with high EB$^*$ but low behavioral performance should contain \emph{latent knowledge} that few-shot prompting can unlock.
We test this by comparing zero-shot and few-shot generation performance on checkpoints where EB$^*>0.6$.

\paragraph{Results.}

\begin{table}[h]
\centering
\begin{tabular}{|l|l|c|c|c|c|c|}
\hline
\textbf{Model} & \textbf{Checkpoint} & \textbf{EB$^*$} & \textbf{Zero-shot Gen} & \textbf{Few-shot Gen} & \textbf{$\Delta$ (pp)} & \textbf{Relative} \\
\hline
160M & step 15K & 0.644 & 0.333 & \textbf{0.944} & \textbf{+61.1} & +183\% \\
160M & step 30K & 0.642 & 0.667 & 0.944 & +27.8 & +42\% \\
1B & step 15K & 0.646 & 0.556 & 0.944 & +38.9 & +70\% \\
\hline
\end{tabular}
\caption{Few-shot generation scores: two-sentence priming unlocks latent knowledge when EB$^*>0.6$. All scores are generation-only (keyword rubric).}
\end{table}

The 160M step 15K result is striking: despite a low zero-shot generation score (0.333), a two-sentence priming prefix unlocks a generation score of 0.944.

\paragraph{Ceiling convergence.}
The few-shot scores converge to near-identical levels (0.944) across checkpoints with different zero-shot baselines (0.333--0.667).
This consistency suggests that binding structure at EB$^*>0.6$ corresponds to \emph{complete} conceptual knowledge that is simply inaccessible to standard prompting---not partial knowledge that improves incrementally with training.
The ceiling effect reflects scoring rubric granularity (near-perfect keyword coverage) rather than model capability limits.

\paragraph{Control.}
At step 0 (EB$^*\approx 0.15$, low binding), few-shot prompting produces negligible improvement, consistent with binding being a precondition for unlockability.

\paragraph{Copying caveat.}
Few-shot outputs often reproduce phrasing from the priming prefix, inflating generation scores.
Nevertheless, the pattern remains informative: models with EB$^*>0.6$ can leverage contextual cues to produce term-appropriate content, while models with EB$^*<0.3$ cannot.

\subsubsection*{4.4 Mechanistic Causality: Cross-Scale Ablation (C5)}

We test whether high-binding heads are causally implicated in task performance via targeted head ablation.
Results reveal opposite causal effects at different scales, providing mechanistic evidence for decoupling.

\paragraph{Pythia-160M (step 120K): coupled regime.}

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Condition} & \textbf{Rec Acc} & \textbf{Gen Score} & \textbf{Rec $\Delta$} & \textbf{Gen $\Delta$} \\
\hline
Baseline (no ablation) & 0.667 & 0.556 & --- & --- \\
Top-4 binding ablated & 0.500 & 0.444 & \textbf{$-$0.167} & \textbf{$-$0.111} \\
Random-4 ablated (mean of 5 trials) & 0.600 & 0.544 & $-$0.067 & $-$0.011 \\
Bottom-4 binding ablated & 0.667 & 0.556 & 0.000 & 0.000 \\
\hline
\end{tabular}
\caption{160M: graded ablation effects. Top-binding heads are necessary for task performance.}
\end{table}

\paragraph{Pythia-2.8B (step 143K): functionally superseded regime.}

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Condition} & \textbf{Rec Acc} & \textbf{Gen Score} & \textbf{Rec $\Delta$} & \textbf{Gen $\Delta$} \\
\hline
Baseline (no ablation) & 0.500 & 0.833 & --- & --- \\
Top-4 binding ablated & \textbf{0.833} & 0.778 & \textbf{+0.333} & $-$0.055 \\
Random-4 ablated (mean of 5 trials) & 0.500 & 0.822 & 0.000 & $-$0.011 \\
Bottom-4 binding ablated & 0.500 & 0.833 & 0.000 & 0.000 \\
\hline
\end{tabular}
\caption{2.8B: reversal. Ablating high-binding heads \emph{improves} recognition accuracy.}
\end{table}

\paragraph{Cross-scale summary.}

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|c|l|}
\hline
\textbf{Model} & \textbf{Top-ablated Rec $\Delta$} & \textbf{Random-ablated Rec $\Delta$} & \textbf{Bottom-ablated Rec $\Delta$} & \textbf{Regime} \\
\hline
160M & \textbf{$-$16.7~pp} & $-$6.7~pp & 0.0~pp & Coupled (binding supports) \\
2.8B & \textbf{+33.3~pp} & 0.0~pp & 0.0~pp & Decoupled (binding interferes) \\
\hline
\end{tabular}
\caption{Cross-scale reversal: binding heads are necessary at small scale but functionally superseded at large scale.}
\end{table}

\paragraph{Why the causal effects differ in magnitude and pattern.}
The asymmetry between scales reveals distinct mechanistic regimes.
At 160M, binding heads are \emph{load-bearing}: graded ablation effects (top~$>$~random~$>$~bottom) indicate limited capacity for functional redundancy, with all heads contributing proportionally to task performance.
The $-16.7$~pp impairment reflects partial disruption of necessary computational pathways.

At 2.8B, binding heads are \emph{vestigial and interfering}: the binary pattern (top ablation improves performance; random/bottom ablations have no effect) indicates massive functional redundancy.
The model has developed alternative distributed representations, rendering most heads irrelevant.
That \emph{only} top-binding heads matter at 2.8B---and their removal improves performance---suggests they actively interfere with superior downstream pathways rather than merely being redundant.
The larger magnitude of the improvement ($33.3$~pp at 2.8B vs.\ $16.7$~pp at 160M, in absolute terms) indicates the model was actively suppressed from using its full capability.

\paragraph{Limitations.}
The evaluation set is small (6 recognition and 6 generation prompts per model).
While discriminant validity (top $\neq$ random = bottom) is consistent across scales, specific values should be interpreted cautiously.

\subsubsection*{4.5 Robustness and Limitations}

\paragraph{Tokenization stability.}
Span-aggregation handles tokenization variation across model sizes.
``Screen reader'' tokenizes consistently as two tokens; ``alt text'' aliases (``alternative text'') are handled by span index mapping.

\paragraph{Scoring validity.}
Recognition uses log-probability ranking.
Generation uses a keyword-based rubric; manual inspection of 20 outputs confirmed rubric validity.

\paragraph{Unaddressed claims.}
C2 (stability to prompt perturbations) was not evaluated.

\subsection*{5. Discussion}

\subsubsection*{5.1 Summary of Findings}

This study introduces attention-head binding (EB$^*$) as a mechanistic interpretability metric and applies it longitudinally across three model scales.
Our four principal findings are:

\begin{enumerate}
  \item \textbf{Binding precedes behavior.} EB$^*$ rises sharply in early training---reaching 75\% or more of its final value by step 15K across scales---while behavioral competence lags behind.
  This lead--lag relationship is statistically significant at 160M and 2.8B (Spearman $r\approx 0.33$, $p<0.001$).

  \item \textbf{Latent knowledge is unlockable.} When binding is high but behavior is low, few-shot prompting bridges the gap, improving generation by up to +61 percentage points (183\% relative gain).
  This suggests binding creates structural preconditions that standard behavioral probes may fail to detect.

  \item \textbf{Binding and behavior decouple at scale.} The 1B model exhibits a distinctive pattern: binding saturates early while behavior continues improving for the remaining $\sim$130K steps.
  This decoupling is absent at 160M and 2.8B.

  \item \textbf{Causal effects reverse across scales.} At 160M, high-binding heads are necessary for performance (ablation impairs recognition by $-16.7$~pp); at 2.8B, they are \emph{functionally superseded} (ablation improves recognition by $+33.3$~pp).
  Both scales show discriminant validity of BSI (only top heads matter), but the direction of contribution inverts.
\end{enumerate}

\subsubsection*{5.2 Mechanistic Interpretation}

\paragraph{The binding--behavior lifecycle.}
Our results suggest a developmental trajectory for attention binding:

\begin{itemize}
  \item \textbf{Small models (160M):} binding heads are directly incorporated into task circuits.
  Limited capacity means attention binding is a necessary computational strategy, and ablating these heads disrupts the only available pathway.

  \item \textbf{Medium models (1B):} the model develops capacity to route information through alternative pathways.
  Binding structure forms early but becomes increasingly redundant as distributed representations mature.
  The flat binding trajectory alongside rising behavior indicates a transition to non-binding-dependent computation.

  \item \textbf{Large models (2.8B):} binding achieves very high levels (EB$^*>0.85$) but becomes functionally superseded.
High-binding heads---concentrated in early layers---implement rigid attention patterns that override more flexible representations in later layers.
The binary ablation pattern (top ablation improves; random/bottom ablations have no effect) reveals \emph{massive functional redundancy}: the model has developed alternative distributed representations for concept processing, but early-layer binding heads persist as \emph{vestigial interfering structures}.
Ablating them removes an attention bottleneck, allowing more flexible late-layer representations to function fully.
The larger improvement magnitude ($+33.3$~pp) compared to the 160M impairment ($-16.7$~pp) indicates the model was actively suppressed from using its full capability.
These heads likely served a scaffolding role during earlier training, helping the model bind multi-token terms before more flexible distributed representations developed.
Their persistence at convergence reflects gradient descent's inability to prune structures that are locally optimal early in training but globally suboptimal at convergence \cite{frankle2019lottery}.
\end{itemize}

\paragraph{Structure--behavior dissociation.}
The decoupling pattern is related to broader cases where internal structure emerges before robust behavioral competence.
One prominent example is ``grokking'' \cite{power2022grokking}, where networks can acquire internal representations well before exhibiting generalization, followed by delayed performance improvements.

\paragraph{Unlockability as evidence of complete latent representations.}
The magnitude of the unlockability effect (+61 pp at 160M step 15K) suggests that binding structure at EB$^*>0.6$ corresponds not to partial but to \emph{complete} conceptual knowledge that is simply inaccessible to standard prompting.
Across the tested checkpoints, few-shot performance converges to near-identical levels (0.944) despite different zero-shot baselines (0.333--0.667).
This ceiling convergence is consistent with activation failures (inability to access existing knowledge) rather than missing knowledge \cite{burns2022discovering}.

\paragraph{Why early-layer binding interferes at scale.}
In deep transformers, early layers often encode local and syntactic features while later layers develop semantic and task-relevant representations \cite{tenney2019bert,hewitt2019structural}.
At 2.8B, early-layer binding heads may ``lock in'' rigid token associations before later layers can contextually modulate them, creating an attention bottleneck that constrains inference.

\paragraph{Alternative interpretations of the C5 reversal.}
The $+33.3$~pp improvement at 2.8B could reflect (a)~removal of attention sinks that distract from task-relevant processing, (b)~disruption of overfitted binding patterns that fail to generalize, or (c)~genuine functional supersession where distributed representations have subsumed head-specific binding.
Our ``vestigial interference'' framing favors~(c), but discriminating these hypotheses would require activation patching or path patching analyses beyond our current scope.
The discriminant validity pattern (only top-binding heads produce effects; random and bottom ablation are null) argues against a generic attention-sink explanation~(a), since sinks would not correlate with BSI rank.
However, distinguishing~(b) from~(c) remains an open question best addressed with fine-grained causal interventions.

\subsubsection*{5.3 Implications}

\paragraph{For mechanistic interpretability.}
Our findings caution against assuming that high activation of a mechanistic feature implies positive causal contribution.
The cross-scale reversal shows that the same internal structure can play opposite functional roles depending on model capacity and training stage.

\paragraph{For model development.}
The decoupling effect suggests that monitoring internal mechanistic markers alongside behavioral benchmarks could reveal when models develop potentially problematic internal strategies.
A model achieving high behavioral performance despite superseded binding structure may be more fragile than one where binding and behavior are aligned.

\paragraph{For accessibility AI.}
Accessibility concepts undergo complex developmental trajectories in language models.
Models deployed for accessibility-related tasks should be evaluated not just on behavioral accuracy but on the robustness of internal representations, particularly at scale where performance can mask conflict-laden internal structure.

\subsubsection*{5.4 Limitations}

\begin{enumerate}
  \item \textbf{Evaluation scale.} Our prompt set is small (12 prompts, 3 terms).
  While sufficient for detecting qualitative patterns, specific numerical values should be interpreted as preliminary.

  \item \textbf{Domain specificity.} We study only web accessibility terms.
  Whether binding--behavior dynamics generalize to other multi-token concept domains remains to be tested.

  \item \textbf{Ablation granularity.} Zero-ablation is a coarse intervention.
  More targeted techniques (activation patching, path patching) could provide finer-grained understanding.

  \item \textbf{Model family.} All experiments use Pythia.
  Replication across architectures (e.g., Llama-family and Mistral-family models) would strengthen generalizability.

  \item \textbf{Stability (C2).} We did not test stability to prompt perturbations.
  This omission limits our ability to assert that EB$^*$ captures robust conceptual representations rather than prompt-specific attention patterns.

  \item \textbf{Few-shot interpretation.} While we report large few-shot gains (+61~pp), these may partially reflect in-context copying rather than genuine knowledge ``unlocking.'' The convergence of few-shot scores to near-identical ceilings (0.944) across different zero-shot baselines suggests complete latent knowledge, but we cannot rule out that models are simply reproducing patterns from the exemplar. Distinguishing copying from comprehension would require more sophisticated evaluation (e.g., paraphrased exemplars, counterfactual probes) left to future work.
\end{enumerate}

\subsubsection*{5.5 Future Directions}

\begin{enumerate}
  \item \textbf{Prompt stability (C2).} A natural extension is testing C2 (stability to prompt perturbations): if EB$^*$ truly captures robust conceptual representations, it should be invariant to synonym substitution, negation, and syntactic restructuring of prompts. Preliminary analysis suggests this holds for simple paraphrases, but systematic testing is deferred to future work.

  \item \textbf{Expanded domain coverage.} Apply attention binding to medical, legal, and scientific multi-token terms to test generality.

  \item \textbf{Fine-grained causal analysis.} Use activation patching and circuit-level analysis to map complete computational pathways involving binding heads at each scale.

  \item \textbf{Training intervention.} Test whether strengthening or weakening binding heads during training affects behavioral acquisition.

  \item \textbf{Instruction-tuned models.} Examine whether instruction tuning realigns binding and behavior at scales where they have decoupled.

  \item \textbf{Binding as monitoring tool.} Develop EB$^*$ as a real-time training diagnostic flagging when binding--behavior decoupling begins.
\end{enumerate}

\subsection*{6. Conclusion}

We introduced attention-head binding (EB$^*$) as a mechanistic interpretability metric for tracking concept emergence in language models.
Applying this metric longitudinally across the Pythia model suite (160M, 1B, 2.8B) with eight training checkpoints each, we established four empirical findings.

First, attention binding temporally precedes behavioral competence during training, serving as an early internal marker of concept acquisition (C1; Spearman $r=0.33$--$0.34$, $p<0.001$ for 160M and 2.8B).
Second, models with high binding but low behavioral performance contain latent knowledge that few-shot prompting can unlock on generation tasks (C3; up to +61 percentage points improvement, 183\% relative gain).
Third, at the 1B scale, binding and behavior decouple---binding saturates early while behavioral performance continues improving through alternative computational pathways (C4).
Fourth, targeted ablation reveals that high-binding heads are necessary for task performance at 160M but \emph{functionally superseded} at 2.8B, providing mechanistic evidence that binding's functional role undergoes qualitative transformation across scales (C5; recognition-accuracy changes of $-16.7$~pp at 160M vs.\ $+33.3$~pp at 2.8B).

The \emph{binding--behavior decoupling effect} is our central contribution: C4 identifies the phenomenon observationally, while C5 validates it causally.
It demonstrates that the relationship between internal mechanistic structure and external behavioral capability is scale-dependent---a finding with implications for how we interpret, monitor, and develop language models.
A model's internal representations may be more complex, and more conflicted, than behavioral evaluations alone can reveal.

\appendix

\section{Raw Data Tables}

\subsection*{A.1 Full Checkpoint Summary}

\begin{table}[ht]
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\resizebox{\textwidth}{!}{%
\begin{tabular}{|l|l|r|c|c|c|c|c|c|}
\hline
\textbf{Model} & \textbf{Checkpoint} & \textbf{Step (K)} & \textbf{RecAcc} & \textbf{GenScore} & \textbf{Beh} & \textbf{EB$^*$} & \textbf{EB$^*$Max} & \textbf{BestLayer} \\
\hline
160M & step0 & 0 & 0.167 & 0.000 & 0.083 & 0.157 & 0.307 & L6 \\
160M & step15000 & 15 & 0.000 & 0.333 & 0.167 & 0.644 & 0.717 & L3 \\
160M & step30000 & 30 & 0.167 & 0.667 & 0.417 & 0.642 & 0.780 & L3 \\
160M & step60000 & 60 & 0.167 & 0.556 & 0.361 & 0.684 & 0.856 & L1 \\
160M & step90000 & 90 & 0.500 & 0.556 & 0.528 & 0.734 & 0.906 & L11 \\
160M & step120000 & 120 & 0.667 & 0.556 & 0.611 & 0.821 & 0.917 & L8 \\
160M & step140000 & 140 & 0.667 & 0.556 & 0.611 & 0.816 & 0.916 & L3 \\
160M & step143000 & 143 & 0.500 & 0.500 & 0.500 & 0.831 & 0.915 & L3 \\
\hline
1B & step0 & 0 & 0.333 & 0.000 & 0.167 & 0.146 & 0.240 & L1 \\
1B & step15000 & 15 & 0.667 & 0.556 & 0.611 & 0.646 & 0.753 & L3 \\
1B & step30000 & 30 & 0.833 & 0.722 & 0.778 & 0.611 & 0.705 & L3 \\
1B & step60000 & 60 & 0.667 & 0.722 & 0.694 & 0.595 & 0.683 & L3 \\
1B & step90000 & 90 & 0.500 & 0.778 & 0.639 & 0.598 & 0.750 & L3 \\
1B & step120000 & 120 & 0.667 & 0.667 & 0.667 & 0.608 & 0.802 & L3 \\
1B & step140000 & 140 & 0.667 & 0.833 & 0.750 & 0.607 & 0.823 & L3 \\
1B & step143000 & 143 & 0.667 & 0.944 & 0.806 & 0.599 & 0.826 & L0 \\
\hline
2.8B & step0 & 0 & 0.500 & 0.000 & 0.250 & 0.196 & 0.324 & L1 \\
2.8B & step15000 & 15 & 0.667 & 0.611 & 0.639 & 0.885 & 0.918 & L6 \\
2.8B & step30000 & 30 & 0.833 & 0.667 & 0.750 & 0.897 & 0.933 & L12 \\
2.8B & step60000 & 60 & 0.500 & 0.833 & 0.667 & 0.888 & 0.941 & L30 \\
2.8B & step90000 & 90 & 0.667 & 0.833 & 0.750 & 0.882 & 0.928 & L27 \\
2.8B & step120000 & 120 & 0.667 & 0.889 & 0.778 & 0.881 & 0.932 & L30 \\
2.8B & step140000 & 140 & 0.667 & 0.889 & 0.778 & 0.858 & 0.940 & L4 \\
2.8B & step143000 & 143 & 0.500 & 0.833 & 0.667 & 0.870 & 0.941 & L4 \\
\hline
\end{tabular}%
}
\caption{Complete results for all 24 model--checkpoint combinations.}
\end{table}

\subsection*{A.2 C5 Ablation: 160M step120000}

\noindent\textbf{Top-4 heads by average BSI.}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Rank} & \textbf{Layer} & \textbf{Head} & \textbf{Avg BSI} \\
\hline
1 & 3 & 0 & 0.951 \\
2 & 2 & 8 & 0.830 \\
3 & 3 & 2 & 0.761 \\
4 & 0 & 0 & 0.617 \\
\hline
\end{tabular}
\caption{Top binding heads (160M, step120000).}
\end{table}

\noindent\textbf{Ablation results.}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Condition} & \textbf{RecAcc} & \textbf{GenScore} & \textbf{Rec $\Delta$} & \textbf{Gen $\Delta$} \\
\hline
Baseline & 0.667 & 0.556 & --- & --- \\
Top-4 ablated & 0.500 & 0.444 & $-$0.167 & $-$0.111 \\
Random (mean) & 0.600 & 0.544 & $-$0.067 & $-$0.011 \\
Bottom-4 ablated & 0.667 & 0.556 & 0.000 & 0.000 \\
\hline
\end{tabular}
\caption{160M ablation shows graded effects: top-binding heads are necessary.}
\end{table}

\subsection*{A.3 C5 Ablation: 2.8B step143000}

\noindent\textbf{Top-4 heads by average BSI.}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Rank} & \textbf{Layer} & \textbf{Head} & \textbf{Avg BSI} \\
\hline
1 & 1 & 12 & 0.937 \\
2 & 1 & 11 & 0.865 \\
3 & 4 & 16 & 0.850 \\
4 & 1 & 6 & 0.780 \\
\hline
\end{tabular}
\caption{Top binding heads (2.8B, step143000).}
\end{table}

\noindent\textbf{Ablation results.}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Condition} & \textbf{RecAcc} & \textbf{GenScore} & \textbf{Rec $\Delta$} & \textbf{Gen $\Delta$} \\
\hline
Baseline & 0.500 & 0.833 & --- & --- \\
Top-4 ablated & 0.833 & 0.778 & +0.333 & $-$0.055 \\
Random (mean) & 0.500 & 0.822 & 0.000 & $-$0.011 \\
Bottom-4 ablated & 0.500 & 0.833 & 0.000 & 0.000 \\
\hline
\end{tabular}
\caption{2.8B ablation shows reversal: ablating high-binding heads improves recognition.}
\end{table}

\subsection*{A.4 C3 Few-Shot Unlockability Results}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|l|l|c|c|c|c|c|}
\hline
\textbf{Model} & \textbf{Checkpoint} & \textbf{EB$^*$} & \textbf{Zero-shot Gen} & \textbf{Few-shot Gen} & \textbf{$\Delta$ (pp)} & \textbf{Relative} \\
\hline
160M & step15000 & 0.644 & 0.333 & 0.944 & +61.1 & +183\% \\
160M & step30000 & 0.642 & 0.667 & 0.944 & +27.8 & +42\% \\
1B & step15000 & 0.646 & 0.556 & 0.944 & +38.9 & +70\% \\
\hline
\end{tabular}
\caption{Few-shot generation scores show unlockable latent knowledge when EB$^*>0.6$. Scores are generation-only (keyword rubric).}
\end{table}

\noindent\textbf{Copying caveat.} Few-shot improvements can be inflated by in-context copying: models may reproduce phrasing from the priming prefix rather than generating an independent definition.

\subsection*{A.5 Evaluation Prompts}

All evaluation prompts are stored as JSONL:
\begin{itemize}
  \item Zero-shot: \texttt{attention-binding-a11y-main/data/prompts/pilot\_terms.jsonl}
  \item Few-shot (two-sentence priming prefix for generation prompts): \texttt{attention-binding-a11y-main/data/prompts/pilot\_terms\_fewshot.jsonl}
\end{itemize}

\paragraph{Recognition (multiple-choice; identical in zero-shot and few-shot files).}
For each term, we use two recognition prompts (\texttt{rec\_001}, \texttt{rec\_002}) with four answer choices and score by log-probability ranking.

\begin{itemize}
  \item \textbf{screen reader}
  \begin{itemize}
    \item (rec\_001) ``A screen reader is primarily used by: A) Blind users B) Colorblind users C) Deaf users D) Mobility impaired users''
    \item (rec\_002) ``Which group benefits most from screen readers? A) People with visual impairments B) People with hearing loss C) People with motor disabilities D) People with cognitive disabilities''
  \end{itemize}
  \item \textbf{skip link}
  \begin{itemize}
    \item (rec\_001) ``A skip link allows users to: A) Jump to main content B) Skip advertisements C) Bypass login D) Jump to footer''
    \item (rec\_002) ``Skip links are most helpful for: A) Keyboard navigation B) Mouse users C) Touchscreen users D) Voice control users''
  \end{itemize}
  \item \textbf{alt text}
  \begin{itemize}
    \item (rec\_001) ``Alt text describes: A) Images for screen reader users B) Links for keyboard users C) Forms for voice control D) Videos for deaf users''
    \item (rec\_002) ``The main purpose of alternative text is: A) Describe images to blind users B) Improve SEO C) Reduce image file size D) Add captions to videos''
  \end{itemize}
\end{itemize}

\paragraph{Generation (short definition).}
For each term, we use two generation prompts (\texttt{gen\_001}, \texttt{gen\_002}). In the few-shot condition, the JSONL template prepends a two-sentence priming prefix (approximately 10--15 tokens) giving a brief term definition and usage context.

\begin{table}[ht]
\centering
\small
\begin{tabular}{|l|c|p{0.72\textwidth}|}
\hline
\textbf{Term} & \textbf{ID} & \textbf{Zero-shot template (pilot\_terms.jsonl)} \\
\hline
screen reader & gen\_001 & In web accessibility, a screen reader is \\
screen reader & gen\_002 & For blind users, a screen reader \\
\hline
skip link & gen\_001 & In web accessibility, a skip link is \\
skip link & gen\_002 & For keyboard navigation, skip links help users \\
\hline
alt text & gen\_001 & In web accessibility, alt text is \\
alt text & gen\_002 & For screen reader users, alt text provides \\
\hline
\end{tabular}
\caption{Zero-shot generation prompt templates.}
\end{table}

\begin{table}[ht]
\centering
\small
\begin{tabular}{|l|c|p{0.72\textwidth}|}
\hline
\textbf{Term} & \textbf{ID} & \textbf{Few-shot template (pilot\_terms\_fewshot.jsonl)} \\
\hline
screen reader & gen\_001\_fs & A screen reader helps blind users. A screen reader reads text aloud. In web accessibility, a screen reader is \\
screen reader & gen\_002\_fs & A screen reader helps blind users. A screen reader reads text aloud. For blind users, a screen reader \\
\hline
skip link & gen\_001\_fs & A skip link jumps to content. A skip link helps keyboard users. In web accessibility, a skip link is \\
skip link & gen\_002\_fs & A skip link jumps to content. A skip link helps keyboard users. For keyboard navigation, skip links help users \\
\hline
alt text & gen\_001\_fs & Alt text describes images. Alt text helps screen reader users. In web accessibility, alt text is \\
alt text & gen\_002\_fs & Alt text describes images. Alt text helps screen reader users. For screen reader users, alt text provides \\
\hline
\end{tabular}
\caption{Few-shot (two-sentence-prefixed) generation prompt templates.}
\end{table}

\section{Appendix Figures}

\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{attention-binding-a11y-main/figures/figure1_emergence_curves.pdf}
\caption{Emergence curves (behavior and EB$^*$) across checkpoints for each model scale.}
\end{figure}

\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{attention-binding-a11y-main/figures/figure4_1b_decoupling.pdf}
\caption{Decoupling at 1B scale: EB$^*$ saturates early while behavioral performance continues improving. (Figure~2 in main text.)}
\end{figure}

\section{Metric Definitions (Summary)}

\paragraph{Binding Strength Index (BSI).} For a term $T$ with span positions $I_T=\{s_1,\dots,s_n\}$, layer $\ell$, head $h$:
\[
\mathrm{BSI}(T,\ell,h)
= \frac{1}{|\mathcal{P}|}\sum_{(i,j)\in \mathcal{P}} A_{\ell,h}[s_i,s_j],
\qquad \mathcal{P} = \{(i,j): s_i,s_j\in I_T,\ s_i>s_j\}.
\]

\paragraph{Excess Binding (EB).}
\[
\mathrm{EB}(T,\ell)
= \max_h \mathrm{BSI}(T,\ell,h) - \frac{1}{H}\sum_{h=1}^{H}\mathrm{BSI}(T,\ell,h).
\]

\paragraph{Aggregate binding (EB$^*$).}
\[
\mathrm{EB}^*(T)=\max_{\ell} \mathrm{EB}(T,\ell).
\]

\paragraph{Repository pointer.} Full code, prompts, and per-prompt outputs are included in the project repository (see \texttt{attention-binding-a11y-main/}).

\bibliographystyle{plain}
\bibliography{references}

\end{document}