\documentclass[pmlr]{jmlr}
\usepackage{longtable}
\usepackage{booktabs}
\usepackage[load-configurations=version-1]{siunitx}
\usepackage{graphicx}
\usepackage{pifont} 
\usepackage{wrapfig}
% In your preamble
\newcommand{\NCLMCTT}{\textbf{N}eural \textbf{C}odec \textbf{L}anguage \textbf{M}odel for \textbf{C}ontrollable \textbf{T}imbre \textbf{T}ransfer (\textbf{NCLMCTT})}

\jmlrvolume{303}
\jmlryear{2026}
\jmlrworkshop{EAIM2026 at AAAI}

\title[Neural Codec Language Model for Timbre Transfer]{Neural Codec Language Model for Controllable Timbre Transfer in Music Synthesis}

\author{\Name{Sheldon Liu} \Email{shilong@amazon.com}\\
\Name{Tianyu Liu} \Email{xyzliu@amazon.com} \\
\Name{Deepak Dalakoti} \Email{dalakoti@amazon.com} \\ 
\Name{Adithya Suresh} \Email{adxthya@amazon.com} \\
\Name{Yueying Teng} \Email{yyteng@amazon.com} \\
\Name{Xuefeng Liu} \Email{liuxuefe@amazon.com} \\
\Name{Atanu Roy} \Email{atanuroy@amazon.com} \\
\addr Amazon Web Services Australia Pty. Ltd., 2 Park Street, NSW, Australia
\AND
\Name{Randeep Bhatia} \Email{randeep@splashmusic.com} \\
\Name{Daniel Hatadi} \Email{danielh@splashmusic.com} \\
\Name{Prabhjeet Ghuman} \Email{prabh@splashmusic.com} \\
\addr Splash, Brisbane, QLD, Australia
}

% \editors{}

\begin{document}

\maketitle

\begin{abstract}
Neural codec language models have revolutionized speech synthesis but face significant challenges when adapted to music generation, particularly in achieving precise timbre control while preserving melodic content. We introduce \NCLMCTT, a novel architecture that enables zero-shot instrument cloning through direct audio conditioning without explicit timbre learning. Our approach combines a 385M-parameter transformer for coarse musical structure modeling with a specialized upsampler for fine timbral detail, achieving flexible control through 1-5 second reference audio segments. We establish the first comprehensive benchmark dataset for controllable timbre transfer evaluation, comprising 62,500 high-fidelity samples across 50 synthesizer presets with ground truth targets. Extensive experiments demonstrate substantial improvements over the TokenSynth baseline: 27.1\% reduction in SI-SDR, 50.9\% in Mel Distance, and 59.4\% in STFT Distance, while maintaining strong melodic coherence (Chroma Similarity: 0.85). Our method achieves robust zero-shot generalization, with performance on unseen instrument presets matching that of seen presets. Ablation studies confirm that extended reference audio duration (40.8\% improvement), cross-attention mechanisms (11.9\% improvement), and increased model capacity contribute meaningfully to overall performance. By separating melodic content from timbral characteristics and enabling implicit timbre control, NCLMCTT provides both immediate practical value for music creators and a methodological foundation for advancing controllable neural audio synthesis.
\end{abstract}

\begin{keywords}
Neural codec language models, Timbre transfer, Controllable music synthesis, Zero-shot generalization, Audio generation
\end{keywords}

\section{Introduction}
\label{sec:intro}

The democratization of music creation through AI has reached a critical juncture. While commercial systems like Suno and Udio generate impressive musical compositions~\cite{nugroho2024use}, the field lacks precise controllable synthesis mechanisms that separate melodic content from timbral characteristics. Current text-to-music models struggle with fine-grained instrument control due to natural language ambiguity~\cite{schneider2024mousai,agostinelli2023musiclm}.

Neural codec language models revolutionized speech synthesis by treating audio generation as discrete token prediction~\cite{wang2023neural}. Recent efforts to adapt these models to music generation show promise but face critical challenges: existing approaches either rely on token-level manipulation without explicit conditioning mechanisms or require pretrained timbre encoders that limit flexibility. The need for music-specific architectures that can leverage the efficiency of neural codec models while providing precise timbre control remains largely unaddressed.

The evaluation crisis compounds these challenges. Current metrics like Fréchet Audio Distance (FAD) show poor correlation with human judgment, while commercial systems now outperform reference datasets~\cite{grotschla2025benchmarking}. The lack of standardized protocols for controllable synthesis has hindered rigorous comparison and slowed progress in the field.

Our contributions advance neural codec language models for music synthesis across technical and methodological dimensions. We introduce \NCLMCTT{} (Fig.~\ref{fig:NCLMCTT}), featuring flexible control signal durations and zero-shot instrument control through direct audio conditioning. We establish the first comprehensive benchmark dataset for controllable timbre transfer evaluation, comprising 62,500 high-fidelity samples with standardized metrics.
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{imgs/NCLMCTT_v2.png}
\caption{NCLMCTT architecture for controllable timbre transfer. The pipeline processes MIDI input and control signals through four stages: (1) Dataset augmentation and DAC conversion, (2) Feature extraction from MIDI, control (reference audio), and melody (source audio) into tokenized representations, (3) Embedding with positional encoding, and (4) Transformer-based modeling with multi-head self-attention and cross-attention mechanisms.}
\label{fig:NCLMCTT}
\end{figure}
Our empirical validation demonstrates substantial improvements over the TokenSynth~\cite{kim2025tokensynth} baseline: 27.1\% reduction in Scale-Invariant Signal-to-Distortion Ratio (SI-SDR), 50.9\% in Mel Distance, and 59.4\% in Short-Time Fourier Transform (STFT) Distance, while maintaining strong melodic preservation. These improvements are achieved through music-specific architectural modifications including hierarchical token generation, explicit cross-attention mechanisms for timbre conditioning, and flexible reference audio duration from 1 to 5 seconds.

We provide open-source access to our curated dataset, evaluation protocols, and complete implementation, establishing a reproducible foundation for advancing controllable neural music synthesis research. By positioning neural codec language models as a bridge between symbolic music representation and audio synthesis, our work provides both immediate practical value for music creators and a methodological foundation for advancing controllable audio generation research.
\vspace{-5mm}
\section{Related Work}
\label{sec:related}

\textbf{Neural Codec Language Models}: VALL-E~\cite{wang2023neural} pioneered neural codec language models for speech synthesis, using EnCodec's hierarchical representations for zero-shot voice cloning with 3-second enrollment. VALL-E 2~\cite{chen2024vall} achieved human parity through repetition-aware sampling and grouped code modeling, reducing word error rates by 50\%. AudioLM~\cite{borsos2023audiolm} introduced semantic-acoustic decomposition using w2v-BERT and SoundStream tokens, enabling controllable synthesis without text transcripts but requiring computationally expensive cascaded models. \textbf{Text-to-Music Generation}: MusicLM~\cite{agostinelli2023musiclm} adapts AudioLM's hierarchical approach but struggles with precise instrument control due to language ambiguity. MusicGen~\cite{copet2023simple} revolutionized efficiency through single-stage architecture with token interleaving across EnCodec's 4 codebooks, achieving superior performance while reducing computational requirements. Moûsai~\cite{schneider2024mousai} employs latent diffusion with 64× compression for high-quality stereo generation but requires complex sampling procedures. \textbf{Controllable Music Synthesis and Audio Codecs}: NSynth~\cite{engel2017neural} established timbre control through 16-dimensional embeddings, while DDSP approaches~\cite{engel2020ddsp} enable interpretable control through synthesizer parameters. However, most methods require explicit timbre learning and lack zero-shot generalization. Recent audio codecs have achieved extreme compression: EnCodec~\cite{defossez2022high} provides 16-32× compression, Descript Audio Codec (DAC)~\cite{kumar2023high} achieves 90× ratios with superior quality, and WavTokenizer reduces audio to 40-75 tokens per second~\cite{ji2024wavtokenizer}. 
\textbf{Instrument Cloning}: TokenSynth~\cite{kim2025tokensynth} performs zero-shot polyphonic instrument cloning using CLAP-conditioned transformers that generate DAC tokens autoregressively, enabling text-guided timbre manipulation through cross-modal embedding interpolation. However, the method relies on pretrained CLAP embeddings for timbre conditioning, requiring explicit timbre representations learned during pretraining. In contrast, our approach achieves adaptive timbre transfer without explicit timbre learning, enabling more flexible generalization. We benchmark against TokenSynth as the most closely related work to our framework. \textbf{Evaluation and Our Approach}: Current evaluation methodologies suffer from significant limitations, with FAD showing poor correlation with human judgment and lack of standardized protocols for controllable synthesis~\cite{gui2024adapting,grotschla2025benchmarking}. Our work addresses these gaps through music-specific architectural components and audio-based conditioning for zero-shot generalization. Most importantly, we introduce the first comprehensive benchmark specifically designed for controllable timbre transfer, enabling systematic evaluation of both melodic preservation and timbral fidelity.
\vspace{-5mm}
\section{Problem Formulation}
\label{sec-problem-formulation}

\textbf{Task Definition:} We address timbre-conditioned melody synthesis in neural audio generation, where timbre control is derived directly from reference audio rather than text descriptions or predefined instrument categories. Given a dataset containing 1,250 MIDI melodies $\mathcal{M} = \{m_1, \ldots, m_{1250}\}$, 50 synthesizer presets $\mathcal{P} = \{p_1, \ldots, p_{50}\}$, and corresponding rendered waveforms $\mathcal{W} = \{w_{i,j} |\ i \in [1, 1250], j \in [1, 50]\}$, we formalize the task as learning a mapping function $f_{\theta}$ that transforms a MIDI melody $m_i$ and a reference audio snippet $c_{a,j}$ (1-5s crop from $w_{a,j}$) into a synthesized waveform:
\begin{equation}
\hat{w}_{i,j} = f_{\theta}(m_i, c_{a,j})
\end{equation}

\textbf{Timbre Transfer Mechanism:} The reference audio $c_{a,j}$ serves as the sole source of timbre information, enabling the model to extract and transfer timbral characteristics without relying on intermediate representations such as text embeddings or explicit instrument labels. When $a \neq i$, the control signal contains different melodic content compared to the input MIDI $m_i$, enabling adaptive timbre transfer without explicit timbre learning.

\textbf{Evaluation Framework:} Critically, this formulation enables objective evaluation by providing ground truth waveforms $w_{i,j}$ for direct comparison with generated outputs $\hat{w}_{i,j}$ across multiple metrics (see Section~\ref{sec:experiments} for details), eliminating the need for costly and potentially inconsistent human subjective evaluations that plague many timbre transfer and music generation benchmarks. We measure success through four complementary metrics: \textbf{SI-SDR} ($\downarrow$; reported negated, so lower is better) for time-domain fidelity, \textbf{MEL Distance} ($\downarrow$) for perceptual quality, \textbf{STFT Distance} ($\downarrow$) for time-frequency accuracy, and \textbf{Chroma Similarity} ($\uparrow$) for melodic preservation, ensuring both timbral fidelity and melodic accuracy are rigorously assessed.
\vspace{-3mm}
\section{Proposed Method}

\subsection{Preprocessing and Feature Extraction}
Our pipeline transforms 1,250 MIDI melodies and 50 synthesizer presets into 62,500 unique audio waveforms. Training triplets consist of $(mi_x, \mathbf{C}, me_x)$ where $mi_x$ is input MIDI melody, $\mathbf{C}$ is tiled control signal from potentially any melody using preset $x$, and $me_x$ is target waveform with preset consistency. DAC Encoder processes three modalities (MIDI files, complete audio, and cropped audio) into features with shape $(B, L, C)$, where $B$ is batch size, $L$ is number of codebooks (9 when using DAC), and $C$ is sequence length.

\subsection{Architecture Overview}

Our NCLMCTT architecture separates coarse musical structure generation from fine timbral detail synthesis. The first stage employs a 385M-parameter transformer-based LLM for autoregressive coarse codebook token generation, establishing musical structure while incorporating timbre conditioning. The second stage utilizes a specialized upsampling module for non-autoregressive fine token prediction, transforming coarse structure into high-fidelity audio.

\textbf{Stage 1 -- LLM for Coarse Token Generation}: Our LLM employs a decoder-only transformer ($L=24$ layers, $d_{\text{model}}=1024$, $h=16$ heads) designed for musical token prediction. The model autoregressively generates coarse codebook sequences:
\begin{equation}
p(z_{1:T}^1) = \prod_{t=1}^{T} p(z_t^1 | z_{<t}^1, m_i, c_{a,j})
\end{equation}
For stereo audio, we implement channel dependency modeling. Input tokens are processed through learnable embeddings with sinusoidal positional encodings. Our model incorporates control tokens enabling flexible conditioning, with cross-attention layers integrating timbre control signals with MIDI input. Training employs standard autoregressive language modeling with cross-entropy loss, while inference uses temperature scaling and top-$k$ filtering with Gumbel-max sampling.

\textbf{Stage 2 - Specialized Upsampling Module}: The upsampling module transforms coarse tokens into high-fidelity audio through non-autoregressive fine token prediction. Given coarse tokens $z_{1:T}^1$, the upsampler predicts fine tokens through:
\begin{equation}
P_{\theta}(\hat{z}^2_{1:T}, \ldots, \hat{z}^9_{1:T} | z^1_{1:T}) = \prod_{i=2}^{9} P_{\theta}(\hat{z}^i_{1:T} | z^1_{1:T}, \hat{z}^{2:i-1}_{1:T})
\end{equation}

The module incorporates three conditioning types: \textbf{Metrical Conditioning} using beat phase information, \textbf{Harmonic Conditioning} through pitch class histograms and root note embeddings, and \textbf{Channel Conditioning} for stereo generation. Training employs masked token prediction with selective masking of fine tokens while preserving coarse structure.
\vspace{-4mm}
\subsection{Data Augmentation and Training}
\begin{wrapfigure}{r}{0.6\textwidth}
\centering
\includegraphics[width=0.58\textwidth, height=5cm, keepaspectratio]{imgs/data_augmentation.png}
\caption{Data augmentation strategy for NCLMCTT}
\label{fig:data_augmentation}
\end{wrapfigure}
We implemented a data augmentation strategy (Figure \ref{fig:data_augmentation}) creating triplets of MIDI input $mi_x$, tiled control signal $\mathbf{C}$, and target melody $me_x$. Control signals are extracted crops ($t_c$ seconds) from target waveforms using the same synthesizer preset but potentially different melodies, then tiled to match the target length. This technique decouples timbre from specific melodic content. By pairing each MIDI sequence with different preset renderings, we expanded our dataset from 62,500 to approximately 1.25 million samples (strategically selected from a theoretical 3 million possibilities), while varying crop lengths (1-5 seconds) to teach the model to extract timbral characteristics from control signals of different duration.

% \begin{figure}[h]
% \centering
% \includegraphics[width=0.8\columnwidth, height=5cm,keepaspectratio]{imgs/data_augmentation.png}
% \caption{Data augmentation strategy for NCLMCTT}
% \label{fig:data_augmentation}
% \end{figure}

\section{Experiments}
\label{sec:experiments}

\subsection{Training Configuration}

\subsubsection{First Codebook Model Training}

We trained our transformer-based model for first codebook token prediction using distributed data parallel across 8 NVIDIA L40S GPUs, completing in 4-6 hours with a per-GPU batch size of 16-24 (global batch size 128-192) for 200-500 steps per epoch. The training used a learning rate of $1 \times 10^{-4}$ with 200 warm-up steps in cosine scheduling, implemented bfloat16 mixed-precision via PyTorch AMP for efficient memory usage, and employed Adam optimizer with CrossEntropyLoss, while monitoring all metrics through TensorBoard to ensure training stability.

\subsubsection{Upsampler Training}
\label{subsec: upsampler}
The upsampler predicts fine codebook tokens ($z^2$ through $z^9$) conditioned on the first codebook $z^1$ using masked token prediction. We selectively mask tokens in codebooks $z^2$--$z^9$ while preserving $z^1$, with masking rates ranging from 0.5\% to 99.3\%. The model optimizes cross-entropy loss per codebook level via non-autoregressive parallel prediction. Implementation uses a 20-layer transformer (dimension 1280) with AdamW optimizer (learning rate $3 \times 10^{-5}$), trained on 2048-token segments ($\sim$24s) using bfloat16 precision and an 80/20 train/test split (350 test samples).

\subsection{Identifying the First Codebook as Bottleneck}
To validate our hierarchical design, we evaluated upsampler performance when conditioned on ground truth (GT) first codebook tokens. Figure~\ref{fig:upsampling_performance} shows results across all 50 presets. Upsampling from GT tokens achieves near-perfect reconstruction (median Chroma Similarity: 0.9879, SI-SDR: $-2.7$\,dB, Mel Distance: 0.9234, STFT Distance: 1.711), demonstrating that the upsampler effectively generates fine timbral details when provided with accurate coarse structure. This validates our architectural decomposition: the first codebook captures the primary bottleneck for timbre transfer quality. Based on this analysis, we use end-to-end evaluation for baseline comparison (Section~\ref{subsec:compare_wt_tokensynth}) and first codebook evaluation for architectural analysis (Section~\ref{sec:ablation}).
\begin{figure}[h]
\floatconts
  {fig:upsampling_performance}
  {\caption{Upsampling from ground-truth first codebook tokens across 50 presets. Near-perfect reconstruction (median Chroma Similarity: 0.9879) confirms first codebook modeling as the key bottleneck in timbre transfer. (a) SI-SDR, (b) MEL Distance, (c) STFT Distance, (d) Chroma Similarity.}\vspace{-3mm}}
  {%
    \subfigure[]{\label{fig:upsampling_sisd}%
      \includegraphics[width=0.24\linewidth]{imgs/HMEU/sisd_publication.png}}%
    \subfigure[]{\label{fig:upsampling_mel}%
      \includegraphics[width=0.24\linewidth]{imgs/HMEU/mel_publication.png}}%
    \subfigure[]{\label{fig:upsampling_stft}%
      \includegraphics[width=0.24\linewidth]{imgs/HMEU/stft_publication.png}}%
    \subfigure[]{\label{fig:upsampling_chroma}%
      \includegraphics[width=0.24\linewidth]{imgs/HMEU/chroma_similarity_publication.png}}%
  }
\end{figure}
\vspace{-6mm}
\subsection{Evaluation Metrics}
\label{subsec:evaluation-metrics}
\vspace{-3mm}
To assess our method quantitatively, we employ four complementary metrics capturing different aspects of audio quality:

\textbf{SI-SDR($\downarrow$)}: Scale-Invariant Signal-to-Distortion Ratio measures time-domain fidelity while being invariant to scaling (the SI-SDR is negated, so lower value indicates better performance):
\begin{equation}
\text{SI-SDR}(x, \hat{x}) = -10 \log_{10} \frac{\|\alpha x\|^2}{\|\alpha x - \hat{x}\|^2}, \text{ where } \alpha = \frac{\hat{x}^T x}{\|x\|^2}
\label{eq:si-sdr}
\end{equation}
\vspace{-1mm}
\textbf{MEL Distance($\downarrow$)}: Evaluates perceptual differences in the mel-frequency domain using multi-scale mel spectrograms:
\begin{equation}
d_{\text{MEL}}(x, \hat{x}) = \frac{1}{TM} \sum_{t,m} |\text{MEL}_{t,m}(x) - \text{MEL}_{t,m}(\hat{x})|_1
\label{eq:mel}
\end{equation}
\vspace{-1mm}
\textbf{STFT Distance($\downarrow$)}: Measures time-frequency representation discrepancies:
\begin{equation}
d_{\text{STFT}}(x, \hat{x}) = \frac{1}{TF} \sum_{t,f} |\text{STFT}_{t,f}(x) - \text{STFT}_{t,f}(\hat{x})|_1
\label{eq:stft}
\end{equation}
\vspace{-1mm}
\textbf{Chroma Similarity($\uparrow$)}: Quantifies musical similarity through chromagram cosine similarity:
\begin{equation}
\text{ChromaCosSim}(X, \hat{X}) = \frac{1}{T} \sum_{t} \frac{x_t \cdot \hat{x}_t}{\|x_t\|_2 \cdot \|\hat{x}_t\|_2 + \epsilon}
\label{eq:chroma}
\end{equation}
We implemented SI-SDR, MEL Distance, and STFT Distance using the \texttt{audiotools} library\footnote{\url{https://github.com/descriptinc/audiotools}}, specifically from the \texttt{spectral.py} and \texttt{distance.py} modules. For SI-SDR, we used the default parameters with zero-mean normalization and mean reduction across batches. For MEL Distance, we employed a multi-scale approach using two resolutions (150 and 80 mel bands) with window lengths of 2048 and 512, combining both magnitude and log-magnitude L1 losses. Similarly, the STFT Distance was calculated using multi-scale STFT with window lengths of 2048 and 512, also combining magnitude and log-magnitude losses with equal weighting.

\subsection{Baseline Selection}
Table~\ref{tab:method_comparison} compares existing controllable music synthesis methods across key functional dimensions to identify appropriate baselines for benchmarking. \textbf{Excluded Methods}: Speech synthesis methods (VALL-E~\cite{wang2023neural}, VALL-E 2~\cite{chen2024vall}, AudioLM~\cite{borsos2023audiolm}) are optimized for linguistic structure rather than musical characteristics, operating on phoneme-aligned inputs and shorter temporal contexts suitable for utterances but insufficient for musical phrases. Text-to-music methods (MusicLM~\cite{agostinelli2023musiclm}, MusicGen~\cite{copet2023simple}, Moûsai~\cite{schneider2024mousai}) cannot perform reference-based timbre transfer as they rely solely on text descriptions, which introduce ambiguity for precise instrument specification. \textbf{Selected Baseline}: We benchmark against \textit{TokenSynth}~\cite{kim2025tokensynth}, which represents the state-of-the-art in zero-shot instrument synthesis through discrete token manipulation. TokenSynth is the only existing method that combines reference audio control with zero-shot transfer capabilities and flexible control length, making it functionally most similar to our approach and enabling direct comparison of timbre transfer quality.

\textbf{Architectural Inspiration}: While VALL-E~\cite{wang2023neural} inspired our hierarchical coarse-to-fine generation strategy, NCLMCTT introduces substantial music-specific modifications: (1) cross-attention mechanisms for explicit timbre conditioning beyond concatenation-based approaches; (2) flexible reference duration (1--5s) for varying musical phrase lengths; (3) MIDI-based melodic features and extended temporal contexts optimized for musical structure. These deviations address fundamental differences between speech synthesis (phoneme-to-audio with speaker identity) and music synthesis (MIDI-to-audio with timbral control), making TokenSynth the appropriate benchmark for timbre transfer evaluation rather than adapting speech models.
\begin{table*}[t]
\centering
\tiny
\caption{Functional comparison of controllable music synthesis methods. NCLMCTT uniquely combines reference audio conditioning, flexible control length, and zero-shot transfer without explicit timbre learning.}
\label{tab:method_comparison}
\resizebox{\textwidth}{!}{
\begin{tabular}{lcccc}
\toprule
\textbf{Method} & \textbf{Control Signal} & \textbf{No Explicit Timbre} & \textbf{Flexible Control} & \textbf{Zero-shot} \\
 & \textbf{Type} & \textbf{Learning} & \textbf{Length} & \textbf{Transfer} \\
\midrule
VALL-E~\cite{wang2023neural} & Reference Audio & \checkmark & \ding{55} & \checkmark \\
VALL-E 2~\cite{chen2024vall} & Reference Audio & \checkmark & \ding{55} & \checkmark \\
AudioLM~\cite{borsos2023audiolm} & Reference Audio & \ding{55} & \ding{55} & \ding{55} \\
MusicLM~\cite{agostinelli2023musiclm} & Text & \checkmark & \checkmark & \ding{55} \\
MusicGen~\cite{copet2023simple} & Text & \checkmark & \checkmark & \ding{55} \\
Moûsai~\cite{schneider2024mousai} & Text & \checkmark & \checkmark & \ding{55} \\
TokenSynth~\cite{kim2025tokensynth} & Text + Reference & \ding{55} & \checkmark & \checkmark \\
\midrule
\textbf{NCLMCTT (Ours)} & \textbf{Reference Audio} & \checkmark & \checkmark & \checkmark \\
\bottomrule
\end{tabular}
}
\end{table*}
\vspace{-3mm}
\subsection{Benchmarking}

\subsubsection{Comparison with TokenSynth}
\label{subsec:compare_wt_tokensynth}

Table~\ref{tab:results} presents the quantitative comparison between TokenSynth and NCLMCTT. Our method achieves substantial improvements in timbral fidelity across all three fidelity metrics: 27.1\% lower (negated) SI-SDR, 50.9\% lower Mel Distance, and 59.4\% lower STFT Distance. These improvements demonstrate that our direct audio conditioning approach enables significantly more accurate timbre replication than TokenSynth's CLAP embedding-based method.

TokenSynth achieves marginally higher Chroma Similarity (0.878 vs. 0.850, a 3.2\% difference), indicating slightly better melodic preservation. However, both methods maintain strong melodic coherence at or above 0.85, suggesting that NCLMCTT successfully balances timbre transfer with melodic integrity. The results validate our hypothesis that avoiding pretrained timbre encoders enables more flexible and accurate timbre transfer, achieving 50--59\% improvements in spectral distance metrics while maintaining competitive melodic performance.
\begin{table}[b]
\centering
\tiny
\caption{Comparison of audio quality metrics between TokenSynth and NCLMCTT. Lower values indicate better performance for SI-SDR, Mel Distance, and STFT Distance, while higher values are better for Chroma Similarity. Bold indicates best performance.}
\label{tab:results}
\begin{tabular}{lccc}
\toprule
\textbf{Metric} & \textbf{TokenSynth} & \textbf{NCLMCTT (Ours)} & \textbf{Improvement} \\
\midrule
SI-SDR $\downarrow$ & 29.22 & \textbf{21.30} & 27.1\% \\
Mel Distance $\downarrow$ & 3.69 & \textbf{1.81} & 50.9\% \\
STFT Distance $\downarrow$ & 6.41 & \textbf{2.60} & 59.4\% \\
Chroma Similarity $\uparrow$ & \textbf{0.878} & 0.850 & -3.2\% \\
\bottomrule
\end{tabular}
\end{table}
\vspace{-3mm}
\subsection{Zero-shot Generalization Performance}

NCLMCTT demonstrates robust generalization to unseen instrument presets, validating its ability to transfer learned timbre modeling capabilities to novel sounds without additional training. Figure~\ref{fig:zeroshot_performance} presents a comprehensive comparison between zero-shot performance on unseen presets (presets 41-50) and performance on four groups of seen presets from the training set (presets 1-10, 11-20, 21-30, 31-40).

The results reveal strong zero-shot generalization across all metrics. For SI-SDR and Mel Distance, unseen presets achieve performance comparable to seen presets, with distributions largely overlapping. STFT Distance shows particularly robust generalization, with zero-shot performance matching or exceeding several seen preset groups. Chroma Similarity maintains consistent performance across both seen and unseen presets, indicating that melodic preservation is not degraded when transferring to novel timbres.
\begin{figure}[t]
\centering
\begin{tabular}{cc}
\includegraphics[width=0.4\textwidth]{imgs/zeroshot/si_sdr_zeroshot_performance.png} &
\includegraphics[width=0.4\textwidth]{imgs/zeroshot/mel_zeroshot_performance.png} \\
\includegraphics[width=0.4\textwidth]{imgs/zeroshot/stft_zeroshot_performance.png} &
\includegraphics[width=0.4\textwidth]{imgs/zeroshot/chroma_similarity_zeroshot_performance.png} \\
\end{tabular}
\caption{Zero-shot performance comparison between seen training presets (groups 1-10, 11-20, 21-30, 31-40) and unseen test presets (41-50). Box plots show distribution of SI-SDR, Mel Distance, STFT Distance, and Chroma Similarity across preset groups, demonstrating robust generalization to novel instrument timbres.}
\label{fig:zeroshot_performance}
\end{figure}
\vspace{-3mm}
\subsection{Qualitative Analysis}
Figure~\ref{fig:qualitative_analysis} presents detailed spectral and waveform visualizations for four representative samples, demonstrating NCLMCTT's accurate reconstruction capabilities across diverse timbral characteristics. Spectral comparisons (panels a, c, e, g) reveal close alignment between generated and ground truth spectrograms, with predicted outputs preserving harmonic structure, formant characteristics, and temporal spectral evolution. Chromagram analysis (bottom rows of spectral panels) confirms strong melodic preservation across all samples, with Chroma Similarity scores ranging from 0.96 to 0.99. Waveform comparisons (panels b, d, f, h) show faithful reproduction of temporal envelopes, though absolute difference plots reveal notable deviations concentrated at note onsets where precise transient modeling remains challenging---a common limitation in autoregressive token-based generation. Additionally, subtle high-frequency artifacts are visible in some spectrograms (particularly Sample 1), suggesting that extremely fine timbral details occasionally suffer minor degradation. Despite these localized imperfections, the consistent overall performance across samples (SI-SDR: $-4.72$ to $-3.64$\,dB, Mel Distance: 2.35--2.49, STFT Distance: 3.05--3.26) validates NCLMCTT's robust generalization, successfully transferring timbre while maintaining melodic integrity across varying melodic patterns and instrumental characteristics without major artifacts such as spectral smearing or temporal jitter.

\begin{figure}[h]
\floatconts
  {fig:qualitative_analysis}
  {\caption{Audio visualization for all samples. Each sample shows spectral features (left) and waveform (right). Sample 1 (SI-SDR: $-4.72$, MEL: 2.35, STFT: 3.05, Chroma: 0.98), Sample 2 (SI-SDR: $-3.69$, MEL: 2.48, STFT: 3.20, Chroma: 0.96), Sample 3 (SI-SDR: $-3.68$, MEL: 2.48, STFT: 3.19, Chroma: 0.98), Sample 4 (SI-SDR: $-3.64$, MEL: 2.49, STFT: 3.26, Chroma: 0.99).}}
  {%
    % Row 1: Sample 1 Spectral, Sample 1 Waveform, Sample 2 Spectral, Sample 2 Waveform
    \subfigure[]{\label{fig:audio_1_spectral}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/spectral_features_comparison_0_sisd-4.7187371253967285_mel2.3471322059631348_stft3.050343990325928_chroma0.9828484654426576.png}}%
    \subfigure[]{\label{fig:audio_1_waveform}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/waveform_comparison_0_sisd-4.7187371253967285_mel2.3471322059631348_stft3.050343990325928_chroma0.9828484654426576.png}}%
    \subfigure[]{\label{fig:audio_2_spectral}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/spectral_features_comparison_1_sisd-3.6866588592529297_mel2.4843292236328125_stft3.198698043823242_chroma0.9596139788627625.png}}%
    \subfigure[]{\label{fig:audio_2_waveform}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/waveform_comparison_1_sisd-3.6866588592529297_mel2.4843292236328125_stft3.198698043823242_chroma0.9596139788627625.png}}%
    \\
    % Row 2: Sample 3 Spectral, Sample 3 Waveform, Sample 4 Spectral, Sample 4 Waveform
    \subfigure[]{\label{fig:audio_3_spectral}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/spectral_features_comparison_2_sisd-3.680769443511963_mel2.475346803665161_stft3.1878504753112797_chroma0.9761099815368652.png}}%
    \subfigure[]{\label{fig:audio_3_waveform}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/waveform_comparison_2_sisd-3.680769443511963_mel2.475346803665161_stft3.1878504753112797_chroma0.9761099815368652.png}}%
    \subfigure[]{\label{fig:audio_4_spectral}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/spectral_features_comparison_3_sisd-3.644216060638428_mel2.485275983810425_stft3.2612528800964355_chroma0.9871653318405152.png}}%
    \subfigure[]{\label{fig:audio_4_waveform}%
      \includegraphics[width=0.2\linewidth]{appendix_imgs/vis_prediction/waveform_comparison_3_sisd-3.644216060638428_mel2.485275983810425_stft3.2612528800964355_chroma0.9871653318405152.png}}%
  }
\end{figure}
\vspace{-12mm}
\section{Ablation Studies}
\label{sec:ablation}
To validate our design choices, we conduct ablation studies examining control signal duration, cross-attention mechanisms, and model capacity. Table~\ref{tab:ablation} presents first codebook reconstruction performance across different NCLMCTT configurations. \textbf{Control Signal Duration.} We evaluate reference audio lengths from 1s to 5s with a fixed architecture (0.3B parameters, no cross-attention). SI-SDR improves monotonically with longer references: nclmctt\_5s achieves 40.8\% lower SI-SDR than nclmctt\_1s (5.27 $\rightarrow$ 3.12 dB), with corresponding improvements in Mel Distance (3.81 $\rightarrow$ 3.52) and Chroma Similarity (0.86 $\rightarrow$ 0.88), although the spectral distances fluctuate slightly at intermediate durations. This validates that extended reference signals provide richer timbral information for more accurate transfer. \textbf{Cross-Attention Mechanism.} Adding explicit cross-attention layers (nclmctt\_cross\_attn) further reduces SI-SDR from 3.12 to 2.75 dB (11.9\% improvement over nclmctt\_5s), with gains in Mel Distance (3.52 $\rightarrow$ 3.46). This demonstrates that explicit attention to reference audio enhances timbre conditioning beyond concatenation-based approaches. \textbf{Model Capacity.} Scaling from 0.3B to 1.2B parameters (nclmctt\_cross\_attn\_large) maintains SI-SDR (2.75 dB) while achieving best overall performance: Mel Distance improves to 3.28 (5.2\% better), STFT Distance to 4.41 (3.1\% better), and Chroma Similarity to 0.89 (1.1\% better). Increased capacity enables better spectral modeling and melodic preservation without overfitting.

\begin{table}[t]
\centering
\tiny
\caption{Ablation study of NCLMCTT design choices on first codebook reconstruction. Configurations vary by control signal duration (1s--5s), cross-attention mechanism, and model size (0.3B vs.\ 1.2B parameters). Best values are highlighted in bold; where mean values tie, the configuration with the lower standard deviation is highlighted.}
\label{tab:ablation}
\begin{tabular}{lcccc}
\toprule
\textbf{Model} & \textbf{SI-SDR} & \textbf{Mel} & \textbf{STFT} & \textbf{Chroma} \\
& \textbf{(dB) $\downarrow$} & \textbf{Dist. $\downarrow$} & \textbf{Dist. $\downarrow$} & \textbf{Sim. $\uparrow$} \\
\midrule
\multicolumn{5}{l}{\textit{Control Signal Duration (0.3B params, no cross-attn)}} \\
nclmctt\_1s & 5.27 $\pm$ 4.18 & 3.81 $\pm$ 1.30 & 4.70 $\pm$ 1.74 & 0.86 $\pm$ 0.08 \\
nclmctt\_2s & 4.50 $\pm$ 2.92 & 3.77 $\pm$ 1.13 & 4.72 $\pm$ 1.65 & 0.86 $\pm$ 0.08 \\
nclmctt\_3s & 4.37 $\pm$ 2.66 & 3.53 $\pm$ 0.95 & 4.51 $\pm$ 1.39 & 0.87 $\pm$ 0.06 \\
nclmctt\_4s & 4.33 $\pm$ 3.57 & 3.70 $\pm$ 1.19 & 4.65 $\pm$ 1.63 & 0.87 $\pm$ 0.07 \\
nclmctt\_5s & 3.12 $\pm$ 2.51 & 3.52 $\pm$ 1.03 & 4.54 $\pm$ 1.55 & 0.88 $\pm$ 0.06 \\
\midrule
\multicolumn{5}{l}{\textit{Architecture Enhancements (5s reference)}} \\
nclmctt\_cross\_attn & 2.75 $\pm$ 2.63 & 3.46 $\pm$ 1.02 & 4.55 $\pm$ 1.56 & 0.88 $\pm$ 0.06 \\
nclmctt\_cross\_attn\_large & \textbf{2.75 $\pm$ 2.13} & \textbf{3.28 $\pm$ 0.87} & \textbf{4.41 $\pm$ 1.32} & \textbf{0.89 $\pm$ 0.06} \\
\bottomrule
\end{tabular}
\end{table}
\vspace{-5mm}
\section{Conclusion}
\label{sec:conclusion}
We introduced NCLMCTT, a neural codec language model that advances controllable timbre transfer through implicit audio conditioning without pretrained timbre encoders, together with the first comprehensive benchmark dataset for systematic evaluation. Compared to TokenSynth, NCLMCTT achieves substantial improvements in timbral fidelity (27.1\% reduction in SI-SDR, 50.9\% in Mel Distance, 59.4\% in STFT Distance) while maintaining strong melodic coherence (Chroma Similarity: 0.85). Zero-shot evaluation on unseen presets confirms robust generalization without performance degradation. Ablation studies reveal that control signal duration provides the largest impact (40.8\% improvement), followed by cross-attention mechanisms (11.9\%) and model scaling (up to 5.2\%). The analysis demonstrates that first codebook modeling represents the primary bottleneck, with upsampling from ground truth tokens achieving near-perfect reconstruction. By establishing rigorous evaluation protocols and providing open-source access to our dataset and implementation, NCLMCTT serves as both a practical tool for music creators and a methodological foundation for advancing controllable neural audio synthesis.
\vspace{-6mm}
\section{Limitations and Future Work} 
While NCLMCTT demonstrates strong performance, several directions warrant further investigation: incorporating explicit pitch-aware mechanisms to close the 3.2\% melodic preservation gap with TokenSynth and scaling to longer musical phrases with real-time generation capabilities. By establishing a rigorous evaluation framework and providing open-source access to our dataset, protocols, and implementation, we aim to accelerate progress in controllable neural music synthesis. Our work demonstrates that neural codec language models can effectively bridge symbolic music representation and audio synthesis, enabling precise timbre control while maintaining melodic integrity, positioning NCLMCTT as both a practical tool for music creators and a methodological foundation for advancing controllable audio generation research.

\bibliography{nsynth}

\end{document}