\documentclass[pmlr]{jmlr}% new name PMLR (Proceedings of Machine Learning Research)
% Template adapted for the 1st Workshop on Emerging AI Technologies for Music, as part of AAAI
% https://amaai-lab.github.io/EAIM2026/

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18} % Use a recent compatibility version
\usepgfplotslibrary{groupplots} % For creating grids of plots
\usepackage{tikz}
\usepackage{listings}
\usepackage{color}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}

\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{mauve}{rgb}{0.58,0,0.82}

\lstset{frame=tb,
  language=Python,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3
}

 % Define an unnumbered theorem just for this sample document:
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{note}{Note}

 % change the arguments, as appropriate, in the following:
\jmlrvolume{303}
\jmlryear{2026}
\jmlrworkshop{EAIM2026 at AAAI}

\title[Evaluating mLLMs on Music]{LLMs can read music, but struggle to hear it. \\An evaluation of core music perception tasks}

 % Use \Name{Author Name} to specify the name.

 % Spaces are used to separate forenames from the surname so that
 % the surnames can be picked up for the page header and copyright footer.
 
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % *** Make sure there's no spurious space before \nametag ***

 % Two authors with the same address
  % \author{\Name{Brandon James Carone} \Email{bcarone@nyu.edu} \and
  %  \Name{Pablo Ripollés} \Email{pripolles@nyu.edu}\\
  %  \addr New York University}

 % Three or more authors with the same address:
 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %  \Name{Author Name2} \Email{an2@sample.com}\\
 %  \Name{Author Name3} \Email{an3@sample.com}\\
 %  \Name{Author Name4} \Email{an4@sample.com}\\
 %  \Name{Author Name5} \Email{an5@sample.com}\\
 %  \Name{Author Name6} \Email{an6@sample.com}\\
 %  \Name{Author Name7} \Email{an7@sample.com}\\
 %  \Name{Author Name8} \Email{an8@sample.com}\\
 %  \Name{Author Name9} \Email{an9@sample.com}\\
 %  \Name{Author Name10} \Email{an10@sample.com}\\
 %  \Name{Author Name11} \Email{an11@sample.com}\\
 %  \Name{Author Name12} \Email{an12@sample.com}\\
 %  \Name{Author Name13} \Email{an13@sample.com}\\
 %  \Name{Author Name14} \Email{an14@sample.com}\\
 %  \addr Address}


%  % Authors with different addresses:
 \author{\Name{Brandon James Carone} \Email{bcarone@nyu.edu}\\
 \addr Department of Psychology, Music and Audio Research Laboratory (MARL),  Center for Language, Music, and Emotion (CLaME), New York University
 \AND
 \Name{Iran R. Roman} \Email{i.roman@qmul.ac.uk}\\
 \addr School of Electronic Engineering and Computer Science, Queen Mary University of London
 \AND
 \Name{Pablo Ripollés} \Email{pripolles@nyu.edu}\\
 \addr Department of Psychology, Music and Audio Research Laboratory (MARL),  Center for Language, Music, and Emotion (CLaME), New York University
 }

% \editors{D. Herremans, K. Bhandari, A. Roy, S. Colton, M. Barthet}

  % \author{\Name{Author Name1} \Email{abc@sample.com}\and
  %  \Name{Author Name2} \Email{xyz@sample.com}}

\begin{document}

\maketitle
\vspace{-1.0cm}
\begin{abstract}
Multimodal Large Language Models (MLLMs) claim ``musical understanding,'' yet most evaluations conflate listening with score reading. We benchmark three SOTA LLMs (Gemini 2.5 Pro, Gemini 2.5 Flash, and Qwen2.5-Omni) across three core music skills: Syncopation Scoring (rhythm perception), Transposition Detection (melody perception), and Chord Quality Identification (harmony perception). Moreover, we separate three sources of variability: (i) perceptual limitations (by contrasting audio recordings vs.\ symbolic MIDI inputs), (ii) exposure to prior examples (zero- vs.\ few-shot manipulations), and (iii) reasoning strategies (Standalone, Chain-of-Thought, LogicLM). For the last of these, we adapt LogicLM, a framework combining LLMs with symbolic solvers to perform structured reasoning. In LogicLM, LLMs act as perceptual formulators, generating strict, machine-checkable schemas (onset grids, interval sequences) that deterministic solvers execute with self-refinement. Our results reveal a clear perceptual gap: models perform near ceiling on MIDI but show substantial accuracy drops on audio. Reasoning and few-shot prompting offer minimal gains. This is expected for MIDI, where performance reaches saturation, but more surprising for audio, where LogicLM, despite near-perfect MIDI accuracy, remains notably brittle. Among models, Gemini Pro achieves the highest performance across most conditions. Transposition yields the highest accuracies across models, while Chord Identification scores slightly below Syncopation. Overall, current systems reason well over symbols (MIDI) but do not yet ``listen'' reliably from audio, with reasoning strategies having little impact on accuracy. Our method and dataset make the perception--reasoning boundary explicit and offer actionable guidance for building robust audio-based music systems.
\end{abstract}
\begin{keywords}
Audio Large Language Models, Multimodal Large Language Models, Music Understanding, Benchmarking and Evaluation, Schema-Guided Reasoning, LogicLM
\end{keywords}


\section{Introduction}
\label{sec:intro}
Recent advances in foundation models have extended their reach beyond text to multimodal architectures that process audio, vision, and language in a unified framework. Models such as Alibaba's Qwen2.5-Omni, trained on a range of audio tasks \citep{RN2504}, and Google's Gemini 2.5 family, which incorporates advanced multimodal integration for real-time interactions \citep{RN2505}, exemplify this new generation. One problem with many multimodal LLMs is that they boast ``generic hearing abilities'' and ``music understanding'', yet struggle with tasks as simple as recognizing a well-known tune when transposed to a different key (i.e., singing the same song at a higher or lower pitch) or played on another instrument. For example, after being fed a simple piano rendition of ``Happy Birthday'' and told that the melody represents that tune regardless of which key it is played in, they fail to recognize it when played in a different key or on another instrument (in our own pilot tests, Gemini 2.5 guessed that the transposed version of ``Happy Birthday'' played at the same tempo was ``Twinkle Twinkle Little Star'' and Qwen2.5-Omni responded with ``Lose Yourself'' by Eminem). On the other hand, most people with Western enculturation recognize ``Happy Birthday'' across keys, instruments, and language \citep{RN2507, RN2506}. Often these models are evaluated on benchmarks \citep[e.g., AIR-Bench;][]{RN2508} that primarily focus on tasks such as speech recognition, audio classification, or music tagging/captioning using publicly available datasets \citep[e.g., MusicCaps, AudioSet, NSynth, MagnaTagATune;][]{RN2509, RN2510, RN2511, RN2512, RN2513, RN2514}. Among the state-of-the-art audio benchmarks are MMAR \citep{RN2515}, MMAU \citep{RN2503}, MMAU-Pro \citep{RN2516}, CMI-Bench \citep{ma2025cmi}, RUListening \citep{zang2025you}, and FUTGA-MIR \citep{10888485}, which broaden coverage across speech, sound, and music, emphasizing multi-step reasoning. 
These benchmarks add realism via in-the-wild audio, long-form and multi-audio settings, spatial understanding, and expert-crafted Question-Answer pairs. In the context of music, MMAU-Pro even asks open-ended questions regarding musical themes, a song's mix, and what might differentiate one song from another. However, no existing benchmarks strategically ask questions about music that test whether the model can ``listen to'' the specific relations that constitute musical structure, or do so robustly enough to form a symbolic representation for a piece of music, like how a musician might transcribe a song.

While these datasets may be effective in assessing isolated aspects of music understanding (e.g., genre and instrument identification), it is still unknown whether they fully capture the nuanced, hierarchical nature of music perception in human listeners. For example, Audio LLMs may learn to associate the spectral characteristics of a trumpet's timbre with the label ``trumpet," or tie fast tempos, loud drums, and distorted guitars with the label ``rock music," simply by maximizing the likelihood of these co-occurrences in the training data. However, this focus on surface statistics is less suited for tasks demanding abstract relational understanding, such as recognizing a melody when its absolute pitches are changed (key invariance) or identifying the harmonic function of a chord within a progression. These abilities require understanding relationships between elements (e.g., relative pitch intervals, harmonic contexts) rather than just recognizing the elements themselves.

In the current study, we focus on three fundamental components of musical hierarchy: rhythm, melody, and harmony. Syncopation captures the perception of rhythmic ``surprise'' or emphasis in unexpected places \citep{RN2517, large2023dynamic}. Evaluating a model’s capacity to detect syncopation provides a measure of its sensitivity to temporal predictability and metric displacement in rhythm perception. Testing melody recognition across transpositions \citep{RN2518, RN2519, RN2520} assesses the model's ability to recognize melodies despite shifting the key, or rather, starting the same melody on a different pitch or note that is higher or lower and maintaining the rhythmic and intervallic changes. By testing this, we can evaluate whether the model exhibits similar perceptual invariance to humans. Identifying chord quality (i.e., major, minor, dominant, diminished) requires a model to recognize harmonic structures based on the relative intervals above the root note of the chord, rather than absolute pitch alone. By testing this, we can evaluate whether the model demonstrates an ability to track intervals above a given note, and characterize the joint quality of the pitches forming those intervals as a whole.

Evaluating these abilities in multimodal LLMs presents a methodological challenge: how can we pinpoint where the bottleneck in musical abilities lies? To answer this question, we have developed an experimental design that jointly assesses perceptual, learning, and reasoning factors. First, we aim to test the perceptual limitations of multimodal LLMs by comparing their performance on audio recordings, which require genuine listening, versus symbolic MIDI inputs, which rely on structured representations. Second, we aim to assess how in-context learning (few-shot vs. zero-shot) influences accuracy, revealing whether brief exposure to examples improves music perception, or if performance remains limited by underlying perceptual constraints.
Third, when a model produces a correct answer, it is often unclear whether this reflects genuine perceptual analysis (i.e., reasoning) or reliance on superficial cues. This echoes the problem of ``unfaithful reasoning'' in logical domains, where models may generate plausible chains of thought that do not underlie their actual decision process \citep{RN2522}. Even Chain-of-Thought prompting (a technique where an LLM generates intermediate reasoning steps to improve the accuracy of its answer) does not necessarily guarantee alignment with the perceptual computations themselves. To address this, we adapt LogicLM \citep{RN2522}, a neuro-symbolic prompting framework in which an LLM emits a strict, machine-checkable schema that a deterministic solver executes, aided by a self-refinement loop, so that the final answer is grounded in verifiable computation rather than free-form text or guessing. In our adaptation, the model serves as a Perceptual Formulator, tasked with converting continuous audio into a structured symbolic schema (e.g., note sequences, rhythmic onsets, pitch-class sets). These are then evaluated by deterministic solvers, ensuring that the final decision is grounded in the schema rather than in opaque model heuristics. By systematically comparing model performance on raw audio versus symbolic MIDI inputs, and by contrasting LogicLM prompting with standalone and Chain-of-Thought strategies, we provide a detailed assessment of where current models succeed, where they fail, and what this reveals about the limits of machine music perception.

%We thus address four questions: (i) how current state-of-the-art audio LLMs (Qwen2.5-Omni 7B, Gemini 2.5 Flash, and Gemini 2.5 Pro) differ in overall musical competence on controlled tasks with unfamiliar stimuli; (ii) whether there is a perceptual gap (audio vs. MIDI); (iii) how in-context learning (few-shot vs. zero-shot) alters accuracy; and (iv) how different prompting strategies (standalone question-answering vs. Chain-of-Thought reasoning vs. LogicLM) help or hinder in different musical tasks. To do so, we introduce a controlled music benchmark spanning syncopation, transposition, and chord quality, with paired audio/MIDI, per-trial isolation, and deterministic solvers. % with schema checks and self-refinement.

\vspace{-0.4cm}
\begin{figure}[!ht]
\floatconts
  {fig:diagram}
  {\vspace{-1.0cm}\caption{Diagram of the experimental design carried out with each model and task.}\vspace{-1cm}}
  {
    \centering
    \includegraphics[width=0.80\linewidth]{diagram.png}
  }
\end{figure}

\section{Methods}
\subsection{Stimuli Creation}

Stimuli were recorded by a real human musician, and are originally from \href{https://github.com/brandoncarone/MUSE_music_benchmark}{The MUSE Benchmark} \citep{carone2025musebenchmarkprobingmusic,carone2025evaluating}. See Supplementary Materials S1 for details.
%Music was recorded in Logic Pro X using a 2021 16” MacBook Pro (Apple M1 Pro chip), an Apollo Twin X audio interface, and Yamaha HS8 monitors. Stimuli were recorded on electric guitar (PRS McCarty Hollowbody II, Schecter Solo-6), piano (Arturia KeyLab Essential Mk3 MIDI controller with Analog Lab V software instruments), and drums (Roland TD-17 electronic kit with Superior Drummer 3 plugin). Guitar recordings were processed with Neural DSP plugins (Tim Henson Archetype, Cory Wong Archetype). 



\subsection{Tasks}
\paragraph{Syncopation Scoring.}
In this task, the models were presented with 20 short rhythmic excerpts (8 secs) performed at 120 BPM on a drum set, consisting of kick, snare, and hi-hat. The hi-hat maintained a constant stream of eighth notes, while the kick and snare patterns varied in their placement across on-beats (those falling on the quarter notes of each bar) and off-beats (those falling on the 8th notes in between each quarter note). The models' task was to rate the degree of syncopation by counting the number of kick and snare events that occurred on off-beats, and then mapping this total to a categorical Syncopation Score (i.e., 0, 2, 4, 6, or 8). Stimuli were systematically constructed to span a wide range of syncopation levels, in accordance with the methods of \citet{RN2517, large2023dynamic}. 
%Two additional excerpts were recorded and presented as examples for the few-shot runs, but excluded from testing.

\paragraph{Transposition Detection.}
In this task, the models were presented with 20 pairs of musical excerpts (mean duration $\approx$ 9 secs) where the first excerpt presented is the anchor, and the second (i.e., target) is either the same melody transposed to a different key, or a different melody. After “listening”, the models must decide whether the two audio clips represent the same melody or not. 10 of the trials were matches, and the other 10 were not matches. Stimuli were short excerpts played on an electric guitar or a piano, and were varied across tempo, key, meter, and melody length. 
%Two additional excerpts were recorded and presented as examples for the few-shot runs, but excluded from testing.

\paragraph{Chord Quality Identification.}
In this task, the models were presented with 44 short musical excerpts (9 secs) recorded at 120 BPM and consisting of a single chord played first as a block and then as an arpeggiation, in which the notes of the chord were played one at a time from lowest to highest. Each chord was in root position, where the lowest note is the root of the chord, to remove ambiguity about inversion, and all stimuli were generated on piano to ensure consistent timbre across all trials. The models' task was to classify the chord into one of four quality categories: Major (Root + major 3rd + perfect 5th), Minor (Root + minor 3rd + perfect 5th), Dominant (Root + major 3rd + perfect 5th + minor 7th), or Diminished (Root + minor 3rd + diminished 5th). 
%Four additional excerpts (one per quality category) were recorded and presented as examples for the few-shot runs, but excluded from testing.

\subsection{Implementation}
We developed a set of custom inference scripts that closely followed the framework proposed in the original LogicLM study \citep{RN2522}, which compared three prompting strategies: Standalone, Chain-of-Thought (CoT), and LogicLM. In adapting this design to the domain of music perception, we ensured that: (i) each trial was independent of the others, (ii) prompts were standardized across prompting strategies, and (iii) results could be evaluated in a reproducible way.
Each trial began with a fresh chat session, meaning that no conversational history was ever preserved across trials or across tasks. All prompting strategies included the same set of task-specific system instructions, which defined the rules of the task and the required output format. 
In constructing our scripts, we wanted to separate three key sources of variability: (i) perceptual limitations (by contrasting audio vs.\ symbolic inputs), (ii) learning by exposure to prior examples (zero- vs.\ few-shot manipulations), and (iii) reasoning strategies (standalone vs.\ CoT vs.\ LogicLM). 
The differences between the different runs are outlined below:

\paragraph{Per-task modularity.}
Each task was implemented in a separate script. This ensured that stimuli, examples, and outputs were isolated per task and that no prompt information leaked across prompting strategies. Each condition was tested in the same structure, allowing direct comparisons across tasks.

\paragraph{Audio vs. MIDI data.}
We ran a symbolic‑input control by replacing audio with MIDI notation for the same items. Prompts simply swap “you will hear…” for “you will be given MIDI data…”, and the model is asked to generate the same schema as in the audio runs. All stimuli were rerecorded on a MIDI keyboard and then translated to .txt files using a custom script and the python package \texttt{mido}. This isolates the effect of perceptual transcription from symbolic reasoning.

\paragraph{Zero-shot vs. Few-shot.}
In zero-shot prompting strategies, models received only the system instructions and the trial stimuli. In few-shot prompting strategies, we included a small number of worked examples in the trial history (two for syncopation scoring and transposition detection; four, one per quality category, for chord quality detection), each paired with the correct solution. These examples were presented only for that trial and were excluded from the evaluation set. This separation allowed us to test whether models could solve tasks based on their intrinsic knowledge or whether they benefited from in-context learning from demonstrations.

\paragraph{Standalone, CoT, and LogicLM.}
In the standalone condition, models were asked to provide only the final categorical response (e.g., ``Yes, these are the same melody.'', ``C. Dominant''). In the CoT condition, they were encouraged to produce short intermediate reasoning before giving a final answer on a separate line. In the LogicLM condition, the model was required to output a structured symbolic transcription (e.g., a list of note intervals or a grid of rhythmic onsets), which was then parsed by a deterministic solver (solver.py; see Supplementary Materials S3). When schema violations occurred (for example, malformed syntax or an out-of-range onset), we implemented a self-refinement loop in which the model was asked to correct its own output under strict constraints. This mirrors the iterative repair process described by \citet{RN2522}. System instructions for each task and condition can be found in Supplementary Materials S2.

%\paragraph{Temperature and replication.}
%In line with the LogicLM study, we first ran all prompting strategies at a deterministic setting (temperature = 0, top-p = 1.0, top-k = 1). However, because music is a more variable and creative domain than those tested by other logical reasoning benchmarks such as the LSAT or FOLIO \citep{RN2522}, we also examined performance at a more stochastic setting (temperature = 1, top-p = 0.95, top-k = 40). To account for the added variability, each stochastic condition was repeated three times with different seeds, and mean accuracy across seeds was taken as the outcome measure. This averaging provided a more reliable estimate of model performance, analogous to assessing test–retest reliability in human studies \citep{RN2523}.


All responses from the LLMs were parsed with regular expressions to extract the final line (e.g., “Final Answer: B” or “Yes, these are the same melody.”). For LogicLM, the symbolic output was passed to the solver, and solver decisions were used to score the trial. All trials across runs were randomized and logged to a dedicated file that included the model configuration, trial IDs, raw outputs, parsed responses, and evaluation results.

\subsection{Models and inference environment}
We tested Gemini 2.5 Pro, Gemini 2.5 Flash, and Qwen2.5‑Omni. Gemini runs used the google.genai SDK, whereas Qwen runs mirrored the same pipeline on the NYU HPC (SLURM), with provider‑specific chat/message shims but the same prompts, decoding settings, seeding, and evaluation.
All Qwen2.5-Omni experiments were run on the NYU Greene HPC cluster using SLURM, with 2 NVIDIA H100 GPUs, 64 GB of system memory, and 8 CPU cores.

\subsection{Statistical Analyses}

\paragraph{Decision Accuracy.}
Analyses were conducted in \texttt{Python} (\texttt{pandas} for data handling, \texttt{numpy} for numerical operations, and \texttt{matplotlib} for visualization). The unit of analysis was a run (one log file). For each run, we computed an accuracy score:
\[
\text{Accuracy \%} = 100 \times \frac{\text{Correct}}{\text{EffectiveTotal}},
\]
where \textit{Correct} denotes the number of correctly scored trials and \textit{EffectiveTotal} is the total number of trials after subtracting token-limited events (i.e., truncated or empty model outputs; $\sim$0.01\% of all trials). Each run therefore contributed a single accuracy observation, annotated with metadata fields: \textit{Task, Model, Modality, Condition}, and \textit{Shot}.

\paragraph{Exploratory statistical analyses.}
After inspecting the decision accuracy results (see Table~\ref{tab:benchmark}), we found that reasoning style (Standalone, CoT, or LogicLM) and shot setting (few-shot vs. zero-shot) had minimal effects, whereas accuracy differences were primarily driven by modality (audio vs. MIDI). Consequently, we conducted post hoc statistical analyses to examine LLM performance as a function of modality.
We used generalized linear mixed modeling (GLMM) in R (version 4.4.2) and RStudio (2024.09.1) with the \texttt{lme4} package. Each response produced by the models for every stimulus served as an observation in the GLMM dataset. After removing the token-limited trials, separate GLMMs were fit for each task (Syncopation, Chord Identification, and Transposition), predicting whether each response was correct (1) or incorrect (0) at the trial level. Each model included fixed factors for Model (Gemini Flash, Gemini Pro, Qwen 2.5 Omni) and Modality (Audio, MIDI), as well as their interaction, with a random intercept for Stimulus: [Correct $\sim$ Model * Modality + (1 $|$ Stimulus)]. The effects of the different predictors and interactions were evaluated using Type III Wald chi-square tests via the \texttt{car} package, and significant interactions were further explored using \texttt{emmeans}.

\paragraph{Quality of LogicLM Inputs x Accuracy of LLM Responses.}
Because the models seemed to fail at perception from the waveforms and not from the symbolic reasoning itself, we visualize, across tasks and modalities, how the quality of the symbolic inputs produced under LogicLM (x-axis) relates to the final, task-level decision accuracy of the LLM (y-axis). If perception is the bottleneck, Audio points should sit low on the x-axis (poor input quality), and low on the y-axis (poor decision quality). With clean symbolic inputs (MIDI), however, both axes should approach ceiling. To quantify the \textbf{LogicLM input quality} for each trial, we compute the F1 (a measure that combines precision and recall into a single metric) of the LLM predicted MIDI content (i.e., the symbolic representation generated by the LLM) vs.\ ground truth (annotations of the stimuli provided by a human expert).
Let $TP$, $FP$, and $FN$ denote the numbers of true positives, false positives, and false negatives, respectively, for the relevant set comparison. We compute
\begin{equation}
\mathrm{Precision}=\frac{TP}{TP+FP}, \qquad
\mathrm{Recall}=\frac{TP}{TP+FN}, \qquad
\mathrm{F1}=\frac{2\,\mathrm{Precision}\cdot \mathrm{Recall}}{\mathrm{Precision}+\mathrm{Recall}}.
\end{equation}

The x-axis is this per-trial F1 on (i) onset sets for Syncopation (masked to on- or off-beats), (ii) absolute pitch sets for Chord Quality ID, and (iii) pitch-class content for Transposition.
The y-axis is the \textbf{decision accuracy} for the same trials (correct/incorrect).

Trials are aggregated \emph{across models and shot settings} to produce model-agnostic summaries at the level of \textbf{Task label $\times$ Modality}:
\vspace{-0.15cm}
\begin{itemize}
\item Syncopation: \textit{On-beat} and \textit{Off-beat} $\times$ \{Audio, MIDI\}
\vspace{-0.15cm}
\item Transposition: \textit{Yes / No} $\times$ \{Audio, MIDI\}
\vspace{-0.15cm}
\item Chord Quality ID: \textit{Major / Minor / Dominant / Diminished} $\times$ \{Audio, MIDI\}
\end{itemize}
\vspace{-0.15cm}

\noindent Within each Task label~$\times$~Modality grouping, we perform micro-averaging by summing $TP$, $FP$, and $FN$ over all contributing trials (pooled over models and shots) and then computing precision, recall (i.e., sensitivity), and F1 from those totals, thereby weighting each trial equally and yielding an overall estimate that is robust to differing numbers of runs/models and label imbalance. Note that F1 ranges from 0 to 1, where 1 indicates perfect precision and perfect recall. Accuracy (whether the question was answered correctly) is computed as the number of correct responses divided by the total number of trials in that grouping.



For the plot, Syncopation items are partitioned into on-beat and off-beat subsets. %, which we then further partition by syncopation level (\textit{No Syncopation}~=~0, \textit{Syncopation Level 1}~=~1, \textit{Syncopation Level 2}~=~2, \textit{Syncopation Level 3}~=~3, \textit{Syncopation Level 4}~=~4). 
For each subset and modality, we compute F1 separately, while the single binary decision-accuracy score (correct/incorrect) for that trial is applied to both its on-beat and off-beat data points. This allows us to compare the models' beat-tracking abilities (on-beats) with their ability to actually detect moments of syncopation (off-beats). For Transposition, the script extracts the predicted pitches from both the anchor and target stimuli, and computes the F1 against ground truth. For Chord Quality ID, the script takes the models' chord prediction and compares the predicted absolute note set to ground truth to obtain F1. 




% --- Fixed Table for Temperature 0 ---
\begin{table}[t]
\caption{Accuracy of multimodal LLMs on three music perception tasks: syncopation scoring, transposition detection, and chord quality identification. Results are reported for audio and MIDI inputs under three prompting strategies (Standalone, CoT, LogicLM) and zero-shot (ZS) vs few-shot (FS) learning conditions. %two shot conditions (ZS=zero-shot, FS=few-shot). 
%Within each modality, zero-shot results are grouped above few-shot. 
\textbf{Bold} highlights best performance per task/shot/modality (\underline{underlined} shows second best). A systematic gap between modalities is seen: MIDI inputs generally lead to higher accuracies and clearer prompting effects compared to audio. The bottom row represents chance performance.}
\label{tab:results-temp0}
\centering
\footnotesize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{
l l l
S[table-format=3.2] S[table-format=3.2] S[table-format=2.2] |
S[table-format=3.2] S[table-format=3.2] S[table-format=3.2] |
S[table-format=3.2] S[table-format=3.2] S[table-format=3.2]
}
\toprule
& & & \multicolumn{3}{c}{Syncopation} & \multicolumn{3}{c}{Transposition} & \multicolumn{3}{c}{Chord ID} \\
\cmidrule(lr){4-6} \cmidrule(lr){7-9} \cmidrule(lr){10-12}
Mod. & Shot & Cond. & Flash & Pro & {Qwen} & Flash & Pro & {Qwen} & Flash & Pro & {Qwen} \\
\midrule
\multirow{6}{*}{\parbox{1.2cm}{\centering Audio}}
&& Stand. & \underline{30.00} & {25.00} & {20.00} & {55.56} & \underline{94.74} & {75.00} & {31.82} & \textbf{47.73} & {31.82} \\
& ZS & CoT & \textbf{35.00} & {25.00} & {20.00} & {76.92} & \textbf{95.00} & {65.00} & {31.82} & \underline{43.18} & {31.82} \\
&& LogicLM & {20.00} & {20.00} & {20.00} & {65.00} & {80.00} & {50.00} & {11.36} & {18.18} & {6.82} \\
\cmidrule{2-12}
&&  Stand. & {31.58} & \underline{63.16} & {40.00} & \textbf{94.74} & \underline{90.00} & \underline{90.00} & {25.00} & \underline{40.91} & {31.82} \\
& FS & CoT & {40.00} & \textbf{65.00} & {40.00} & {63.16} & \underline{90.00} & {60.00} & {25.00} & \textbf{52.27} & {34.09} \\
&& LogicLM & {40.00} & {55.00} & {20.00} & {60.00} & \underline{90.00} & {35.00} & {6.82} & {13.64} & {18.18} \\
\midrule
\multirow{6}{*}{\parbox{1.2cm}{\centering MIDI}}
&& Stand. & {84.21} & \underline{95.00} & {25.00} & \textbf{100.00} & \textbf{100.00} & {85.00} & {50.00} & \underline{97.73} & {22.73} \\
& ZS & CoT  & {94.74} & \textbf{100.00} & {35.00} & \underline{95.00} & \textbf{100.00} & {20.00} & \textbf{100.00} & \textbf{100.00} & {25.00} \\
&& LogicLM & {90.00} & {80.00} & {20.00} & \textbf{100.00} & \textbf{100.00} & {10.00} & {93.18} & \textbf{100.00} & \textbf{100.00}  \\
\cmidrule{2-12}
&& Stand. & {88.89} & \textbf{100.00} & {35.00} & \textbf{100.00} & \textbf{100.00} & \underline{90.00} & {70.45} & \textbf{100.00} & {29.55} \\
& FS & CoT & \underline{95.00} & \textbf{100.00} & {25.00} & \textbf{100.00} & \textbf{100.00} & {60.00} & \underline{97.73} & \textbf{100.00} & {29.55} \\
&& LogicLM & \textbf{100.00} & \underline{95.00} & {25.00} & \textbf{100.00} & \textbf{100.00} & {15.00} & \textbf{100.00} & \textbf{100.00} & \textbf{100.00} \\
\midrule
& Chance &&& {20.00} &&& {50.00} &&& {25.00} \\
\bottomrule
\vspace{-15pt}
\label{tab:benchmark}
\end{tabular}
\end{table}

\vspace{-0.4cm}
\section{Results}

\paragraph{Overall performance}
Table~\ref{tab:benchmark} summarizes accuracy across tasks, models, modalities, learning context, and prompting strategies. Performance depended strongly on modality and model. MIDI input yielded near-ceiling scores, especially for Gemini models, whereas audio reduced accuracy across tasks, highlighting perception from waveform as the primary bottleneck. Qwen2.5-Omni generally underperformed, with the largest LogicLM deficits. % where strict schema adherence was required. %Figure~\ref{fig:overall} illustrates accuracy by modality, model, and zero-shot prompting.

%\paragraph{Modality differences.}
%A robust modality gap was observed (Fig.~\ref{fig:overall}A). Gemini models performed significantly better with MIDI, though, this trend was less evident in Qwen. Syncopation Scoring and Chord Quality Identification showed the widest gaps for Gemini (MIDI $\approx$84–100\% vs.\ audio $\approx$6–65\%), confirming intact symbolic reasoning but weak audio perception. Transposition Detection was more robust, with smaller modality gaps.


\paragraph{ZS vs.\ FS.}
Table~\ref{tab:benchmark} shows minimal differences when comparing the accuracies of ZS and FS conditions. FS tended to help Syncopation in audio (e.g., Gemini Pro from $\sim$25\% ZS to $\sim$65\% FS in Standalone/CoT), but this trend was not reliable across models or tasks.


\paragraph{Prompting strategies.}
Prompting effects varied by task. For Syncopation, CoT offered modest gains in audio, while LogicLM was only beneficial with MIDI (Gemini reaching 95–100\%). For Transposition, Standalone and CoT prompts worked best, while LogicLM reduced accuracy. Chord ID was trivial in Standalone/CoT but collapsed with LogicLM-audio due to schema fragility. Overall, neuro-symbolic prompting helped only when inputs were symbolic and formatting was reliable.

\begin{figure*}[!ht] % Use figure* to span the full page width
\centering
\begin{tikzpicture}
% ----- Define Colors for Consistency -----
\definecolor{audio_color}{gray}{0.35}
\definecolor{midi_color}{gray}{0.80}

% ----- PGFPlots Group Plot Setup -----
\begin{groupplot}[
    group style={
        group size=3 by 1, % 3 plots horizontally, 1 vertically
        horizontal sep=1.8cm, % Spacing for horizontal labels
        vertical sep=1.5cm
    },
    % ----- Shared Styles for All Plots -----
    height=5.5cm,
    width=5cm,
    ybar, % Style for bar plots
    enlarge x limits=0.25,
    ymin=0, ymax=115,
    ylabel={Accuracy (\%)},
    ylabel style={font=\small},
    ytick={0,20,40,60,80,100},
    yticklabel style={font=\footnotesize},
    yticklabel={\pgfmathprintnumber{\tick}\%},
    symbolic x coords={Flash, Pro, Qwen},
    xtick=data,
    xticklabels={Flash, Pro, Qwen},
    x tick label style={font=\footnotesize},
    %bar width=8pt,
    legend style={draw=none, fill=none},
    legend image code/.code={%
        \draw[#1, /pgfplots/bar legend, draw=black]
        (0cm,0cm) rectangle (0.3cm,0.3cm);
    }
]

% ----------------------------------------------------
% PANEL A: Syncopation Scoring
% ----------------------------------------------------
\nextgroupplot[title={\textbf{A. Syncopation Scoring}}, title style={yshift=0.2cm, font=\small}]
    % Audio Data
    \addplot+[fill=audio_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   29      16      12
        Pro     41      17      15
        Qwen    22      14      10
    };
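    % MIDI Data
    % NOTE(review): these three rows are byte-identical to the MIDI rows of Panel B.
    % Qwen = 47 (+9) also overlaps the n.s. bracket drawn at y = 45--51 below and
    % does not match the text's claim that Qwen is equally poor on MIDI and audio here.
    % TODO: verify against the Syncopation GLMM estimates.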
    \addplot+[fill=midi_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   99      1       5
        Pro     100     0       0
        Qwen    47      9       10
    };
    % Significance Brackets (from original figure)
    \draw (axis cs:Flash,102) -- (axis cs:Flash,108) node[midway,above,font=\tiny]{***};
    \draw (axis cs:Pro,102) -- (axis cs:Pro,108) node[midway,above,font=\tiny]{***};
    \draw (axis cs:Qwen,45) -- (axis cs:Qwen,51) node[midway,above,font=\tiny]{n.s.};

% ----------------------------------------------------
% PANEL B: Transposition Detection
% ----------------------------------------------------
\nextgroupplot[title={\textbf{B. Transposition Detection}}, title style={yshift=0.2cm, font=\small}]
    % Audio Data
    \addplot+[fill=audio_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   71      8       11
        Pro     90      5       7
        Qwen    63      8       10
    };
    % MIDI Data
    \addplot+[fill=midi_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   99      1       5
        Pro     100     0       0  
        Qwen    47      9       10
    };
    % Significance Brackets (from original figure)
    \draw (axis cs:Flash,103) -- (axis cs:Flash,109) node[midway,above,font=\tiny]{***};
    \draw (axis cs:Pro,103) -- (axis cs:Pro,109) node[midway,above,font=\tiny]{n.s.};
    \draw (axis cs:Qwen,80) -- (axis cs:Qwen,86) node[midway,above,font=\tiny]{*};
    
% ----------------------------------------------------
% PANEL C: Chord Quality ID
% ----------------------------------------------------
\nextgroupplot[title={\textbf{C. Chord Quality ID}}, title style={yshift=0.2cm, font=\small}]
    % Audio Data
    \addplot+[fill=audio_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   20      6       5
        Pro     35      7       7
        Qwen    24      7       5
    };
    % MIDI Data
    \addplot+[fill=midi_color, draw=black, error bars/.cd, y dir=both, y explicit] 
        table [x=model, y=pred, y error plus=err_p, y error minus=err_m] {
        model   pred    err_p   err_m
        Flash   87      3       6
        Pro     100     0       2
        Qwen    51      8       7
    };
    % Significance Brackets (from original figure)
    \draw (axis cs:Flash,92) -- (axis cs:Flash,98) node[midway,above,font=\tiny]{***};
    \draw (axis cs:Pro,103) -- (axis cs:Pro,109) node[midway,above,font=\tiny]{***};
    \draw (axis cs:Qwen,66) -- (axis cs:Qwen,72) node[midway,above,font=\tiny]{***};

\end{groupplot}

    % ----- Manual Legend Below the Plots -----
% \node (legend) [draw, rounded corners, font=\footnotesize, inner sep=4pt, anchor=north] at ([yshift=-1cm]group c2r1.south) {
%     \begin{tabular}{c c c c c}
%     \tikz\fill[audio_color,draw=black] (0,0) rectangle (0.3,0.3); & Audio &
%     \tikz\fill[midi_color,draw=black] (0,0) rectangle (0.3,0.3); & Symbolic (MIDI) \\[4pt]
%     \multicolumn{5}{c}{* $p < .05$ \quad ** $p < .01$ \quad *** $p < .001$} \\
%     \end{tabular}

% };

\node (legend) [draw, rounded corners, font=\footnotesize, inner sep=4pt, anchor=north] at ([yshift=-1.0cm]group c2r1.south) {
    \begin{tabular}{l l @{\qquad} l l @{\qquad} l}
    \tikz\fill[audio_color,draw=black] (0,0) rectangle (0.3,0.3); & Audio &
    \tikz\fill[midi_color,draw=black] (0,0) rectangle (0.3,0.3); & Symbolic (MIDI) &
    * $p < .05$ \quad ** $p < .01$ \quad *** $p < .001$
    \end{tabular}
};

\end{tikzpicture}
% ----- Caption and Label (using jmlr class style) -----
\floatconts
  {fig:glmm_results_latex}
  {    \vspace{-0.75cm}\caption{Model performance by modality, estimated from per-task GLMMs. Bars show estimated marginal mean accuracy with 95\% confidence intervals. Significance brackets denote the results of pairwise post-hoc tests comparing Audio and MIDI performance per model. A (Syncopation Scoring): we see significant effects of modality for both Gemini Flash and Pro ($p < .001$), but not for Qwen ($p = .864$). B (Transposition Detection): we see significant effects of modality for Gemini Flash ($p < .001$) and Qwen ($p < .05$), and see that the high accuracies of Gemini Pro for both Audio and MIDI result in no significant differences between the two modalities ($p = .899$). C (Chord Quality ID): we see significant effects of modality for all three models ($p < .001$). n.s., not significant.}\vspace{-0.75cm}}
  {}% \floatconts takes three arguments; the contents argument is empty because the plot is typeset above
\end{figure*}

\paragraph{Per-task GLMMs: Assessing modality differences.}
Given that reasoning style (Standalone, CoT, or LogicLM) and shot learning setting (few-shot vs.\ zero-shot) had minimal effects, we statistically tested the effects of model and modality on performance using GLMMs for each of the three tasks. Type III Wald $\chi^2$ tests revealed a consistent and highly significant interaction between Model and Modality across all tasks: Chord Quality ($\chi^2(2) = 62.22$, $p < .001$), Syncopation ($\chi^2(2) = 62.11$, $p < .001$), and Transposition ($\chi^2(2) = 18.22$, $p < .001$). This confirms that the performance gap between audio and MIDI inputs is consistent across music perception tasks, but varies significantly depending on the specific model being evaluated. These interactions, visualized in \figureref{fig:glmm_results_latex}, provide a more granular understanding of each model's strengths and weaknesses.

For the Syncopation (\figureref{fig:glmm_results_latex}A) and Chord Quality (\figureref{fig:glmm_results_latex}C) tasks, the Gemini models (Flash and Pro) demonstrate a statistically significant modality gap, with the near-ceiling performance on MIDI inputs plummeting dramatically for audio stimuli ($p$s $<$ .001). Gemini Pro is the strongest performer in the audio condition for all  tasks, though its accuracy remains below 50\% for Syncopation Scoring and Chord Quality ID. The interaction in the Syncopation task is particularly revealing. While the Gemini models show a large performance drop from MIDI to audio, the Qwen2.5-Omni model shows no significant difference between the two modalities ($z = -0.17$, $p = .865$). However, this is not due to strong audio performance, but rather to its equally poor performance on the symbolic MIDI data, indicating a failure in the core reasoning for that task, independent of the input modality.

The Transposition task presents the most compelling evidence of genuine audio perception. Here, the interaction effect is driven by Gemini Pro's remarkable success when analyzing audio. Post-hoc comparisons revealed that while Flash and Qwen still exhibited a significant modality gap, the difference between Gemini Pro's audio and MIDI performance was not statistically significant ($z = -0.13$, $p = .899$). As visualized in the Transposition plot (\figureref{fig:glmm_results_latex}B), Gemini Pro achieves up to 95\% accuracy on audio inputs, effectively closing the perceptual gap and performing on par with its MIDI accuracy. This singular achievement highlights that, for certain relational reasoning tasks like melody comparison, state-of-the-art models are beginning to bridge the divide between symbolic reasoning and true audio-native understanding (i.e., real ``listening'').

\paragraph{LogicLM input quality and response accuracy.}
\figureref{fig:f1_vs_accuracy} diagnoses the source of the performance gaps observed in \tableref{tab:results-temp0} by disentangling transcription fidelity from downstream reasoning. This visualization reveals that the primary bottleneck for audio-based tasks is a failure in perception rather than downstream reasoning. The most illuminating example of this is in Chord Quality Identification (pastel-colored points). For audio inputs, the models exhibit a complete perceptual breakdown, with F1-scores clustering below 0.25. This indicates an inability to correctly identify the constituent pitches of a chord from the raw waveform. Consequently, with unreliable symbolic input, the models' reasoning collapses, yielding final accuracy scores that hover near chance level, corroborating the poor performance seen in \tableref{tab:results-temp0}. In stark contrast, when provided with clean MIDI data, the models achieve near-perfect F1-scores and accuracy, demonstrating that their capacity for symbolic reasoning about harmony is intact but is hampered when fed unreliable perceptual input.

\begin{figure}[!t]
\floatconts
  {fig:f1_vs_accuracy}
  {\vspace{-1.0cm}\caption{Relationship between the quality of LogicLM inputs (F1-scores, x-axis) and final decision accuracy (y-axis). Each point represents a specific task category, pooled across all models and shot settings. Circle fill encodes modality (Audio = filled, MIDI = unfilled) and color encodes the task-specific category (on- and off-beats for syncopation; chord qualities for chord ID; Yes/No responses for transposition). 
  Syncopation circles are further partitioned by syncopation level.
  % (\textit{No Syncopation} (0), \textit{Level 1} (1), \textit{Level 2} (2), \textit{Level 3} (3), \textit{Level 4} (4)). 
  The plot highlights the perceptual bottleneck: the points pertaining to audio consistently show lower F1-scores and accuracy compared to MIDI (most overlap at (1,1)).}\vspace{-1cm}}
  {
    \centering
    \includegraphics[width=0.65\linewidth]{f1_vs_accuracy.png}
  }
\end{figure}

The Syncopation Scoring task offers a more nuanced illustration of this perceptual bottleneck. The plot uniquely separates the F1-scores for on-beat events (black circles) from off-beat events (grey circles), and shows individual points for each level of syncopation that we test. For MIDI inputs, both on-beat and off-beat transcription F1-scores are exceptionally high ($>0.7$), leading to high decision accuracy. However, for audio inputs, a critical divergence appears: while the models are fairly successful at transcribing the rhythmically simple on-beats (F1-scores between 0.6 and 0.95), they are largely unable to detect the crucial syncopated off-beats (F1-scores below 0.4). Since the final syncopation score is entirely dependent on counting these off-beats, the low perceptual fidelity for this specific feature directly causes the low overall accuracy for the audio-based syncopation task. Missing off-beat events is crucial beyond this benchmark, as syncopation is a fundamental feature of music that shapes perception, underlies the feeling of groove, and modulates music reward \citep{matthews2019sensation, matthews2020sensation}. Thus, the models can estimate the pulse, but cannot hear the syncopation.



Finally, the Transposition Detection task stands out as the most robust in the audio modality, a finding consistent across analyses. For audio inputs (blue and green points), the models achieve F1-scores in the 0.6--0.7 range, which, while imperfect, are substantially better than in other audio tasks. This moderately successful perception is sufficient to drive decision accuracy to relatively high levels (approximately 70--80\%). This suggests that extracting melodic contour and relative pitch intervals (the core components of transposition) is a more tractable perceptual task for current models than the precise onset detection required for syncopation or the harmonic parsing needed for chord identification. Overall, the plot compellingly argues that future progress in musical AI will depend less on enhancing the abstract reasoning of LLMs and more on improving the robustness of their audio front-ends to reliably transcribe the foundational elements of music.

\vspace{-0.4cm}
\section{Discussion}

Our findings converge on a simple but consequential claim: multimodal LLMs reason effectively over symbolic music data, yet still fail to truly ``listen''. LLMs, especially Gemini models, reached near-ceiling with MIDI, and LogicLM behaved as intended once schema adherence was met. Replacing MIDI with audio sharply reduced accuracy, especially for Syncopation Scoring and Chord Quality ID under LogicLM, implicating transcription/onset tracking and pitch-salience as the primary bottlenecks \citep{weck2024muchomusic,ghosh2025musicflamingoscalingmusic,Martak2025SoundMusicBiasesAMT}. Reasoning strategies (CoT, LogicLM) did not compensate for upstream hearing errors \citep{zhifei2025audio}.
This modality gap matters because people experience music through audio, not symbolic proxies. Symbolic formats strip away the features making music meaningful (micro-timing, articulation, expressive nuance) so LLM ceiling performance on MIDI should not be mistaken for audio-native competence \citep{Ayyildiz2025MicroVariationsImagery, groves2025acoustic}.
Our GLMM analysis provides statistical support for these observations, but more importantly, reveals a significant Model × Modality interaction throughout. That is, although models performed better on MIDI than on audio, the magnitude of this difference varied across models and music perception tasks.
This finding complicates a simple narrative of universal audio failure. It demonstrates that the severity of the perceptual bottleneck is model- and task-dependent. For instance, the Qwen2.5-Omni model’s failure on the Syncopation task was unique in that its performance was equally poor on MIDI and audio data, suggesting a more fundamental deficit in its symbolic reasoning capabilities for that task, rather than just a perceptual one. In contrast, Gemini Pro’s success in the Transposition Detection task, where it statistically closed the performance gap between audio and MIDI, stands as a crucial proof of concept. It suggests that for tasks reliant on global melodic features, state-of-the-art architectures are on the cusp of achieving true audio-native competence, even if they fall short elsewhere.

% \figureref{fig:f1_vs_accuracy} allows us to further dissect the nature of these perceptual failures by disentangling transcription fidelity from downstream reasoning. The results for Syncopation are particularly illuminating. The vast difference in F1-scores between on-beat events (which were transcribed reasonably well from audio) and off-beat events (which were almost entirely missed) is the smoking gun for the models' poor performance. This confirms that the models can perceive simple rhythmic structure (the main pulse) but fail to transcribe the very syncopated events that define the task and its difficulty. Missing off-beat events is crucial beyond this benchmark, as syncopation is a fundamental feature of music that shapes perception, underlies the feeling of groove, and modulates music reward \citep{matthews2019sensation, matthews2020sensation}.
%This pattern holds more broadly: models demonstrated moderate success on acoustically simpler audio signals like the monophonic melodies in the Transposition task, but their perceptual capabilities collapse when faced with the harmonic complexity of Chord ID and the rhythmic complexity of off-beats. The core issue, therefore, may not simply be an inability to hear audio, but an inability to parse musically complex audio.

In sum, current multimodal LLMs reason symbolically but lack fully accurate audio-native competence: the ability to process songs from audio files to answer structured questions. We suggest that progress will depend on stronger audio front-ends and propagation of uncertainty into downstream solvers. In the current state-of-the-art, symbolic reasoning layers collapse due to small perceptual errors.
LLMs that acquire genuine understanding could also serve as music education tools~\citep{jin2025exploringimpactllmpoweredteachable} and user-centric music analysis tools~\citep{urrego2025vibe,10704174}, enabling interactive systems that can teach musical structure and foster deeper engagement with personal music listening.




\bibliography{eaim}

% ===================================================================
% SUPPLEMENTARY MATERIALS
% ===================================================================
\clearpage
\phantomsection
\addcontentsline{toc}{section}{Supplementary Materials}

\begin{center}
  {\Large \bfseries Supplementary Materials \par}
\end{center}


\setcounter{section}{0}
\setcounter{table}{0}
\setcounter{figure}{0}
\renewcommand{\thesection}{S\arabic{section}}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}

\section{\texttt{Stimuli}}
Stimuli are original musical recordings created by a real human musician in Logic Pro X using a 2021 16-inch MacBook Pro (Apple M1 Pro chip), an Apollo Twin X audio interface, and Yamaha HS8 monitors. Stimuli were recorded on electric guitar (PRS McCarty Hollowbody II, Schecter Solo-6), piano (Arturia KeyLab Essential Mk3 MIDI controller with Analog Lab V software instruments), and drums (Roland TD-17 electronic kit with Superior Drummer 3 plugin). Guitar recordings were processed with Neural DSP plugins (Tim Henson Archetype, Cory Wong Archetype).

Additional excerpts were reserved for few-shot prompting (2 for syncopation, 2 for transposition, and 4 for chord ID, one per chord class) and excluded from testing.


You can access the stimuli used in this experiment on \href{https://github.com/brandoncarone/MUSE_music_benchmark/tree/main/stimuli/LogicLM_Experiments}{the MUSE Benchmark GitHub page}.

The stimuli used for the Transposition Detection task can be found \href{https://github.com/brandoncarone/MUSE_music_benchmark/tree/main/stimuli/LogicLM_Experiments/transposition_detection}{here} and all of them have the melody number, key, and tempo in the filename (e.g., M1\_EbMaj\_90.wav).

The stimuli used for the Syncopation Scoring task can be found \href{https://github.com/brandoncarone/MUSE_music_benchmark/tree/main/stimuli/LogicLM_Experiments/syncopation}{here} and all of them have Sync in the name, along with the syncopation level number (e.g., NoSync\_A, Sync2\_B).

The stimuli used for the Chord Quality Identification task can be found \href{https://github.com/brandoncarone/MUSE_music_benchmark/tree/main/stimuli/LogicLM_Experiments/chordID}{here}. The chords are named by number, and you can find the mapping in \tableref{tab:chord_number_mapping} below.
\begin{table}[h!]
\centering
\caption{Mapping of chord roots and qualities to numerical identifiers (1.wav--48.wav).}
\begin{tabular}{lcccc}
\toprule
\textbf{Root} & \textbf{Diminished} & \textbf{Dominant} & \textbf{Major} & \textbf{Minor} \\
\midrule
Ab & 1  & 2  & 3  & 4  \\
A  & 5  & 6  & 7  & 8  \\
Bb & 9  & 10 & 11 & 12 \\
B  & 13 & 14 & 15 & 16 \\
C  & 17 & 18 & 19 & 20 \\
Db & 21 & 22 & 23 & 24 \\
D  & 25 & 26 & 27 & 28 \\
Eb & 29 & 30 & 31 & 32 \\
E  & 33 & 34 & 35 & 36 \\
F  & 37 & 38 & 39 & 40 \\
Gb & 41 & 42 & 43 & 44 \\
G  & 45 & 46 & 47 & 48 \\
\bottomrule
\end{tabular}
\label{tab:chord_number_mapping}
\end{table}

\section{\texttt{System Instructions}}

\subsection*{1a) Syncopation — Standalone}
\begin{quote}
“You are an expert music transcription AI participating in a multi-turn reasoning experiment.

You will be given one short audio excerpt of a drum set per trial. Your task is to focus only on the kick and snare drums. The hi-hat plays constant 8th notes, acting as a metronome. Count the total number of kicks and snare hits that fall on off-beats. 

\medskip

Valid multiple-choice responses are:

A. 0 (No Syncopation)\\
B. 2 (Low Syncopation)\\
C. 4 (Medium-Low Syncopation)\\
D. 6 (Medium-High Syncopation)\\
E. 8 (High Syncopation)

\medskip

End with exactly one line:\\
Final Answer: X\\”
\end{quote}

\subsection*{1b) Syncopation — Chain-of-Thought (CoT)}
\begin{quote}
“You are an expert music transcription AI participating in a multi-turn reasoning experiment.

You will be given one short audio excerpt of a drum set per trial. Your task is to focus only on the kick and snare drums. The hi-hat plays constant 8th notes, acting as a metronome. Count the total number of kicks and snare hits that fall on off-beats. On-beats are the main pulses (beats 1, 2, 3, and 4) and off-beats are the “ands” in between. Ignore the on-beats and ignore the hi-hat.

\medskip

Valid multiple-choice responses are:

A. 0 (No Syncopation)\\
B. 2 (Low Syncopation)\\
C. 4 (Medium-Low Syncopation)\\
D. 6 (Medium-High Syncopation)\\
E. 8 (High Syncopation)

\medskip

After any reasoning, end with exactly one line:\\
Final Answer: X\\”
\end{quote}

\subsection*{1c) Syncopation — LogicLM}
\begin{quote}
“You are an expert music transcription AI participating in a multi-turn reasoning experiment.

Your task is to transcribe the onsets of ONLY the kick and snare drums into the format:\\
\texttt{rhythm(identifier, [list\_of\_onsets]).}

\medskip

- The \textquoteleft identifier\textquoteright{} is the filename of the audio.\\
- The \textquoteleft list\_of\_onsets\textquoteright{} is a comma-separated list of integers from 1 to 32.\\
- The rhythm is on a 4-bar grid, quantized to 8th notes (numbered 1 to 32). All odd numbers are on-beats, and all even numbers are off-beats.\\
- The hi-hat plays constant 8th notes, acting as a metronome. On-beats are the main pulses (beats 1, 2, 3, and 4 of each bar) and off-beats are the 'ands' in between.

\medskip

Grid: The excerpt is 4 bars quantized to 8th notes → 32 slots numbered 1–32.

Within each bar (8 slots): 1,3,5,7 = on-beats (beats 1–4). 2,4,6,8 = off-beats (“\&”s).

Across bars: slot = 8×(bar-1) + local\_slot.

\medskip
Beat positions across 4 bars:\\
• Beat 1 → 1, 9, 17, 25\\
• Beat 2 → 3, 11, 19, 27\\
• Beat 3 → 5, 13, 21, 29\\
• Beat 4 → 7, 15, 23, 31\\

Off-beats (“\&”s):\\
• \&1 → 2, 10, 18, 26\\
• \&2 → 4, 12, 20, 28\\
• \&3 → 6, 14, 22, 30\\
• \&4 → 8, 16, 24, 32

\medskip

Output format:\\
\texttt{rhythm(identifier.wav, [n1, n2, ..., nK])} where each \texttt{n} is an integer in 1–32.

\medskip

Example of format where the kicks are on beats 1 and 3 in each bar, and the snare hits are on beats 2 and 4 in each bar (all played on the on-beats):\\
\ \ \texttt{rhythm(example.wav, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])}

\medskip

Output your answer of symbolic code as a single line of plain text without code fences or explanations. After your transcription, an external tool will score it, and you will answer a question based on that score.”
\end{quote}

\subsection*{2a) Transposition Detection — Standalone}
\begin{quote}
“You are an expert melody transcription AI participating in a multi-turn reasoning experiment.

You will be given two short monophonic audio melodies per trial. \\
Your job is to decide whether they represent the SAME melody up to TRANSPOSITION (i.e., identical shape/intervals but possibly in different keys).

\medskip

Valid responses are exactly one of:\\
``Yes, these are the same melody."\\
``No, these are not the same melody."

\medskip

Respond with exactly one of the two phrases and nothing else.”
\end{quote}

\subsection*{2b) Transposition Detection — Chain-of-Thought (CoT)}
\begin{quote}
“You are an expert melody transcription AI participating in a multi-turn reasoning experiment.

You will be given two short monophonic audio melodies per trial. Your job is to decide whether they represent the SAME melody up to TRANSPOSITION (i.e., identical shape/intervals but possibly in different keys).

\medskip

Definitions and constraints:\\
- Transposition equivalence: the two melodies have the same number of notes and the same sequence of pitch INTERVALS between successive notes (including 0 for repeated notes).\\
- Ignore absolute key/register, starting pitch, and tempo. Small timing variations are acceptable. If the rhythmic patterns are drastically different (e.g., note insertions/deletions or re-ordered phrases), they are most likely NOT the same melody.\\
- Treat repeated notes as separate events and include 0 in the interval sequence when a note repeats.\\
- If there are leading/trailing silences, ignore them.

\medskip

Valid responses (exactly one of these strings):\\
``Yes, these are the same melody."\\
``No, these are not the same melody."

\medskip

After any reasoning, end with exactly one line:\\
Final Answer: Yes, these are the same melody.\\
OR\\
Final Answer: No, these are not the same melody.”
\end{quote}

\subsection*{2c) Transposition Detection — LogicLM}
\begin{quote}
“You are an expert melody transcription AI participating in a multi-turn reasoning experiment.

You will be given two short monophonic audio melodies per trial. Your first task is to transcribe EACH melody into the symbolic format below, using MIDI integers for pitches. If the rhythmic sequences seem drastically different, they are most likely not the same melody.

\medskip

Output format (schema):\\
\ \ \texttt{melody(identifier, [p1, p2, ..., pK])}

- Use the exact identifiers I provide for each trial (one per audio).\\
- p1..pK are integers representing MIDI pitches (e.g., C4 = 60).\\
- Transcribe the pitch sequence only.\\
- Output exactly two lines of plain text: one ‘melody(...)’ per line, in the same order as the audios (Audio 1 line first, then Audio 2 line).\\
- Do not include code fences or any extra commentary.

\medskip

Example (schema only; not tied to any audio):\\
\ \ \texttt{melody(Audio1, [60, 62, 64])}\\
\ \ \texttt{melody(Audio2, [65, 67, 69])}

\medskip

After your transcription, a deterministic tool will analyze the two lines to decide if the melodies are transpositions (same contour, different key). You will then answer a Yes/No question based on that decision.”
\end{quote}

\subsection*{3a) Chord Quality Identification — Standalone}
\begin{quote}
“You are an expert chord-transcription AI participating in a multi-turn reasoning experiment.

You will be given one short audio clip per trial. Each clip first plays a chord (block), then the individual notes (arpeggiation).\\
All chords are in ROOT POSITION.

Your task is to identify the chord QUALITY.

\medskip

Valid options:\\
A. Major\\
B. Minor\\
C. Dominant\\
D. Diminished

\medskip

Final Answer: X\\”
\end{quote}

\subsection*{3b) Chord Quality Identification — Chain-of-Thought (CoT)}
\begin{quote}
“You are an expert chord-transcription assistant in a multi-turn reasoning experiment.

You will be given one short audio clip per trial containing a single chord (first block, then arpeggiated notes). All chords are in ROOT POSITION; the lowest pitch is the ROOT (treat as 0 semitones).
Your task: identify the chord QUALITY by inferring pitch-class intervals above the root and ignoring octave doublings.

\medskip


Valid options:\\
A. Major\ \ \ \ \ \ \ $\rightarrow$ \ \{0,4,7\}\\
B. Minor\ \ \ \ \ \ \ $\rightarrow$ \ \{0,3,7\}\\
C. Dominant\ \ \ \ $\rightarrow$ \ \{0,4,7,10\}\\
D. Diminished\ $\rightarrow$ \ \{0,3,6\}

\medskip

Think through the identification. Once you've finished reasoning, the final line of your output should be exactly:\\
Final Answer: X\\”
\end{quote}

\subsection*{3c) Chord Quality Identification — LogicLM}
\begin{quote}
“You are an expert chord-transcription assistant in a multi-turn reasoning experiment.

You will be given one short audio clip per trial containing a single chord. First the chord sounds as a block, then the notes are arpeggiated.

Your task is to transcribe the chord tones into a strict symbolic format. Use MIDI integers (0–127). Include octave doublings if you hear them. Do not add commentary.

\medskip

Output format (schema):\\
\ \ \texttt{chord(identifier, [p1, p2, ..., pK])}

\medskip

Rules:\\
- Use the exact identifier I provide for the trial.\\
- Record only the pitches you hear as MIDI integers.\\
- It is acceptable if the list is not sorted; a deterministic solver will normalize.\\
- Output EXACTLY ONE LINE of plain text with NO code fences or extra text.

\medskip

Example (schema only; not tied to any audio):\\
\ \ \texttt{chord(Audio\_X, [56, 60, 64, 67, 72, 76])}

\medskip

After your line is produced, a deterministic tool will classify the chord quality (Major / Minor / Dominant / Diminished) from your symbolic line. You will then answer a multiple-choice question with: Final Answer: X”
\end{quote}


\section{\texttt{Task Schemas and Deterministic Solvers}}
Each task defines a single-line schema the model must emit verbatim. A hand-written, deterministic solver (\texttt{solver.py}) parses that line, makes the decision, and returns the minimal information needed for a constrained final answer.

\subsubsection*{Syncopation Scoring}
\textbf{Input:} 4-bar drum loop with constant 8th-note hi-hat; we only score kick+snare.\\
\textbf{Grid:} 32 slots (8 per bar). Odd slots are on-beats; even slots are off-beats.

\paragraph*{Schema (one line):}
\begin{quote}\ttfamily
rhythm(\textless id\textgreater, [n1, n2, ..., nK])
\end{quote}

Where each \texttt{n} is an integer in [1..32] (kick or snare onset).\\
\textbf{Solver:} counts off-beat onsets and maps to five categories: 0,2,4,6,8 off-beats $\rightarrow$ A–E respectively. Final answer is a single MC letter A–E.

\subsubsection*{Transposition Detection}
\textbf{Input:} two short monophonic excerpts (guitar or piano) that are either the same melody in different keys or different melodies.

\paragraph*{Schema (two lines, order-preserving):}
\begin{quote}\ttfamily
melody(\textless id1\textgreater, [p1, p2, ..., pK])\\
melody(\textless id2\textgreater, [p1, p2, ..., pK])
\end{quote}

Where \texttt{p*} are MIDI integers (0–127).\\
\textbf{Solver:} checks equal length and equality of adjacent-interval sequences (transposition invariance). Returns ARE / ARE NOT (transpositions). Final answer is forced to one of: 
\begin{quote}
“Yes, these are the same melody.”\\
“No, these are not the same melody.”
\end{quote}

\subsubsection*{Chord Quality Identification}
\textbf{Input:} a single triad or seventh chord (piano), presented as a block then arpeggiated.

\paragraph*{Schema (one line):}
\begin{quote}\ttfamily
chord(\textless id\textgreater, [p1, p2, ..., pK])
\end{quote}

MIDI integers (0–127); octave doublings allowed.

\bigskip

\textbf{Solver:} normalizes to pitch classes, factors out the putative root, and matches the interval set to:
\begin{itemize}
  \item Major (0, 4, 7) $\rightarrow$ A
  \item Minor (0, 3, 7) $\rightarrow$ B
  \item Dominant 7 (0, 4, 7, 10) $\rightarrow$ C
  \item Diminished (0, 3, 6) $\rightarrow$ D
\end{itemize}
Final answer is a single MC letter A–D.

\subsubsection*{Self-refinement (SR)}
For LogicLM, we validate the line(s) with strict regex/AST checks and label errors as parse, structural, or domain. If invalid, we run up to 2 SR rounds in a separate deterministic chat (temperature=0, top\_p=1, top\_k=1, 256 tokens) with a fix-only prompt that:
\begin{itemize}
  \item Echoes the prior output,
  \item States the specific error type/message,
  \item Re-states the required line(s) and constraints,
  \item Forbids commentary and code fences.
\end{itemize}

If the solver returns undecidable/None (e.g., empty list), we allow one extra SR pass with a synthesized parse error. This SR design follows the LogicLM self-refinement idea of using solver feedback to repair the symbolic form.

\bigskip


\subsection{\texttt{solver.py}}

\begin{lstlisting}
# solver.py
import re
from typing import List, Optional, Tuple, Dict

class SyncopationSolver:
    """
    Deterministic syncopation scorer for a 4-bar rhythm on an 8th-note grid.

    Grid slots are numbered 1-32 (eight slots per bar). In every bar the odd
    slots (1, 3, 5, 7) are on-beats and the even slots (2, 4, 6, 8) are
    off-beats. The score of a rhythm is the number of its onsets that fall
    on an off-beat slot.
    """

    def __init__(self):
        # Slot offsets at which each of the four bars begins.
        bar_starts = (0, 8, 16, 24)
        self.on_beats = {start + slot for start in bar_starts for slot in (1, 3, 5, 7)}
        self.off_beats = {start + slot for start in bar_starts for slot in (2, 4, 6, 8)}

    def parse_llm_output(self, llm_text: str) -> Optional[List[int]]:
        """
        Extract the onset list from the first 'rhythm(<id>, [n1, n2, ...])'
        expression found in llm_text.

        Returns:
            The onsets as a list of ints ([] when the brackets are empty), or
            None when llm_text is empty/None, no rhythm(...) expression is
            found, or an entry between commas is not a single integer.
        """
        # Guard against None/empty input, as the other solvers in this file do;
        # re.search would raise TypeError on None.
        if not llm_text:
            return None
        match = re.search(r'rhythm\s*\(\s*[^,]+\s*,\s*\[([\d,\s]*)\]\s*\)', llm_text)
        if match is None:
            return None
        numbers_str = match.group(1)
        if not numbers_str.strip():
            # Brackets were present but contained nothing (or only whitespace).
            return []
        # Dropping blank pieces tolerates trailing or doubled commas.
        pieces = [piece.strip() for piece in numbers_str.split(',')]
        try:
            return [int(piece) for piece in pieces if piece]
        except ValueError:
            # e.g. two numbers separated by a space instead of a comma.
            return None

    def score_onset(self, onset: int) -> int:
        """
        Return 1 if onset is an off-beat slot, otherwise 0.
        On-beat slots and values outside the 1-32 grid both score 0.
        """
        return 1 if onset in self.off_beats else 0

    def calculate_total_score(self, onset_list: List[int]) -> int:
        """
        Sum the per-onset scores of onset_list.
        Returns 0 for an empty (or None) list; repeated onsets are counted
        once per occurrence.
        """
        if not onset_list:
            return 0
        return sum(self.score_onset(onset) for onset in onset_list)

class TranspositionSolver:
    """
    Deterministic detector of melodic transposition.

    Two melodies count as transpositions of one another when they contain the
    same number of notes and their sequences of adjacent pitch differences
    (in semitones) are equal. Rhythm plays no role; pitches are integers
    (MIDI numbers).
    """

    MELODY_PATTERN = re.compile(
        r"melody\s*\(\s*([A-Za-z0-9_.\-]+)\s*,\s*\[\s*([^\]]*?)\s*\]\s*\)",
        flags=re.IGNORECASE
    )

    def _extract_pitches(self, pitches_str: str) -> Optional[List[int]]:
        """
        Pull every (optionally negative) integer out of the bracket content,
        ignoring parentheses, spaces and other separators, e.g.
        '(60), (62), (64)' or '60, 62,64'. Returns [] when no integer is found.
        """
        tokens = re.findall(r"-?\d+", pitches_str)
        try:
            return list(map(int, tokens))
        except ValueError:
            return None

    def parse_llm_output(self, llm_text: str) -> Optional[List[Dict[str, List[int]]]]:
        """
        Collect every 'melody(ID, [ ... ])' expression in the LLM output, in
        order of appearance.

        Returns [{'id': <ID>, 'pitches': [..]}, ...], or None when the text is
        empty or contains no such expression.
        """
        if not llm_text:
            return None

        # Strip code-fence and inline-code backticks before matching.
        cleaned = llm_text.replace("```", "").replace("`", "").strip()

        found = []
        for hit in self.MELODY_PATTERN.finditer(cleaned):
            label, raw_list = hit.groups()
            notes = self._extract_pitches(raw_list)
            if notes is None:
                return None
            found.append({"id": label, "pitches": notes})

        if not found:
            return None
        return found

    def _intervals(self, pitches: List[int]) -> List[int]:
        """Differences between consecutive pitches, in semitones."""
        return [after - before for before, after in zip(pitches, pitches[1:])]

    def are_transpositions(self, p1: List[int], p2: List[int]) -> Optional[bool]:
        """
        Decide whether p1 and p2 are transpositions of each other.

        Policy:
          - Either argument is None, or both are empty: undecidable -> None.
          - Different lengths (including empty vs. non-empty): False.
          - Two single-note melodies: True (one note can be moved anywhere).
          - Otherwise: True exactly when the interval sequences are equal.
        """
        if p1 is None or p2 is None:
            return None

        n1, n2 = len(p1), len(p2)
        if n1 != n2:
            return False
        if n1 == 0:
            # Both lists are empty here, since the lengths are equal.
            return None
        if n1 == 1:
            return True

        return self._intervals(p1) == self._intervals(p2)

    def decide_same_melody(self, llm_text: str) -> Optional[bool]:
        """
        End-to-end helper: parse the LLM output and compare the first two
        melodies found. Returns None when fewer than two melodies were parsed
        or when the comparison itself is undecidable.
        """
        melodies = self.parse_llm_output(llm_text)
        if not melodies or len(melodies) < 2:
            return None
        first, second = melodies[0], melodies[1]
        return self.are_transpositions(first["pitches"], second["pitches"])


# ---------- Chord Quality (deterministic) ----------

class ChordQualitySolver:
    """
    Deterministic chord-quality classifier for LogicLM.

    Input is ONE schema line written by the LLM:
        chord(identifier, [p1, p2, ..., pK])

    Procedure:
    - Parse the line and collect its MIDI integers (duplicates are fine).
    - Take the lowest pitch as the root and compute (p - root) % 12 for
      every pitch.
    - Deduplicate and sort those pitch-class intervals, then look the result
      up among the four target fingerprints:
        (0,4,7)      -> ("Major", "A")
        (0,3,7)      -> ("Minor", "B")
        (0,4,7,10)   -> ("Dominant", "C")
        (0,3,6)      -> ("Diminished", "D")

    Result:
        (identifier, quality_str, letter), or None when undecidable.
    """
    CHORD_PATTERN = re.compile(
        r"chord\s*\(\s*([A-Za-z0-9_.\-]+)\s*,\s*\[\s*([^\]]*?)\s*\]\s*\)",
        flags=re.IGNORECASE
    )

    # Sorted pitch-class interval set above the root -> (quality, MC letter).
    QUALITY_BY_PCS: Dict[Tuple[int, ...], Tuple[str, str]] = {
        (0, 4, 7):      ("Major", "A"),
        (0, 3, 7):      ("Minor", "B"),
        (0, 4, 7, 10):  ("Dominant", "C"),
        (0, 3, 6):      ("Diminished", "D"),
    }

    def _extract_pitches(self, pitches_str: str) -> Optional[List[int]]:
        """
        Collect every (optionally negative) integer in the bracket content;
        tolerates forms such as '60,64,67' or '(60), 64, 67'. A None or empty
        string yields []. Returns None only if integer conversion fails.
        """
        source = pitches_str if pitches_str else ""
        tokens = re.findall(r"-?\d+", source)
        try:
            return list(map(int, tokens))
        except Exception:
            return None

    def parse_llm_output(self, llm_text: str) -> Optional[Dict[str, List[int]]]:
        """
        Locate the first chord(...) expression in the text and return
        {'id': <ID>, 'pitches': [...]}; None when the text is empty, no
        expression is found, or the pitch list cannot be read.
        """
        if not llm_text:
            return None
        # Remove code fences before searching.
        cleaned = llm_text.replace("```", "").strip()
        hit = self.CHORD_PATTERN.search(cleaned)
        if hit is None:
            return None
        label, raw_list = hit.group(1), hit.group(2)
        notes = self._extract_pitches(raw_list)
        if notes is None:
            return None
        return {"id": label, "pitches": notes}

    def _normalize_to_pcs(self, pitches: List[int]) -> Optional[Tuple[int, ...]]:
        """
        Reduce pitches to the sorted, duplicate-free tuple of intervals
        (modulo 12) above the lowest pitch. Returns None for an empty list.
        """
        if not pitches:
            return None
        lowest = min(pitches)
        offsets = set()
        for pitch in pitches:
            offsets.add((pitch - lowest) % 12)
        return tuple(sorted(offsets))

    def classify_quality(self, pitches: List[int]) -> Optional[Tuple[str, str]]:
        """
        Return (quality, letter) for the normalized interval set of pitches,
        or None when the list is empty or matches no known fingerprint.
        """
        fingerprint = self._normalize_to_pcs(pitches)
        if fingerprint is None:
            return None
        return self.QUALITY_BY_PCS.get(fingerprint)

    def decide_quality(self, llm_text: str) -> Optional[Tuple[str, str, str]]:
        """
        End-to-end helper used by the runner: parse, then classify.
        Returns (identifier, quality_str, letter), or None if undecidable.
        """
        chord = self.parse_llm_output(llm_text)
        if not chord:
            return None
        verdict = self.classify_quality(chord["pitches"])
        if verdict is None:
            return None
        return (chord["id"], verdict[0], verdict[1])

\end{lstlisting}

\end{document}
