\newpage
\appendix

\setcounter{figure}{0} % Reset figure numbering
\renewcommand{\thefigure}{A\arabic{figure}} % Prefix with "A" for Appendix

\setcounter{table}{0} % Reset table numbering
\renewcommand{\thetable}{A\arabic{table}} % Prefix with "A" for Appendix

\section{Overview of all Pathological Datasets}

\begin{table}[ht]
\centering
 \resizebox{0.78\textwidth}{!}{
 \begin{tabular}{lllll}
 \hline
 Dataset Name & Citation & Mod. & Pathology & Images \\
 \hline
 Yale\_BM & \cite{ramakrishnan_large_2024} & MRI T1-ce & Brain metastasis & 25,563 \\
 BraTS\_MET & \cite{moawad2023brain} & MRI T1-ce & Brain metastasis & 30,430 \\
 BraTS\_GLI & \begin{tabular}[c]{@{}l@{}}\cite{baid2021rsna}\\ \cite{brats} \\ \cite{glioma}\end{tabular} & MRI T1-ce & Glioma & 163,066 \\
 MSD\_Liver & \cite{antonelli_medical_2022} & CT-ce & Liver tumor & 19,134 \\
 MSD\_Hep\_Vessel & \cite{antonelli_medical_2022} & CT-ce & Liver tumor & 13,013 \\
 KiTS23\_Kidney & \cite{heller2023kits21} & CT-ce & Kidney tumor & 32,909 \\
 \hline
 \end{tabular}
 }
 \caption{Overview of pathological datasets used in this work. The suffix ``-ce'' refers to contrast-enhanced MRI / CT.}
 \label{tab:dataset_summary}
 \end{table}

\section{Training Loss and Validation Scores for all Trained Models}

\begin{figure}[ht]
\centering
\includegraphics[width=0.9\textwidth]{figures/train_curve.png}
\caption{Training loss curves and validation AP plotted for the baseline and the two \ac{CL}-models. The x-axis denotes training steps and the vertical lines at the top of the graph denote the start of new epochs for each model.} \label{fig:train_curve}
\end{figure}

\newpage

\section{Data Distribution for Bounding Box Curriculum}

\begin{table}[ht]
\centering
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.5}
\resizebox{0.9\textwidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|c|}
\hline
\diagbox{Datasets}{Area Interval}  & {$[0,0.21)$} & {$[0.21,0.72)$} & {$[0.72,1.75)$}  & {$[1.75,3.49)$} & {$[3.49,100]$} & {total}\\
\hline
Yale\_BM & 2037 & 1596 & 881 & 509 & 167 & 5190\\
\hline
BraTS\_GLI & 3825 & 5186 & 7933 & 10346 & 10783 & 38073\\
\hline
BraTS\_MET & 2969 & 1768 & 1067 & 725 & 227 & 6756\\
\hline
MSD\_Liver  & 2897 & 1294 & 510 & 196 & 225 & 5122 \\
\hline
MSD\_Hep\_Vessel  & 717 & 917 & 758 & 383 & 504 & 3279   \\
\hline
KiTS23\_Kidney & 1200 & 2619 & 1934 & 1340 & 1477 & 8570\\
\hline
total  & 13645 & 13380 & 13083 & 13499 & 13383 & 66990\\
\hline
\end{tabular}
}
\caption{Distribution of image slices with ground truth annotations across datasets for the bounding box sorting approach: Based on the smallest bounding box present in a slice, the slice gets sorted into a particular difficulty interval. The intervals are defined by standardized area of the bounding box, with the smallest area intervals being the hardest category.}
\label{tab:areas}
\end{table}

\section{Data Distribution for Teacher Curriculum}

\begin{table}[!h]
\centering
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.5}
\resizebox{0.85\textwidth}{!}{
\begin{tabular}{|l|c|c|c|c|c|c|}
\hline
\diagbox{Datasets}{AP Interval} & {$[0,0.30)$} & {$[0.30,0.69)$} & {$[0.69,0.87)$}  & {$[0.87,0.9)$} & {$[0.9,1.0]$} & {total}\\
\hline
Yale\_BM & 878 & 1166 & 1875 & 1069 & 399 & 5387\\
\hline
BraTS\_GLI & 6245 & 5242 & 7198 & 10281 & 10437 & 39403\\
\hline
BraTS\_MET & 1232 & 1704 & 1993 & 1349 & 698 & 6976\\
\hline
MSD\_Liver  & 1181 & 2102 & 1337 & 531 & 166 & 5317 \\
\hline
MSD\_Hep\_Vessel  & 1024 & 863 & 894 & 509 & 235 & 3525   \\
\hline
KiTS23\_Kidney & 1676 & 1101 & 2160 & 2288 & 1854 & 9079\\
\hline
total  & 12236 & 12178 & 15457 & 16027 & 13789 & 69687\\
\hline
\end{tabular}
}
\caption{Distribution of image slices with ground truth annotations and false positive predictions across datasets for the teacher sorting approach: Based on the baseline inference performance, the slice gets sorted into a particular difficulty interval. The intervals are defined by \ac{AP} scores, with the lowest AP score intervals being the hardest category.}
\label{tab:areas_teacher}
\end{table}
\newpage


\section{Evolution of  the Sample Distribution During Training of the Teacher Curriculum Model}
\begin{figure}[ht]
\centering
\includegraphics[width=0.9\textwidth]{figures/areas.png}
\caption{The histogram shows how the sample distribution across difficulty categories evolves during Teacher CL training. We evaluate the distribution at three stages: (a) before training, (b) midway, after the model has encountered all categories at least once, and (c) after training is complete. As shown in the figure, the midway distribution shifts towards both extremes, reflecting ongoing learning. By the end of training, samples predominantly cluster in the ``Easy'' category.} \label{fig:areas}
\end{figure}



\newpage

\section{Evaluation Scores for Anti-Curriculum Approaches}


\input{tables/anti_results_table} %
\newpage

\section{Evaluation Scores of Non-Curricula Models Trained on One Modality Only} \label{app:one_mod_finetune}

\input{tables/ct_mri_only_finetune_results} %

\noindent\textbf{Finetuning Modality:} The finetuned models shown in \tableref{tab:results} are trained on multi-modal data from all pathological datasets, making it unclear how each modality contributes individually or whether their combination provides a clear advantage. \tableref{tab:results_modality} presents test scores for two models finetuned on a single modality without \ac{CL}. While results are inconclusive, they indicate that combining modalities is not detrimental. As expected, the \ac{CT} model performs poorly on \ac{MRI} datasets and vice versa. Moreover, performance on datasets of the same modality as the training data is comparable to that of fully finetuned models in \tableref{tab:results}. Specifically, the CT model ranks lowest on MSD\_Liver, second lowest on MSD\_Hep\_Vessel, and second best on KiTS23\_Kidney. Meanwhile, the MRI model ranks second best on Yale\_BM, ties for second best on BraTS\_GLI, and performs best on BraTS\_MET, compared to the finetuned models in \tableref{tab:results}.

\newpage

\section{Qualitative Comparison of Models Directly After Pretraining on Natural Images vs. Medical Images}\label{app:test_wo_finetune}

\begin{figure}[ht]
\centering
\includegraphics[width=0.68\textwidth]{figures/qualitative_pretrain.png}
\caption{A comparison of the top-three bounding box predictions from the natural image \ac{G-DINO} model and the \ac{G-DINO} model pretrained on TotalSegmentator (CT \& MRI) across four pathological examples. Neither model was finetuned on the pathological datasets. In instances where fewer than three distinct boxes appear, the same box was predicted multiple times within the top-three. The findings showcase that the natural image \ac{G-DINO} model typically predicts bounding boxes that encompass the entire region of the human body present in the slice, whereas the medically pretrained \ac{G-DINO} model sometimes even accurately identifies some tumors or detects the corresponding organ.}\label{fig:pretraining_qualitative}
\end{figure}

In this experiment, we test vanilla G-DINO (trained on natural images) and pretrained G-DINO (trained on multimodal medical images from the TotalSegmentator dataset) directly on the
pathological datasets without finetuning to compare their comprehension of pathologies. As expected, both models
have scores of $\leq$ 1\% AP across all datasets, as they have never been trained on pathological data. However, a qualitative analysis, as illustrated in \figureref{fig:pretraining_qualitative}, suggests that the pretrained model exhibits a better understanding of tissue structures, whereas the vanilla model struggles to differentiate anatomical features, often detecting the entire image as a foreground object rather than identifying meaningful regions.


\newpage

\section{Evaluation Scores for Two Bounding Box Curriculum Models Finetuned After CT-only / MRI-only Pretraining}\label{app:one_mod_pretrain}
\input{tables/results_pretraining_ablation} %

In this experiment, we fine-tuned two additional bounding box CL models: one pretrained only on MRI scans from TotalSegmentator, and the other pretrained only on CT data. The results are tabulated in \tableref{tab:results_pretraining}. They indicate that multi-modal pretraining yields better results (51.7\% AP, \tableref{tab:results}) compared to MRI-only (50.6\% AP) and CT-only (50.9\% AP) pretraining. Moreover, the performance of the multimodal pretrained bounding box CL model is better than the CT-only pretrained bounding box CL model on two out of three CT test datasets, and better than the MRI-only pretrained bounding box CL model on two out of three MRI test datasets.

\newpage

\section{Evaluation Scores of a Bounding Box Curriculum Model Trained With Two Difficulty Categories Only}\label{app:two_cat_CL}


\begin{table}[ht]
\centering
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{1.2}
\resizebox{0.9\textwidth}{!}{
\begin{tabular}{|l|l|llllll|}
\hline
\textbf{\begin{tabular}[c]{@{}l@{}}Number of \space \space \\ CL Categories\end{tabular}} &
  \textbf{Dataset} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%)\\ @0.5\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%)\\ @0.75\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%) \end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%)\\ large\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%)\\ medium\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}l@{}}AP (\%)\\ small\end{tabular}} \\ \hline

\multirow{7}{*}{2} & Overall           & 74.4 & 55.2 & 50.4 & 66.7 & 64.2 & 35.4 \\ %n
                    & Yale\_BM          & 77.9 & 63.6 & 56.0 & - & 79.2 & 52.4 \\ %ne
                    & BraTS\_MET        & 81.9 & 65.4 & 56.6 & -  & 83.3 & 53.3\\ %ne
                    & BraTS\_GLI        & 84.6 & 72.1 &  65.8 & - & 81.1 & 48.6\\ %n
                    & MSD\_Liver        & 60.6 & 31.0 &  32.0 & 61.1 & 44.3 & 21.0\\ %ne
                    & MSD\_Hep\_Vessel \space \space  & 71.0 & 43.0 & 42.6 & 65.6 & 46.6 & 16.7 \\ %n
                    & KiTS23\_Kidney   & 70.3 & 56.2 & 49.4 & 73.5 & 50.8 & 20.4 \\  \hline


\end{tabular}
}
\caption{Results table of a bounding box CL model trained using only two difficulty categories.}
\label{tab:results_diff_cat}
\end{table}

This ablation study investigates the effect of the number of difficulty categories employed during CL training. For all standard CL-based models depicted in \tableref{tab:results}, the training procedure utilizes five difficulty categories, which are incrementally introduced with each training epoch. After five CL epochs, fine-tuning is then conducted on the entire training set. In contrast, for the ablation, we implemented bounding box CL using only two difficulty categories. Specifically, the model was initially trained on the easier difficulty category for three epochs, after which the second category was introduced and training continued until convergence. The results demonstrate a slight decrease in performance, with an AP score of 50.4\% compared to 51.7\% AP for the regular bounding box CL (see \tableref{tab:results}).


