\documentclass{midl} % Include author names


% added packages by Mayar :
\usepackage{booktabs} % For better table lines
\usepackage{multirow} % For multi-row cells
\usepackage{graphicx} % For resizing tables
\usepackage{float}    % For H option in tables
\usepackage{arydshln} % For dashed lines in tables
\usepackage{xcolor}
%\usepackage{hyperref}
\usepackage{amssymb}
\usepackage{pifont}
\usepackage{color,soul}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 237}
\editors{Accepted for publication at MIDL 2025}

% Abbreviation helpers (\eg, \ie, \etal, ...). Each one ends in \onedot,
% which supplies the final period only when the author has not typed one.
\RequirePackage{xspace}
\makeatletter
% \onedot stores the token that follows it in \@let@token (via \futurelet)
% and then runs \@onedot, which inspects that token.
\DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
% If the next token is already a period, nothing is added; otherwise a "."
% followed by \null is typeset (presumably so the period is not spaced like
% the end of a sentence). \xspace runs afterwards in both cases.
\def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}

% Emphasised abbreviations; write them without the trailing period, e.g. \eg
\def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
\def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
\def\cf{\emph{cf}\onedot} \def\Cf{\emph{Cf}\onedot}
\def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
% Upright abbreviations (no \emph).
\def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
\def\iid{i.i.d\onedot} \def\wolog{w.l.o.g\onedot}
\def\etal{\emph{et al}\onedot}
\makeatother

% Coloured inline annotations: \Azade and \Dmitrii prefix the text with the
% commenter's name; \rebuttal only colours its argument.
\newcommand{\Azade}[1]{\textcolor{cyan}{Azade: #1}}
\newcommand{\rebuttal}[1]{\textcolor{purple}{#1}}
\newcommand{\Dmitrii}[1]{\textcolor{red}{Dmitrii: #1}} 
% Short method name; used in the running title as \methodName{}.
\def\methodName{SurgFlowMAE}
\title[\methodName{} for Surgical Event Recognition]{Surgical Flow Masked Autoencoder for Event Recognition}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship.
% The superscript in each \nametag refers to the numbered \addr lines below;
% only the first author lists an \Email.
\midlauthor{\Name{Mayar Lotfy Mostafa\nametag{$^{1,2}$}} \Email{mayar.mostafa@tum.de}\\
\Name{Anna Alperovich\nametag{$^{1}$}} \\
\Name{Dmitrii Fedotov\nametag{$^{1}$}} \\
\Name{Ghazal Ghazaei\nametag{$^{1}$}} \\
\Name{Stefan Saur\nametag{$^{3}$}} \\
\Name{Azade Farshad\nametag{$^{2,4}$}} \\
\Name{Nassir Navab\nametag{$^{2,4}$}} \\
\addr $^{1}$ Carl Zeiss AG, Corporate Research \& Technology, Oberkochen, Germany \\
\addr $^{2}$ CAMP, Technical University of Munich, Garching, Germany \\
\addr $^{3}$  Carl Zeiss Meditec AG, Oberkochen, Germany \\
\addr $^{4}$ Munich Center for Machine Learning (MCML), Munich, Germany}

\begin{document}

\maketitle

\input{chapters/0_abstract}

\input{chapters/1_intro}
\input{chapters/2_rel_works}
\input{chapters/3_method}
\input{chapters/4_experiments}
\input{chapters/5_conclusion}

%


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was conducted at and financed by the Corporate Research and Technology department of Carl Zeiss AG.}


\bibliography{midl25_237}


\newpage

\appendix

\section{Implementation Details}
\label{appendix:implementation_details}
\paragraph{Implementation Details}
We utilize the ViT-Small backbone with an input patch size of \( 16 \times 16 \) for all models. The input video and optical flow are processed at a resolution of \( 224 \times 224 \) pixels, comprising 16 frames with a sampling rate of 2.
For pre-training, in accordance with best practices established in prior research, the sampling ratio of input tokens is fixed at 90\%.
We employ the AdamW optimizer, configured with a weight decay of \( 10^{-4} \) and \( (\beta_1, \beta_2) = (0.9, 0.999) \). The pre-training phase uses a batch size of 32 and is conducted over 1200 epochs.
%
For fine-tuning, the classification head is trained for 350 epochs with a cross-entropy loss and a batch size of 12.



\section{Supporting Results}


\subsection{CATARACTS}


\begin{table}[H] % Use H to force the table to stay in place
    \centering
    \caption{Summary of Experiments and Results for Flow Mask Models}
    % The % after the opening brace and after \end{tabular} keeps stray
    % spaces out of the box that \resizebox scales to the text width.
    \resizebox{\textwidth}{!}{%
    % Seven columns: pretraining data, masking type, and five metrics.
    % The internal experiment index of each row is kept as a trailing comment
    % instead of an invisible \phantom box, so the row labels stay aligned.
    \begin{tabular}{@{}llccccc@{}} % @{} removes the extra space at the sides
        \toprule
        \multirow{2}{*}{Experiment \& Pretrain Model} & \multirow{2}{*}{Masking Type} & \multicolumn{5}{c}{Metrics} \\
        \cmidrule(lr){3-7}
        & & \textbf{Acc1} & \textbf{Acc5} & \textbf{Precision} & \textbf{Recall} & \textbf{Jaccard Index} \\
        \midrule
        \multicolumn{7}{c}{\textbf{Rec + Cls}} \\
        \midrule
        K400             & Random  & 76.26 & 96.13 & 48.52 & 53.94 & 47.87 \\ % exp. 1
        \midrule
        K400 + Cataracts & Random  & 75.76 & 93.94 & 48.65 & 52.48 & 48.04 \\ % exp. 2
        K400 + Cataracts & Flow    & \textbf{81.65} & 96.80 & \textbf{61.13} & \textbf{64.58} & \textbf{60.18} \\ % exp. 3
        K400 + Cataracts & Encoder & 80.98 & \textbf{97.14} & 59.40 & 62.17 & 58.57 \\ % exp. 16
        \midrule
        Cataracts        & Random  & \textbf{56.40} & \textbf{86.53} & 16.77 & 23.03 & 16.71 \\ % exp. 4
        Cataracts        & Flow    & 55.22 & 84.34 & 14.25 & 21.36 & 14.22 \\ % exp. 5
        Cataracts        & Encoder & \textbf{56.40} & 85.19 & \textbf{17.16} & \textbf{23.55} & \textbf{17.10} \\ % exp. 13
        \midrule
        \multicolumn{7}{c}{\textbf{Multitask}} \\
        \midrule
        Cataracts        & Random  & 61.95 & 90.75 & 39.71 & 44.95 & 38.55 \\ % exp. 6
        Cataracts        & Flow    & \textbf{66.67} & \textbf{91.75} & \textbf{41.14} & 45.16 & \textbf{39.54} \\ % exp. 7
        Cataracts        & Encoder & 63.13 & 86.36 & 40.74 & \textbf{46.27} & 39.26 \\ % exp. 11
        \midrule
        K400 + Cataracts & Random  & 85.35 & 95.96 & 75.79 & 77.62 & 74.89 \\ % exp. 8
        K400 + Cataracts & Flow    & \textbf{87.54} & 97.14 & \textbf{77.87} & \textbf{80.63} & \textbf{76.62} \\ % exp. 9
        K400 + Cataracts & Encoder & 86.7 & \textbf{96.97} & 76.54 & 79.00 & 75.13 \\ % exp. 10
        \bottomrule
    \end{tabular}%
    }
    \label{tab:all_experiments_cataracts}
\end{table}



% Table for Multi
\begin{table}[H]
    \centering
    \caption{Comparison of \textbf{Multi} Models with Different Masking Types}
    % Two groups of five metric columns, separated by the trimmed \cmidrule
    % pair instead of a vertical rule (booktabs rules leave gaps in vertical
    % lines). The group titles carry no invisible padding, so they are centred
    % over their columns.
    % NOTE(review): the rows are In/Out with values 0.1/0.9, so the first two
    % headers may read better as "Masking" and "Ratio" -- confirm with authors.
    \begin{tabular}{@{}llcccccccccc@{}}
        \toprule
        \multirow{2}{*}{Model} & \multirow{2}{*}{Mask Type} & \multicolumn{5}{c}{\textbf{CATARACTS}} & \multicolumn{5}{c}{\textbf{K400 + CATARACTS}} \\
        \cmidrule(lr){3-7} \cmidrule(lr){8-12}
                                     &     & Acc1 & Acc5 & Prec. & Rec. & Jacc. & Acc1 & Acc5 & Prec. & Rec. & Jacc. \\
        \midrule
        \multirow{2}{*}{\textbf{In}} & 0.1 & 65.8 & 90.6 & 39.5 & 44.0 & 38.0 & 86.2 & 97.1 & 75.4 & 77.5 & 73.5 \\
                                     & 0.9 & 65.1 & 89.7 & 38.2 & 42.4 & 36.8 & 87.0 & 96.1 & 75.6 & 78.4 & 74.6 \\
        \midrule
        \multirow{2}{*}{\textbf{Out}} & 0.1 & 67.3 & 89.4 & 41.9 & 46.7 & 39.9 & 86.4 & 96.3 & 76.1 & 78.4 & 74.4 \\
                                     & 0.9 & 63.5 & 87.2 & 32.8 & 36.9 & 31.5 & 82.0 & 96.0 & 65.6 & 70.1 & 64.5 \\
        \bottomrule
    \end{tabular}
    \label{tab:multi_masking_comparison}
\end{table}


\begin{figure}[H]
    \centering
    % Masking examples; trim/clip crop the margins of the source image
    % before it is scaled to the full line width.
    \includegraphics[trim=30 90 70 100, clip, width=1\linewidth]{assets/mask_in_out.jpg}
    \caption{Example images illustrating different masking strategies and ratios. The first column shows the RGB image, followed by the corresponding optical flow. The subsequent columns depict the effects of masking strategies: ``Masking In'' at ratios of 0.1 and 0.9, which retain the informative regions, and ``Masking Out'' at ratios of 0.1 and 0.9, which remove the informative regions. These visualizations help to understand the impact of different masking techniques on the model's performance.}
    \label{fig:masking_strategies}
\end{figure}

\subsection{Neuro}

\begin{figure}[H]
    \centering
    % Adverse vs. non-adverse examples; trim/clip crop the source image margins.
    \includegraphics[trim=30 250 70 70, clip, width=1\linewidth]{assets/adverse_examples.jpg}
    \caption{Comparison of surgical events in Microscopic Neurosurgery: The top row shows a non-adverse bleeding event, which is a common occurrence during surgery and does not indicate damage. The bottom row illustrates an adverse event caused by unintentional damage from surgical tools interacting with the tissue.}
    \label{fig:neuro_adverse_examples}
\end{figure}





% Multitask results on the Neuro data: one group of rows per pretraining
% setup, one row per masking type.
\begin{table}[H] % Use H to force the table to stay in place
    \centering
    \caption{Summary of Multitask Results for Flow Mask Models}
    %\resizebox{\textwidth}{!}{ % Resize the table to fit the text width
    \footnotesize % Set font size to footnotesize
    \begin{tabular}{@{}llcccc@{}} % Use @{} to remove extra space at the sides
        \toprule
        \multirow{2}{*}{Exp. \& Pretrain Model} & \multirow{2}{*}{Masking Type} & \multicolumn{4}{c}{\textbf{Multitask}} \\ 
        \cmidrule(lr){3-6}
        & & \textbf{Acc1} & \textbf{Prec.} & \textbf{Rec.} & \textbf{Jacc.} \\ 
        \midrule
        % NOTE(review): Acc1 equals Rec. in the Random and Flow rows of both
        % groups but not in the two Encoder rows -- verify the Encoder numbers.
        \multirow{3}{*}{Neuro} & Random       & 70.3 & 71.2 & 70.3 & 67.1 \\ 
                                & Flow         & 76.8 & 74.8 & 76.8 & 73.2 \\ 
                                & Encoder      & 89.5 & 80.1 & 81.1 & 77.8 \\ 
        \midrule
        \multirow{3}{*}{K400 + Neuro} & Random       & 56.2 & 62.7 & 56.2 & 51.0 \\ 
                                & Flow         & 66.8 & 68.4 & 66.8 & 62.7 \\ 
                                & Encoder      & 68.8 & 64.4 & 57.8 & 53.6 \\ 
        \bottomrule
    \end{tabular}
    %}
    \label{tab:all_neuro_experiments}
\end{table}


\subsection{EgoSurgery}
\autoref{tab:phase_recognition_egosurgery} shows the results of our phase recognition experiments on the EgoSurgery dataset~\cite{fujii2024egosurgery} with different masking strategies. \textit{Flow}-based masking achieves the best results on all metrics compared to the other strategies.

\begin{table}[H] % Same placement as the other appendix tables
    \centering
    \caption{Phase Recognition Results for EgoSurgery Dataset}
    % booktabs rules and a left-aligned label column, matching the other tables.
    % NOTE(review): 31.7 and 29.0 carry one decimal while the remaining values
    % carry two -- confirm the exact figures before unifying the precision.
    \begin{tabular}{@{}lcccc@{}}
        \toprule
        \textbf{Mask} & \textbf{Accuracy (\%)} & \textbf{Precision (\%)} & \textbf{Recall (\%)} & \textbf{Jaccard (\%)} \\
        \midrule
        Random  & 31.7           & 29.0           & 31.67          & 25.71          \\
        Flow    & \textbf{39.52} & \textbf{41.75} & \textbf{39.52} & \textbf{33.97} \\
        Encoder & 38.57          & 31.43          & 38.57          & 29.05          \\
        \bottomrule
    \end{tabular}
    \label{tab:phase_recognition_egosurgery}
\end{table}



\end{document}
%   