\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{threeparttable}
\usepackage{textcomp}
\usepackage{longtable}
\usepackage{lipsum}


%\jmlrvolume{-- Under Review}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
%\editors{Under Review for MIDL 2021}

\title[SWNet for Surgical Workflow Recognition]{SWNet: Surgical Workflow Recognition with Deep Convolutional Network}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


 %  Three or more authors with the same address:
  \midlauthor{\Name{Bokai Zhang} \Email{bzhang29@its.jnj.com}\\
   \Name{Amer Ghanem} \Email{aghanem1@its.jnj.com}\\
   \Name{Alexander Simes} \Email{alex.simes29@gmail.com}\\
   \Name{Henry Choi} \Email{hchoi51@its.jnj.com}\\
   \Name{Andrew Yoo} \Email{banyanfig@gmail.com}\\
   \Name{Andrew Min} \Email{amin2@its.jnj.com}\\
   \addr C-SATS, Inc., Johnson \& Johnson, Seattle, WA, USA}
%   \addr  C-SATS, Inc., Johnson & Johnson, Seattle, WA, USA}

 % Three or more authors with the same address:
 %   \midlauthor{\Name{Bokai Zhang} \Email{an1@sample.edu}\\
 %    \Name{Amer Ghanem} \Email{an2@sample.edu}\\
 %    \Name{Alexander Simes} \Email{an3@sample.edu}\\
 %    \Name{Henry Choi} \Email{an3@sample.edu}\\
 %    \Name{Andrew Yoo} \Email{an3@sample.edu}\\
 %    \Name{Andrew Min} \Email{an3@sample.edu}\\
 %    \addr {Address}}

% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
%  \addr Address 1}
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship

% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }


\begin{document}

\maketitle

\begin{abstract}
Surgical workflow recognition has been playing an essential role in computer-assisted interventional systems for modern operating rooms. In this paper, we present a computer vision-based method named SWNet that focuses on utilizing spatial information and temporal information from the surgical video to achieve surgical workflow recognition. As the first step, we utilize Interaction-Preserved Channel-Separated Convolutional Network (IP-CSN) to extract features that contain spatial information and local temporal information from the surgical video through segments. Secondly, we train a Multi-Stage Temporal Convolutional Network (MS-TCN) with those extracted features to capture global temporal information from the full surgical video. Finally, by utilizing Prior Knowledge Noise Filtering (PKNF), prediction noise from the output of MS-TCN is filtered. We evaluate SWNet for Sleeve Gastrectomy surgical workflow recognition. SWNet achieves 90\% frame-level accuracy and reaches a weighted Jaccard Score of 0.8256. This demonstrates that SWNet has considerable potential to solve the surgical workflow recognition problem.
\end{abstract}

\begin{keywords}
surgical workflow recognition, computer-assisted interventional systems, IP-CSN, MS-TCN
\end{keywords}

\section{Introduction}

Video-based automatic surgical workflow recognition is one of the key technologies to build computer-assisted interventional systems for modern operating rooms. Such systems can enhance coordination among OR teams and improve surgical safety. Offline surgical workflow recognition provides a tool to automate the indexing of surgical video databases and supports Video-Based Assessment (VBA) systems that help surgeons with life-long learning \cite{feldman2020sages}.

Early studies \cite{twinanda2016endonet, kitaguchi2020real} focused on utilizing image classification networks to capture spatial information from surgical videos on a frame by frame basis to achieve surgical workflow recognition. With the rise of the Recurrent Neural Network, researchers have proposed using 2D Convolutional Neural Network and Recurrent Neural Network together to capture both spatial and temporal information from the surgical video through segments \cite{jin2017sv, zisimopoulos2018deepphase, chen2018semi, yengera2018less, funke2018temporal, mondal2019multitask, jin2020multi, nakawala2019deep, yi2019hard}. In a recent study, \citet{czempiel2020tecno} utilize ResNet50 \cite{he2016deep} as the 2D Convolutional Neural Network to extract visual features frame by frame from the surgical video to capture spatial information. They also utilize a 2-stage causal Temporal Convolutional Network to capture global temporal information from the extracted features for surgical workflow recognition.

In this paper, instead of utilizing ResNet to capture spatial features frame by frame, we implement a deep 3D Convolutional Neural Network named Interaction-Preserved Channel-Separated Convolutional Network (IP-CSN) \cite{tran2019video} to capture spatial and local temporal features by video segment. We utilize a Multi-Stage Temporal Convolutional Network (MS-TCN) \cite{farha2019ms} to capture global temporal information from the video. For offline surgical workflow recognition, we utilize the Prior Knowledge Noise Filtering (PKNF) algorithm to filter the prediction noise from MS-TCN output. We name this IPCSN-MSTCN-PKNF workflow SWNet.


\section{Methods}

\subsection{Datasets}

We test our method of surgical workflow recognition on Sleeve Gastrectomy videos. Sleeve Gastrectomy is used to assist patients with losing excess weight. It can reduce the risk of potentially life-threatening weight-related health problems including type 2 diabetes, high blood pressure, sleep apnea, and more. Our medical experts reviewed the literature on Sleeve Gastrectomy surgical workflow \cite{iannelli2008laparoscopic, daskalakis2009sleeve, van2017identification, van2017resident, kaijser2018delphi} and split Sleeve Gastrectomy into 8 surgical phases: ``Exploration phase'', ``Ligation of short gastric vessels phase'', ``Gastric transection phase'', ``Bougie phase'', ``Suturing of omentum to stomach phase'', ``Liver retraction phase'', ``Hiatal hernia repair phase'', and ``Gastric band removal phase''. The time interval between surgical phases was named as ``Not a surgical phase''.

We collected 461 Sleeve Gastrectomy surgical videos from 14 institutions for our dataset. The dataset was split randomly. 317 videos were used for the training dataset. 82 videos were used for the validation dataset. 62 videos were used for the test dataset. Each video is annotated with the above-mentioned set of phases. More details about the datasets are shown in Appendix \ref{appendix:a}.

\subsection{SWNet for offline surgical workflow recognition}
The overview of our SWNet is illustrated in Figure \ref{fig:figureoverviewSWNet}. During inference, we divide the video into short video segments and utilize IP-CSN to extract features for each video segment. Each feature can be considered as a summary of the video segment. We concatenate the extracted features to get the full video features and utilize MS-TCN to achieve initial surgical phase segmentation for the full surgical video. We apply the Prior Knowledge Noise Filtering algorithm to the initial surgical phase segmentation results to get the final prediction results for the full video. With SWNet, we are able to capture spatial and local temporal information in short video segments with IP-CSN as well as capture global temporal information in the full video with MS-TCN.

Next, we work on building SWNet for the offline surgical workflow recognition pipeline. We first conduct transfer learning on our dataset with IP-CSN. Then, we utilize IP-CSN to extract features for our dataset. After that, we train the MS-TCN with the extracted features. Finally, we utilize the Prior Knowledge Noise Filtering algorithm to filter the prediction noise from MS-TCN output.


\begin{figure}[hbt!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
       % Give a unique label

\floatconts
  {fig:figureoverviewSWNet}
  {\caption{The overview of SWNet}}
  {\includegraphics[width=1.0\linewidth]{Fig1.png}}
\end{figure}

\subsubsection{IP-CSN as feature extraction backbone}
In recent research, 3D ConvNet is used to capture spatial and temporal information in video segments. \citet{carreira2017quo} proposed to inflate 2D CNN along the temporal dimension to obtain Inflated 3D ConvNet (I3D). With RGB stream and optical flow stream as the input streams, a two-stream I3D solution is designed. To lower the computational cost and improve accuracy, R(2+1)D \cite{tran2018closer} is designed to factor 3D convolution in space and time while Channel-Separated Convolutional Network (CSN) \cite{tran2019video} is designed to factor 3D convolution by separating channel interaction and spatiotemporal interaction.

From recent studies \cite{tran2019video, ghadiyaram2019large}, CSN outperforms two-stream I3D and R(2+1)D on Kinetics-400 dataset \cite{kay2017kinetics}. With large-scale weakly-supervised pre-training on IG-65M dataset \cite{ghadiyaram2019large}, CSN model performs even better. From the computation standpoint, CSN only needs the RGB stream as input while the optical flow stream in two-stream I3D needs expensive computation. Inspired by this, we adopt CSN, specifically, Interaction-Preserved Channel-Separated Convolutional Network for our problem. Appendix \ref{appendix:b} shows the design of IP-CSN bottleneck block.

A large amount of video data is needed for training a 3D ConvNet from scratch, so we conduct transfer learning instead. The initial weights for IP-CSN152 are publicly available \cite{tran2019video}. We utilize the initial weights pretrained on IG-65M and Kinetics-400 for our work. We annotated each of our surgical videos with nine class labels, including eight surgical phase labels and one ``Not a surgical phase'' label. The start time and the end time for each label are annotated. To fine-tune IP-CSN on our dataset, during each training epoch, five 19.2s video segments are randomly selected inside each annotation segment for each video. 32 frames are sampled with constant intervals as one training sample from each video segment.


\subsubsection{MS-TCN for surgical phase segmentation}
To capture global temporal information from the video, instead of utilizing a 2-stage causal Temporal Convolutional Network proposed in \citet{czempiel2020tecno}, we utilize a 4-stage acausal Temporal Convolutional Network proposed in \citet{farha2019ms}. Given the input $X = \{x_{1}, x_{2}, \dots, x_{t}\}$, MS-TCN predicts the output $P = \{P_{1},P_{2},\dots,P_{t}\}$ where $t$ is the current time step, $1 \le t \le T$, $T$ is the number of total time steps, $x_{t}$ is the feature input at time step $t$, and $P_{t}$ is the output prediction for the current time step.

The overview of MS-TCN is illustrated in Figure \ref{fig:figureoverviewMSTCN} in Appendix \ref{appendix:d}. For the classification loss in MS-TCN, the cross-entropy loss is calculated by
\begin{equation}
L_{cls} = \frac{1}{T}\sum_{t}-\log(p_{t,c})
\end{equation}
where $p_{t,c}$ is the predicted probability of the ground-truth class $c$ at time step $t$. For the smoothing loss to reduce over-segmentation, the truncated mean squared error is calculated over the frame-wise log-probabilities by
\begin{equation}
L_{T-MSE} = \frac{1}{TC}\sum_{t,c} \min\bigl(|\log(p_{t,c}) - \log(p_{t-1,c})|,\, \tau\bigr)^{2}
\end{equation}
where $C$ is the total number of classes, $\tau$ is the threshold value. The final loss function sums the losses over all stages which can be calculated by
\begin{equation}
L_{final} = \sum_{s=1}^{S} (L_{cls} + \lambda L_{T-MSE})
\end{equation}
where $S$ is the total stage number for MS-TCN, $\lambda$ is a weighted parameter.

\subsubsection{Prior Knowledge Noise Filtering}
In the surgical videos we gathered, it is observed that surgeons can idle or pull out surgical tools in the middle of a surgical phase. For those video segments, the deep learning model sometimes fails to predict accurately. As a result, we develop a filtering algorithm to filter the wrong predictions.

We develop Prior Knowledge Noise Filtering (PKNF) for offline surgical workflow recognition in consideration of three aspects: surgical phase order, surgical phase incidence, and surgical phase time. From the surgical phase order aspect, we notice several surgical phases follow a specific order. When a prediction from MS-TCN does not follow the specific phase order it should, we correct the prediction by selecting a label that the model has the highest confidence in from the possible labels according to phase order. From the surgical phase time aspect, we run statistical analyses on our annotation to get the minimum phase time $T$. $T=\{T_{1}, T_{2}, \dots, T_{N}\}$ where $N$ is the total number of the surgical phases. We check that the prediction segments share the same prediction labels from MS-TCN first. For adjacent prediction segments that share the same prediction labels, we connect them if the time interval between the prediction segments is shorter than the connection threshold we set for that surgical phase. The connection threshold is set according to the minimum phase time $T$. In this work, we set the connection threshold to be 40\% of the minimum phase time $T$. After adjacent prediction segments are connected correctly, surgical phase time can be calculated for each surgical phase prediction segment. We correct prediction segments that are too short to be a surgical phase. From the surgical phase incidence aspect, our statistical analyses of the annotations show that some surgical phases normally occur no more than a fixed number of times per video. If multiple segments of the same phase show up in the prediction and exceed the phase incidence threshold value we set for that surgical phase, we select segments according to the ranking of the model's confidence.

\subsection{Online surgical workflow recognition}
As shown in Figure \ref{fig:figureoverviewSWNet}, the final step in SWNet is PKNF. It is an inference algorithm designed specifically for offline surgical workflow recognition. If we take out the PKNF step in SWNet, we can utilize IPCSN-MSTCN for online surgical workflow recognition. During online inference, spatial and local temporal features extracted by IP-CSN are saved by video segment. At time step $t$, we can therefore read in all features before time step $t$ together with the feature extracted at time step $t$ to build the feature set $F = \{f_{1}, f_{2},\dots,f_{t}\}$. We then send the feature set $F$ to MS-TCN to get the prediction output $P = \{P_{1},P_{2},\dots,P_{t}\}$, where $P_{t}$ is the online prediction result at time step $t$.

\section{Experimental Results}
We utilize IP-CSN152, MS-TCN, and PKNF to build IPCSN-MSTCN-PKNF workflow and refer to it as SWNet. For fine-tuning IP-CSN152, SGD optimizer with an initial learning rate of $2\times10^{-4}$ is used. We reduce the learning rate by a factor of 0.2 if validation accuracy does not improve in the last 10 epochs. The weight decay is set to be $1\times10^{-7}$. Random crop is used for data augmentation. For each training sample sequence of frames, we resize the frames according to the smaller side of the frames to 182 pixels and randomly crop $160\times160$ patches from them. For data augmentation, random rotation is applied to 10\% of the training samples. Random flipping is applied to 10\% of the training samples. After training the IP-CSN152, we remove the final layer of IP-CSN152 and use it to extract 2048-dimensional feature vectors. We utilize acausal Temporal Convolutional Networks for MS-TCN. The total number of stages is set to be 4. The total number of dilated convolution layers at each stage is set to be 10. The number of feature maps is set to be 64. During the training of MS-TCN, Adam optimizer is used with a learning rate of $5\times10^{-4}$.

To quantify the importance of utilizing local temporal information from the feature extraction backbone, we replace IP-CSN152 from SWNet with a 2D ConvNet named EfficientNet-B5 \cite{tan2019efficientnet} to build EfficientNet-MSTCN-PKNF workflow. We utilize EfficientNet-B5 to capture spatial information only as the feature extraction backbone. For fine-tuning EfficientNet-B5, SGD optimizer with an initial learning rate of $1\times10^{-4}$ is used. We reduce the learning rate by a factor of 0.2 if validation accuracy does not improve in the last 10 epochs. For each training sample frame, we resize the frame according to the smaller side of the frame to 510 pixels and randomly crop a $456\times456$ patch from it. Random rotation and random flipping are also used during the training of EfficientNet-B5.

To quantify the importance of utilizing MS-TCN as the video action segmentation network, we replace MS-TCN from SWNet with a two-layer LSTM to build the IPCSN-LSTM-PKNF workflow. The hidden unit size for LSTM is set to be 128. The dropout rate is set to be 0.5. The learning rate is set to be 0.005.

\subsection{Results for offline surgical workflow recognition}
We evaluate our methods against ResNetLSTM \cite{jin2017sv} as well as TeCNO \cite{czempiel2020tecno}. The overall experimental results conducted on our test dataset are shown in Table \ref{tab:offlineoverll}. Our EfficientNet-MSTCN-PKNF outperforms both ResNetLSTM and TeCNO. By utilizing PKNF, SWNet outperforms IPCSN-MSTCN by 1.16\% and 0.0186 in terms of the overall accuracy and the weighted Jaccard Score. IPCSN-LSTM-PKNF outperforms IPCSN-LSTM by 1.65\% and 0.0239 in terms of the overall accuracy and the weighted Jaccard Score. The above results show that utilizing PKNF in the workflow can reduce noise and improve prediction results. SWNet outperforms EfficientNet-MSTCN-PKNF by 1.76\% and 0.0261 in terms of the overall accuracy and the weighted Jaccard Score. This shows that IP-CSN is a better feature extraction backbone compared to EfficientNet. SWNet outperforms IPCSN-LSTM-PKNF by 3.24\% and 0.0512 in terms of the overall accuracy and the weighted Jaccard Score. This shows that MS-TCN is a better video action segmentation network compared to LSTM. The mean accuracy, the standard deviation of the accuracy, the mean weighted Jaccard Score, and the standard deviation of the weighted Jaccard Score are shown in Table \ref{tab:offlineoverllmean} in Appendix \ref{appendix:c}. SWNet outperforms all other methods in terms of the mean accuracy and the mean weighted Jaccard Score.

To further compare the performance between ResNetLSTM, TeCNO, EfficientNet-MSTCN-PKNF, IPCSN-LSTM-PKNF, and SWNet, we calculate Precision, Recall, and F1-Score for each surgical phase in Table \ref{tab:offlinedetailed} in Appendix \ref{appendix:c}. Except for the ``Exploration phase'', the F1-Score for SWNet outperforms the F1-Score for other networks. SWNet performs well in most surgical phases. From Table \ref{tab:dataset} in Appendix \ref{appendix:a}, lack of training data might be the reason why SWNet does not perform well in some surgical phases.

As shown in Figure \ref{fig:offlinecompare}, we visualize the prediction results from ResNetLSTM, TeCNO, EfficientNet-MSTCN, EfficientNet-MSTCN-PKNF, IPCSN-LSTM, IPCSN-LSTM-PKNF, IPCSN-MSTCN, and IPCSN-MSTCN-PKNF (SWNet) from 4 test videos for offline surgical workflow recognition. It is clear that SWNet can locate the surgical phase more accurately and identify phase transitions better.
% For tables use
\begin{table}[hbt!]
\centering
\fontsize{10}{12}\selectfont
% table caption is above the table
\caption{Overall accuracy and Jaccard score for offline surgical workflow recognition}
\label{tab:offlineoverll}       % Give a unique label
% For LaTeX tables use
\begin{tabular}{ccc}
\toprule
Method & Accuracy & Weighted Jaccard Score\\
\midrule
ResNetLSTM & 0.8235 & 0.7141 \\
TeCNO & 0.8659 & 0.7668 \\
EfficientNet-MSTCN & 0.8818 & 0.7928 \\
EfficientNet-MSTCN-PKNF & 0.8861 & 0.7995 \\
IPCSN-LSTM & 0.8548 & 0.7505 \\
IPCSN-LSTM-PKNF & 0.8713 & 0.7744 \\
IPCSN-MSTCN & 0.8921 & 0.8070 \\
IPCSN-MSTCN-PKNF (SWNet) & 0.9037 & 0.8256 \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[hbt!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:offlinecompare}
  {\caption{Color-coded ribbon illustration for offline recognition results: (a) ResNetLSTM prediction results (b) TeCNO prediction results (c) EfficientNet-MSTCN model output (d) EfficientNet-MSTCN-PKNF prediction results (e) IPCSN-LSTM model output (f) IPCSN-LSTM-PKNF prediction results (g) IPCSN-MSTCN model output (h) SWNet prediction results (i) Ground Truth}}
  {\includegraphics[width=1.0\linewidth]{Fig4.png}}
\end{figure}

\subsection{Results for online surgical workflow recognition}
We also evaluate IPCSN-MSTCN for online surgical workflow recognition. Instead of focusing on utilizing weighted cross-entropy loss in \citet{czempiel2020tecno}, we investigate the effect of applying smoothing loss in \citet{farha2019ms}.

We evaluate our methods against ResNetLSTM \cite{jin2017sv} as well as TeCNO \cite{czempiel2020tecno}. As shown in Table \ref{tab:onlineoverll}, the performance of TeCNO and IPCSN-MSTCN is similar in terms of the overall accuracy and the weighted Jaccard Score. However, models achieving similar accuracy may produce very different predictions, as visualized in Figure \ref{fig:onlinecompare}. These frame-wise metrics are not suitable for evaluating over-segmentation errors. In order to evaluate out-of-order predictions and over-segmentation errors, segmental metrics \cite{lea2016learning, lea2017temporal, farha2019ms} are utilized. We calculate the segmental edit distance score, and the segmental F1 score at overlapping thresholds 10\%, 25\%, and 50\% as shown in Appendix \ref{appendix:e}. The overlapping threshold is determined based on the intersection over union (IoU) ratio. After applying the smoothing loss, the segmental edit distance score and the segmental F1 score for IPCSN-MSTCN improve substantially. Compared with ResNetLSTM and TeCNO, IPCSN-MSTCN trained with the smoothing loss can provide smoother predictions.

As shown in Figure \ref{fig:onlinecompare}, we visualize the prediction results from ResNetLSTM and TeCNO together with the predictions from IPCSN-MSTCN trained with different loss functions. It is clear that applying the smoothing loss can alleviate over-segmentation errors for online surgical workflow recognition.
% For tables use
\begin{table}[hbt!]
\centering
\fontsize{10}{12}\selectfont
% table caption is above the table
\caption{Overall accuracy, segmental edit distance and segmental F1 for online surgical workflow recognition}
\label{tab:onlineoverll}       % Give a unique label
% For LaTeX tables use
\begin{tabular}{ccccccc}
\toprule
Method & Accuracy & Jaccard & Edit & F1@10 & F1@25 &F1@50 \\
\midrule
ResNetLSTM & 0.8130 & 0.6997 & 22.2775 & 23.2044 & 20.6931 & 15.7710 \\
TeCNO & 0.8451 & 0.7331 & 42.5531 & 46.7005 & 43.8578 & 35.7360 \\
IPCSN-MSTCN($L_{cls}$) & 0.8425 & 0.7326 & 49.5681 & 49.6224 & 44.8759 & 33.6570 \\
IPCSN-MSTCN($L_{cls} + \lambda L_{T-MSE}$) & 0.8466 & 0.7367 & 56.5213 & 56.1170 & 52.9255 & 41.4894\\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[hbt!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:onlinecompare}
  {\caption{Color-coded ribbon illustration for online recognition results: (a) ResNetLSTM prediction results (b) TeCNO prediction results (c) Predictions from IPCSN-MSTCN trained with $L_{cls}$ (d) Predictions from IPCSN-MSTCN trained with $L_{cls} + \lambda L_{T-MSE}$ (e) Ground Truth}}
  {\includegraphics[width=1.0\linewidth]{Fig5.png}}
\end{figure}

\section{Conclusion}
In this paper, we designed SWNet for surgical workflow recognition with IP-CSN, MS-TCN, and PKNF. We show that utilizing IP-CSN with RGB stream outperforms EfficientNet as the feature extraction backbone. We show that PKNF can improve prediction results for offline surgical workflow recognition and that applying the smoothing loss can reduce over-segmentation errors for online surgical workflow recognition. For future work, we want to investigate prediction filtering algorithms like PKI \cite{jin2017sv} for online surgical workflow recognition. We also want to conduct a deeper analysis of the prediction errors and conduct more experiments on other procedures for surgical workflow recognition.


% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{zhang21}


\appendix

\section{Details about the datasets}
\label{appendix:a}
We calculate the minutes of video data we have for the dataset. As shown in Table \ref{tab:dataset}, we have a limited amount of data for several surgical phases, such as ``Exploration phase'', ``Bougie phase'', ``Liver retraction phase'', and ``Gastric band removal phase''. There are two reasons for this data imbalance problem. One reason is that the operation time varies from one surgical phase to another. Another reason is that surgical phases like ``Liver retraction phase'' are optional during the surgery. Video segments labeled as ``Not a surgical phase'' are usually surgical phase transition segments, undefined surgical phase segments, out-of-body segments, idle segments, and so on.

\begin{table}[htbp]
\centering
\fontsize{10}{12}\selectfont
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:table1}%
  {\caption{Training, validation and test datasets (minutes of video)}
  \label{tab:dataset}}%
  {\begin{tabular}{cccc}
  \toprule
  Phase Name & Training Data & Validation Data & Testing Data \\
  \midrule
  Not a surgical phase & 5729.91 & 1460.91 & 1202.01 \\
  Ligation of short gastric vessels phase & 4247.63 & 1082.03 & 828.13 \\
  Gastric transection phase & 3988.37 & 953.85 & 690.50 \\
  Bougie phase & 305.08 & 64.35 & 50.62 \\
  Suturing of omentum to stomach phase & 2562.70 & 807.70 & 397.62 \\
  Exploration phase & 181.83 & 38.33 & 27.22 \\
  Liver retraction phase & 65.48 & 25.97 & 6.88 \\
  Hiatal hernia repair phase & 448.95 & 72.38 & 102.63 \\
  Gastric band removal phase & 52.63 & 42.32 & 31.03 \\
  \bottomrule
  \end{tabular}}
\end{table}


\section{IP-CSN block example}
\label{appendix:b}
CSN \cite{tran2019video} is defined as 3D CNNs in which all convolutional layers (except for conv1) are either $1\times1\times1$ conventional convolutions or $k\times k\times k$ depthwise convolutions. $1\times1\times1$ conventional convolutions are used for channel interactions and $k\times k \times k$ depthwise convolutions are used for local spatiotemporal interactions. As shown in Figure \ref{fig:ipcsnbb}, by replacing the $3\times3\times3$ convolution with a $1\times1\times1$ traditional convolution and a $3\times3\times3$ depthwise convolution, a standard 3D bottleneck block in 3D ResNet was changed into an IP-CSN bottleneck block. This design not only reduces the parameters and FLOPs of the traditional $3\times3\times3$
convolution significantly but also preserves all channel interactions with a newly-added $1\times1\times1$ convolution.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:ipcsnbb}
  {\caption{Example of IP-CSN bottleneck block}}
  {\includegraphics[width=0.55\linewidth]{Fig2.png}}
\end{figure}


\section{Results for offline surgical workflow recognition}
The mean accuracy, the standard deviation of the accuracy, the mean weighted Jaccard Score, and the standard deviation of the weighted Jaccard Score are calculated from video level using the test dataset and are shown in Table \ref{tab:offlineoverllmean}. SWNet outperforms all other methods in terms of the mean accuracy and the mean weighted Jaccard Score. The Precision, Recall, and F1-Score are shown in Table \ref{tab:offlinedetailed}.

\label{appendix:c}

% For tables use
\begin{table}[hbt!]
\centering
\fontsize{10}{12}\selectfont
% table caption is above the table
\caption{Accuracy and Weighted Jaccard Score for offline surgical workflow recognition (mean $\pm$ std. \%)}
\label{tab:offlineoverllmean}       % Give a unique label
% For LaTeX tables use
\begin{tabular}{ccc}
\toprule
Method & Accuracy & Weighted Jaccard Score\\
\midrule
ResNetLSTM & $84.02\pm10.50$ & $75.97\pm13.27$ \\
TeCNO &  $88.42\pm10.19$  & $81.48\pm12.91$ \\
EfficientNet-MSTCN & $88.31\pm7.778$ & $81.47\pm10.67$ \\
EfficientNet-MSTCN-PKNF & $88.65\pm8.54$ & $81.87\pm11.70$ \\
IPCSN-LSTM & $86.23\pm7.472$ & $78.21\pm9.84$ \\
IPCSN-LSTM-PKNF & $87.19\pm7.427$ & $79.36\pm10.32$ \\
IPCSN-MSTCN & $89.85\pm7.841$ & $83.47\pm11.12$ \\
IPCSN-MSTCN-PKNF (SWNet) & $90.60\pm7.288$ & $84.51\pm10.65$ \\
\bottomrule
\end{tabular}
\end{table}


\renewcommand{\arraystretch}{0.6} % control the row height
\begin{table}[p!]
  \centering
  \fontsize{8}{10}\selectfont
  \begin{threeparttable}
  \caption{Detailed performance for offline surgical workflow recognition}
  \label{tab:offlinedetailed}
    \begin{tabular}{ccccc}
    \toprule
    Phase Name & Method & Precision & Recall & F1-Score\\
    \midrule
    \multirow{9}{*}{Not a surgical phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.81}&\multicolumn{1}{c}{0.73}&\multicolumn{1}{c}{0.77}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.83&      0.82&      0.82\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.86&      0.83&      0.84\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.86&      0.78&      0.82\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.88}&      \textbf{0.85}&      \textbf{0.87}\\
    \midrule
    \multirow{9}{*}{Ligation of short gastric vessels phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.85}&\multicolumn{1}{c}{0.89}&\multicolumn{1}{c}{0.87}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.84&      0.88&      0.86\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.90&      0.91&      0.90\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.88&      \textbf{0.93}&      0.90\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.91}&      0.92&      \textbf{0.92}\\
    \midrule
    \multirow{9}{*}{Gastric transection phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.90}&\multicolumn{1}{c}{0.93}&\multicolumn{1}{c}{0.92}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&\textbf{0.96}&      0.94&      \textbf{0.95}\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.92&      \textbf{0.97}&      0.94\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.91&      0.96&      0.94\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&0.94&      0.96&      \textbf{0.95}\\
    \midrule
    \multirow{9}{*}{Bougie phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.33}&\multicolumn{1}{c}{0.40}&\multicolumn{1}{c}{0.36}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.70&      0.40&      0.51\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.61&      \textbf{0.71}&      0.65\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.51&      0.48&      0.49\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.73}&      0.64&      \textbf{0.68}\\
    \midrule
    \multirow{9}{*}{Suturing of omentum to stomach phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.83}&\multicolumn{1}{c}{0.97}&\multicolumn{1}{c}{0.90}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.86&      \textbf{1.00}&      0.92\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&\textbf{0.91}&      0.98&      0.94\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.87&      0.97&      0.92\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.91}&      0.99&      \textbf{0.95}\\
    \midrule
    \multirow{9}{*}{Exploration phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.09}&\multicolumn{1}{c}{0.04}&\multicolumn{1}{c}{0.06}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.71&      0.23&      0.35\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.35&      0.18&      0.23\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.87&      \textbf{0.50}&      \textbf{0.64}\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.95}&      0.30&      0.46\\
    \midrule
    \multirow{9}{*}{Liver retraction phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.01}&\multicolumn{1}{c}{0.10}&\multicolumn{1}{c}{0.03}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.42&      0.12&      0.19\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.40&      0.32&      0.36\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.66&      0.28&      0.40\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.81}&      \textbf{0.56}&      \textbf{0.66}\\
    \midrule
    \multirow{9}{*}{Hiatal hernia repair phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.90}&\multicolumn{1}{c}{0.65}&\multicolumn{1}{c}{0.76}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&\textbf{0.98}&      0.88&      0.92\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&0.93&      0.90&      0.92\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.87&      0.90&      0.88\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&0.92&      \textbf{0.95}&      \textbf{0.94}\\
    \midrule
    \multirow{9}{*}{Gastric band removal phase}&
    \multicolumn{1}{c}{ResNetLSTM}&\multicolumn{1}{c}{0.81}&\multicolumn{1}{c}{0.29}&\multicolumn{1}{c}{0.43}\cr
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &TeCNO&0.84&      0.52&      0.64\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &EfficientNet-MSTCN-PKNF&\textbf{0.89}&      0.41&      0.56\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-LSTM-PKNF&0.73&      0.31&      0.43\\
    \cmidrule(lr){2-2} \cmidrule(lr){3-5}
    &IPCSN-MSTCN-PKNF(SWNet)&\textbf{0.89}&      \textbf{0.68}&      \textbf{0.77}\\
    \bottomrule
    \end{tabular}
    \end{threeparttable}
\end{table}

\section{The overview of MS-TCN}

\label{appendix:d}
The overview of MS-TCN \cite{farha2019ms} is illustrated in Figure~\ref{fig:figureoverviewMSTCN}. Given the input $X = \{x_{1}, x_{2}, \dots, x_{t}\}$, MS-TCN predicts the output $P = \{P_{1},P_{2},\dots,P_{t}\}$, where $t$ is the current time step, $1 \le t \le T$, $T$ is the total number of time steps, $x_{t}$ is the feature input at time step $t$, and $P_{t}$ is the output prediction for the current time step. In each stage of MS-TCN, $l \in [1, L]$ is the layer number and $L$ is the total number of dilated convolution layers. $S$ is the total number of stages in MS-TCN. The first stage of MS-TCN only consists of temporal convolutional layers. The first layer of stage 1 is a $1\times1$ convolutional layer. It is used to match the input dimension with the feature map number in the network. After that, several layers of dilated 1D convolution with the same number of convolutional filters and a kernel size of 3 are used. ReLU activation is used in each layer. Residual connections are used to facilitate gradient flow. After the last dilated convolution layer, a $1\times1$ convolution and a softmax activation are used to get the initial predictions from the first stage. To refine the initial predictions, additional stages are used. Each additional stage takes the predictions from the previous stage and refines them.

\begin{figure}[hbt!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:figureoverviewMSTCN}
  {\caption{The overview of MS-TCN}}
  {\includegraphics[width=1.0\linewidth]{Fig3.png}}
\end{figure}


\section{Segmental metrics}

\label{appendix:e}
In order to evaluate out-of-order predictions and over-segmentation errors, segmental metrics \cite{lea2016learning, lea2017temporal, farha2019ms} are utilized. We calculate the segmental edit distance score \cite{lea2016learning}, and the segmental F1 score \cite{lea2017temporal} at overlapping thresholds 10\%, 25\%, and 50\%.

Let $G$ be the ground truth labeling and let $P$ be the prediction labeling. For each sequence we denote the segmental labelings $G_{s}$ and $P_{s}$ such that if $G = \{ABBBBCC\}$, where $A$, $B$, $C$ are three different labels, then $G_{s} = \{ABC\}$. The unnormalized segmental edit score is defined using an edit distance, $S_{e}(G_{s}, P_{s})$, with insertions, deletions, and replacements. The segmental edit score can be calculated by
\begin{equation}
S_{ne} = \left(1-\frac{S_{e}(G_{s}, P_{s})}{\max(L_{G}, L_{P})}\right)\times100,
\end{equation}
where $L_{G}$ is the length of $G_{s}$ and $L_{P}$ is the length of $P_{s}$.

For each segment, true positives and false positives are calculated by comparing its temporal Intersection over Union (IoU) with respect to the corresponding ground truth against a certain threshold. $\mathit{Precision}$ and $\mathit{Recall}$ are summed over all classes. The segmental F1 score can be calculated by
\begin{equation}
F1_{s} = \left(2 \times \frac{\mathit{Precision} \times \mathit{Recall}}{\mathit{Precision} + \mathit{Recall}}\right)\times100.
\end{equation}

\end{document}
