%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx  (with options: `sigconf')
%% 
%% IMPORTANT NOTICE:
%% 
%% For the copyright see the source file.
%% 
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%% 
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%% 
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass command.
\documentclass[sigconf]{acmart}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{graphicx}
\usepackage{bm}
\usepackage{float}
\usepackage{subfig}
\usepackage{pifont}
\let\Bbbk\relax\usepackage{amssymb}
\usepackage{wasysym}
\usepackage{colortbl}
% \usepackage[table,xcdraw]{xcolor}
\newcommand{\ve}[1]{\mathbf{#1}}
\settopmatter{authorsperrow=4}
%% NOTE that a single column version is required for 
%% submission and peer review. This can be done by changing
%% the \documentclass[...]{acmart} in this template to 
%% \documentclass[manuscript,screen]{acmart}
%% 
%% To ensure 100% compatibility, please check the white list of
%% approved LaTeX packages to be used with the Master Article Template at
%% https://www.acm.org/publications/taps/whitelist-of-latex-packages 
%% before creating your document. The white list page provides 
%% information on how to submit additional LaTeX packages for 
%% review and adoption.
%% Fonts used in the template cannot be substituted; margin 
%% adjustments are not allowed.

%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
  \providecommand\BibTeX{{%
    \normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}

%% Rights management information.  This information is sent to you
%% when you complete the rights form.  These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.
\setcopyright{acmlicensed}
% \copyrightyear{2024}
% \acmYear{2024}
% \acmDOI{10.1145/3664647.3681309}
% %% These commands are for a PROCEEDINGS abstract or paper.
% \acmConference[MM'24]{the 32nd ACM International Conference on Multimedia}{October 28 - November 1,
%   2024}{Melbourne, Australia.}
% \acmBooktitle{Proceedings of the 32nd ACM International Conference on Multimedia(MM'24), October 28 - November 1, 2024, Melbourne, VIC, Australia}

% %
% %  Uncomment \acmBooktitle if the title of the proceedings is different
% %  from ``Proceedings of ...''!
% %
% %\acmBooktitle{Woodstock '18: ACM Symposium on Neural Gaze Detection,
% %  June 03--05, 2018, Woodstock, NY} 
% \acmISBN{979-8-4007-0686-8/24/10}

\copyrightyear{2024}
\acmYear{2024}
\setcopyright{acmlicensed}\acmConference[MM '24]{Proceedings of the 32nd
ACM International Conference on Multimedia}{October 28-November 1,
2024}{Melbourne, VIC, Australia}
\acmBooktitle{Proceedings of the 32nd ACM International Conference on
Multimedia (MM '24), October 28-November 1, 2024, Melbourne, VIC, Australia}
\acmDOI{10.1145/3664647.3681309}
\acmISBN{979-8-4007-0686-8/24/10}


%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}

%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear styles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%

%%
%% The majority of ACM publications use numbered citations and
%% references.  The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}

%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{FodFoM: Fake Outlier Data by Foundation Models Creates Stronger Visual Out-of-Distribution Detector}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.


\author{Jiankang Chen}
\orcid{0009-0008-8980-8028}
\affiliation{%
  \department{School of Computer Science and Engineering}
  \institution{Sun Yat-sen University}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China}}
\affiliation{%
  \institution{Peng Cheng Laboratory}
  \city{Shenzhen}
  \state{Guangdong}
  \country{China}}
\affiliation{ 
  \institution{Key Laboratory of Machine Intelligence and Advanced Computing}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China} 
}
\email{chenjk36@mail2.sysu.edu.cn}

\author{Ling Deng}
\orcid{0009-0002-5607-8192}
\email{denglingl@chinaunicom.cn}
\author{Zhiyong Gan}
\orcid{0009-0008-6451-1068}
\email{ganzy1@chinaunicom.cn}
\affiliation{%
  \institution{China United Network Communications Corporation Limited Guangdong Branch}
  % \streetaddress{1 Th{\o}rv{\"a}ld Circle}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China}}



\author{Wei-Shi Zheng}
\orcid{0000-0001-8327-0003}
\affiliation{%
  \department[0]{School of Computer Science and Engineering}
  \department[1]{Key Laboratory of Machine Intelligence and Advanced Computing}
  \institution{Sun Yat-sen University}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China}}
\email{wszheng@ieee.org}
\author{Ruixuan Wang}
\orcid{0000-0002-8714-0369}
\authornote{Corresponding author.}
\affiliation{%
  \department{School of Computer Science and Engineering}
  \institution{Sun Yat-sen University}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China}}
\affiliation{%
  \institution{Peng Cheng Laboratory}
  \city{Shenzhen}
  \state{Guangdong}
  \country{China}}
\affiliation{ 
  \institution{Key Laboratory of Machine Intelligence and Advanced Computing}
  \city{Guangzhou}
  \state{Guangdong}
  \country{China} 
}
\email{wangruix5@mail.sysu.edu.cn}
%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{Jiankang Chen, Ling Deng, Zhiyong Gan, Wei-Shi Zheng, Ruixuan Wang}
%%


%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle

\section{Datasets details}
\subsection{CIFAR benchmarks}
\begin{table*}[ht]
\centering
\caption{OOD detection performance on the CIFAR10 and the CIFAR100(ID) benchmarks with model backbone ResNet34. $\uparrow$ indicates that larger values are better and $\downarrow$ indicates that smaller values are better. The best and second-best results are indicated in \textbf{bold} and \underline{underline}. All values are percentages.}
\label{tab:cifar10_res34}
\resizebox{\textwidth}{!}{%
\begin{tabular}{cccccccccccccccc}
\toprule
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}ID Dataset \\ Model\end{tabular}} & \multirow{3}{*}{Method} & \multicolumn{12}{c}{OOD Datasets} & \multicolumn{2}{c}{\multirow{2}{*}{Average}} \\ \cline{3-14}
& & \multicolumn{2}{c}{SVHN} & \multicolumn{2}{c}{LSUN-R} & \multicolumn{2}{c}{LSUN-C} & \multicolumn{2}{c}{iSUN} & \multicolumn{2}{c}{Textures} & \multicolumn{2}{c}{Places365} \\
& & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ \\ \midrule
\multirow{17}{*}{\begin{tabular}[c]{@{}c@{}}CIFAR10\\ResNet34\end{tabular}} 
 & MSP & 33.79 & 94.18 & 42.62 & 92.31 & 20.47 & 96.86 & 45.17 & 91.71 & 49.95 & 89.66 & 53.69 & 87.84 & 40.95 & 92.09 \\
 & Mahalanobis & 41.38 & 94.10 & 52.67 & 92.54 & 92.99 & 88.48 & 52.26 & 92.41 & 38.95 & 94.24 & 55.89 & 90.19 & 55.69 & 91.99 \\
 & ODIN & 43.80 & 86.52 & 23.66 & 93.70 & 7.03 & 98.74 & 26.03 & 93.18 & 45.73 & 83.19 & 52.08 & 82.55 & 33.06 & 89.64 \\
 & DICE & 36.67 & 90.64 & 35.13 & 92.79 & 6.67 & 98.70 & 40.95 & 90.92 & 50.73 & 86.60 & 49.65 & 84.70 & 36.63 & 90.72 \\ 
 & ViM & 29.66 & 95.06 & 38.13 & 94.00 & 49.00 & 93.84 & 37.49 & 93.84 & 28.19 & 94.94 & 47.58 & 90.78 & 38.35 & 93.75 \\
 & Energy & 20.65 & 95.25 & 25.64 & 94.13 & 5.32 & 99.05 & 28.17 & 93.46 & 38.83 & 88.91 & 41.49 & 88.26 & 26.69 & 93.17 \\
 & BATS & 26.37 & 94.75 & 30.02 & 93.72 & 10.67 & 98.30 & 32.47 & 92.90 & 37.71 & 91.71 & 41.84 & 90.27 & 29.85 & 93.60 \\ 
 & ReAct & 24.19 & 94.40 & 26.69 & 94.16 & 6.72 & 98.87 & 29.18 & 93.46 & 39.50 & 89.33 & 40.30 & 89.55 & 27.76 & 93.29 \\ 
 & DICE+ReAct & 38.22 & 90.64 & 34.83 & 92.79 & 6.67 & 98.70 & 40.95 & 89.92 & 50.73 & 87.17 & 49.67 & 84.72 & 36.85 & 90.72 \\
 & FeatureNorm & \underline{4.01} & 99.18 & 44.30& 91.80 & \underline{0.53} & \textbf{99.87} & 37.73 & 93.25 & 25.32 &  94.19& 66.69 & 80.28 & 29.76 & 93.10 \\
 & LINe & 27.64 & 94.98 & 54.03 & 88.20 & 3.41 & 99.31 & 56.53 & 87.04 & 54.26 & 86.95 & 61.93 & 80.02 & 42.97 & 89.42 \\
\cline{2-16}
 & CSI & 19.65 & 96.07 & 50.37 & 90.12 & 20.70 & 95.75 & 45.20 & 90.78 & 25.30 & 94.57 & 64.11 & 80.81 & 37.56 & 91.35 \\
 & SSD+ & \textbf{0.45} & \textbf{99.66} & 15.27 & 96.25 & 3.04 & 98.64 & 16.80 & 96.10 & \underline{12.98} & 97.10 & \underline{18.24} & \underline{96.10} & \underline{11.13} & \underline{97.31} \\
% & KNN+ & 2.70 & 99.61 & 19.56 & 96.58 & 7.89 & 98.01 & 24.56 & 96.21 & 10.11 & 97.43 & 23.05 & 94.88 & 14.65 & 97.28\\
 & VOS & 27.93 & 93.55 & 21.64 & 96.03 & 8.34 & 98.36 & 26.00 & 95.20 & 37.84 & 91.57 & 40.89 & 89.53 & 27.01 & 94.04 \\
 & LogitNorm & 17.40 & 96.96 & \underline{11.04} & \underline{97.92} & \textbf{0.48} & \underline{99.80} & \underline{11.39} & \underline{97.86} & 31.51 & 94.71 & 29.90 & 94.31 & 16.95 & 96.93 \\
 & NPOS & 13.28 & 97.32 & 17.76 & 96.38 & 3.10 & 98.30 & 15.44 & 96.99 & 26.64 & 95.74 & 35.77 & 93.21 & 18.67 & 96.32\\
 & CIDER & 4.93 & \underline{99.22} & 21.45 & 96.53 & 2.99 & 99.33 & 22.69 & 96.43 & 15.37 & \underline{97.56} & 29.16 & 94.42 & 16.10 & 97.25 \\
 % \cline{2-16}
& \cellcolor[HTML]{EFEFEF}\textbf{FodFoM (Ours)} & \cellcolor[HTML]{EFEFEF}5.46 &\cellcolor[HTML]{EFEFEF}98.87 & \cellcolor[HTML]{EFEFEF}\textbf{9.38} &\cellcolor[HTML]{EFEFEF}\textbf{98.24} & \cellcolor[HTML]{EFEFEF}7.02 & \cellcolor[HTML]{EFEFEF}98.53 & \cellcolor[HTML]{EFEFEF}\textbf{8.85} & \cellcolor[HTML]{EFEFEF}\textbf{98.32} &\cellcolor[HTML]{EFEFEF}\textbf{3.62} &\cellcolor[HTML]{EFEFEF}\textbf{99.22} & \cellcolor[HTML]{EFEFEF}\textbf{8.82} & \cellcolor[HTML]{EFEFEF}\textbf{98.22} & \cellcolor[HTML]{EFEFEF}\textbf{7.19} & \cellcolor[HTML]{EFEFEF}\textbf{98.57} \\
 \hline
  \multirow{17}{*}{\begin{tabular}[c]{@{}c@{}}CIFAR100\\ ResNet34\end{tabular}} 
  & MSP & 81.31 & 77.61 & 74.39 & 82.08 & 76.52 & 80.65 & 75.99 & 80.96 & 81.72 & 76.90 & 79.81 & 77.30 & 78.29 & 79.25 \\
 & Mahalanobis & 98.81 & 54.62 & 97.38 & 53.35 & 99.48 & 36.04 & 94.64 & 58.99 & 75.67 & 76.95 & 97.17 & 51.33 & 93.86 & 55.21 \\
 & ODIN & 85.55 & 76.83 & 38.18 & 92.96 & 68.48 & 84.16 & 41.92 & 91.84 & 71.19 & 79.28 & 78.74 & 75.54 & 64.01 & 83.44 \\
  & Energy & 76.73 & 81.52 & 58.02 & 88.40 & 61.12 & 85.88 & 61.82 & 86.95 & 80.73 & 77.17 & 78.06 & 76.72 & 69.41 & 83.64 \\
   & ViM & 75.80 & 82.31 & \textbf{37.70} & \textbf{93.40} & 89.49 & 73.78 & \textbf{38.51} & \textbf{93.01} & 49.47 & 89.33 & 78.09 & 78.17 & 61.51 & 85.00 \\
 & DICE & 53.65 & 89.97 & 85.83 & 76.80 & 32.29 & 93.49 & 86.01 & 77.83 & 68.17 & 82.49 & 82.90 & 76.31 & 68.14 & 83.53 \\ 
 & BATS & 60.48 & 89.50 & \underline{38.64} & \underline{92.96} & 65.69 & 84.84 & \underline{40.80} & 92.29 & 59.65 & 86.59 & 75.13 & 78.96 & 56.73 & 87.52 \\ 
 & ReAct & 43.10 & 92.22 & 41.60 & 91.58 & 52.61 & 87.32 & 41.95 & 91.29 & 53.31 & 87.65 & \underline{70.80} & \underline{79.73} & 50.56 & 88.30 \\ 
 & DICE+ReAct & 48.18 & 91.19 & 84.17 & 78.80 & 32.01 & 93.71 & 82.23 & 79.65 & 66.74 & 83.96 & 80.26 & 78.04 & 65.61 & 84.22 \\
  & FeatureNorm & 21.02 & 95.61 & 97.06 & 67.20 & \textbf{9.87} & \underline{98.21} & 91.79 & 73.87 & 45.23 & 84.79 & 92.64 & 61.02 & 59.60 & 80.12 \\
 & LINe & 39.97 & 91.17 & 61.02 & 86.90 & 26.51 & 94.02 & 62.62 & 86.16 & 55.18 & 86.80 & 81.81 & 72.90 & 54.52 & 86.32 \\
 \cline{2-16}
 & CSI & 44.53 & 92.65 & 86.12 & 77.34 & 75.58 & 83.78 & 76.62 & 84.98 & 61.61 & 86.47 & 79.08 & 76.27 & 70.59 & 83.58\\
 & SSD+& \underline{20.92} & \underline{96.42} & 75.06 & 86.01 & 37.95 & 93.55 & 80.97 & 83.73 & 54.24 & 90.23 & 78.75 & 79.64 & 57.98 & 88.26 \\
 & VOS & 83.52 & 81.24 & 77.81 & 78.50 & 79.40 & 80.39 & 78.34 & 78.23 & 84.35 & 77.81 & 79.77 & 78.01 & 80.53 & 79.03 \\
 & LogitNorm & 64.65 & 88.69 & 93.39& 69.28 & \underline{10.57} & \textbf{98.22} & 94.39 & 68.36 & 81.56 & 74.45 & 80.30 & 77.59 & 70.81 & 79.44 \\
 & NPOS & \textbf{18.56}&\textbf{97.14} & 48.37 & 89.33 & 42.59 & 89.71 & 47.61 & 88.52 & \underline{38.14} & \underline{92.65} & 79.75 & 72.52 & \underline{45.84} & \underline{88.31}\\
 & CIDER & 23.09 & 95.16 & 69.50 & 81.85 & 16.16 & 96.33 & 71.68 & 82.98 & 43.87 & 90.42 & 79.63 & 73.43 & 50.66 & 86.70 \\
 &\cellcolor[HTML]{EFEFEF}\textbf{FodFoM (Ours)} & \cellcolor[HTML]{EFEFEF}44.05 & \cellcolor[HTML]{EFEFEF}91.12 &\cellcolor[HTML]{EFEFEF}39.34 & \cellcolor[HTML]{EFEFEF}92.66 & \cellcolor[HTML]{EFEFEF}71.38 & \cellcolor[HTML]{EFEFEF}86.89 & \cellcolor[HTML]{EFEFEF}41.19 & \cellcolor[HTML]{EFEFEF}\underline{92.36} & \cellcolor[HTML]{EFEFEF}\textbf{31.47} & \cellcolor[HTML]{EFEFEF}\textbf{94.00} & \cellcolor[HTML]{EFEFEF}\textbf{40.30} & \cellcolor[HTML]{EFEFEF}\textbf{91.04} &\cellcolor[HTML]{EFEFEF}\textbf{44.62} & \cellcolor[HTML]{EFEFEF}\textbf{91.34} \\ \bottomrule
\end{tabular}
}
\end{table*}

\begin{table*}[ht]
\centering
\caption{Comparison between different methods in OOD detection on ImageNet100 Benchmark with two different model backbones. $\uparrow$ indicates that larger values are better and $\downarrow$ indicates that smaller values are better. The best and second-best results are indicated in \textbf{bold} and \underline{underline}. All values are percentages.}
\label{tab:imagenet1k} 
\resizebox{\textwidth}{!}{%
\begin{tabular}{cccccccccccc}
\hline
\multirow{3}{*}{\begin{tabular}[c]{@{}c@{}}ID Dataset\\ Model\end{tabular}} & \multirow{3}{*}{Method} & \multicolumn{8}{c}{OOD Datasets} & \multicolumn{2}{c}{\multirow{2}{*}{Average}} \\ \cline{3-10}
 &  & \multicolumn{2}{c}{iNaturalist} & \multicolumn{2}{c}{SUN} & \multicolumn{2}{c}{Places} & \multicolumn{2}{c}{Textures} &  &  \\ \cline{3-12} 
 &  & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ & FPR95$\downarrow$ & AUROC$\uparrow$ \\ \hline
\multirow{16}{*}{\begin{tabular}[c]{@{}c@{}}ImageNet100\\ ResNet50\end{tabular}} & MSP & 69.28 & {85.84}     & 70.14 & {84.20}           & 69.43 &                                                                                                    {84.29}           & 64.27 & {84.09} & 68.28 & {84.60}     \\
& ODIN                    & 44.22 &{92.42} & 54.71 & {88.94} & 57.52 & {88.01} & 42.87 & {89.76} & 49.83 & {89.78} \\
& Mahalanobis             & 96.60 & {45.22} & 98.01 & {42.55} & 97.77 & {43.87} & 38.44 & {87.88} & 82.70 & {54.88}                               \\
& Energy                  & 64.60                 & {89.08}      & 62.70                & {88.03}      & 60.70                   & {87.78}   &51.38         & {87.89}           & 59.85 & {88.20}           \\
& ViM                     & 84.92 & {81.92} & 83.18 & {81.47} & 81.45 & {81.55} & 20.00 & {96.07} & 67.39 & {85.25} \\
& BATS                    & 43.05 & {92.65}           & 58.75 & {87.83}           & 57.72 & {87.43}           & 38.88 & {91.97}           & 49.60 & {89.97}   \\
& DICE & 35.08 & {93.29}           & 36.89 & {92.53}           & 43.71 & {90.66}           & 31.84 & {92.08}           & 36.46 & {92.11}   \\
& ReAct                   & 30.60 & {94.40}           & 47.55 & {89.99}           & 47.21 & {89.53}           & 50.89 & {87.57}           & 44.06 & {90.45}           \\
& DICE+ReAct                & \textbf{26.75} & {\textbf{94.69}}           & 35.99 & {92.40}           & 43.48 & {90.52}           & 32.76 & {91.94}           & \underline{34.75} & {92.39}   \\
& FeatureNorm & 65.14 & 83.59 & 64.79 & 83.07 & 72.88 & 78.78 & 38.76 & 89.78 & 60.39 & 83.80 \\
& LINe & \underline{27.38} & \underline{94.64} & 37.28 & 91.95 & 42.32 & 90.44 & 36.78 & 91.01 & 36.19 & 92.01 \\
\cline{2-12}
& CSI &69.00& 89.04 & 64.95& 83.35 & 64.93 & 84.23 & 31.20 & 94.31 & 57.52 & 87.73 \\
& SSD+ & 43.55 & 93.09 & 48.80 & 91.37 & 57.60 & 88.38 & \textbf{8.03} & \textbf{97.42} & 39.50 & \underline{92.56}\\
& VOS & 56.40 & 88.92 & 47.50 & 90.00 & 63.20 & 87.69 & 64.30 & 85.68 & 57.85 & 88.07\\
& LogitNorm & 39.73& 93.09& \underline{34.08} & \underline{93.10}& \underline{37.78}& \underline{92.53}& 42.99& 90.75&38.64 &92.37\\
& NPOS & 32.45 & 94.50 & 39.47 & 92.56 & 47.78 & 90.55 & 24.82 & 91.30 & 36.13 & 92.23\\
& CIDER & 72.68 & 85.95 & 44.68 & 92.22 & 53.52 & 90.30 & \underline{21.22} & \underline{95.97} & 48.03 & 91.11\\
& \cellcolor[HTML]{EFEFEF}\textbf{FodFoM (Ours)} & \cellcolor[HTML]{EFEFEF}35.22 &\cellcolor[HTML]{EFEFEF}94.53 &\cellcolor[HTML]{EFEFEF}\textbf{33.91} &\cellcolor[HTML]{EFEFEF}\textbf{93.81} & \cellcolor[HTML]{EFEFEF}\textbf{33.06} & \cellcolor[HTML]{EFEFEF}\textbf{93.52} & \cellcolor[HTML]{EFEFEF}31.56 & \cellcolor[HTML]{EFEFEF}93.28 &\cellcolor[HTML]{EFEFEF}\textbf{33.44} & \cellcolor[HTML]{EFEFEF}\textbf{93.79} \\ \hline

\multirow{18}{*}{\begin{tabular}[c]{@{}c@{}}ImageNet100\\ ResNet101\end{tabular}} & MSP & 60.74 & 88.66 & 61.29 & 87.23 & 61.87 & 86.70 & 57.80 & 86.61 & 60.43 & 87.30\\
& ODIN & 53.00 & 89.52 & 66.90 & 85.01 & 70.40 & 82.77 & 48.40 & 89.19 & 59.67 & 86.62 \\
& Mahalanobis & 93.63 & 59.38 & 95.59 & 56.32 & 94.73 & 57.83 & 31.83 & 92.34 & 78.83 & 66.47\\
& Energy & 58.86 & 90.54 & 53.04 & 90.61 & 52.50 & 89.76 & 43.53 & 90.39 & 51.98 & 90.33 \\
& ViM & 72.40 & 84.88 & 73.80 & 83.99 & 76.20 & 81.54 & \underline{22.20} & \underline{95.63} & 61.15 & 86.51 \\
& BATS & 48.54 & 90.92 & 64.55 & 86.18 & 64.38 & 85.15 & 42.59 & 89.84 & 55.02 & 88.02\\
& DICE & 39.83 & 93.20 & 36.58 & 93.08 & 42.91 & 91.31 & 28.78 & 93.02 & 37.02 & 92.65\\
& ReAct & 44.66 & 92.95 & 53.17 & 89.20 & 51.92 & 88.67 & 44.93 & 90.59 & 48.67 & 90.35\\
& DICE+ReAct & \textbf{30.12} & 93.28 & 41.37 & 90.84 & 47.24 & 88.68 & 28.14 & 93.35 & 36.72 & 91.67\\
& FeatureNorm & 55.49 & 84.89 & 69.09 & 83.14 & 75.92 & 78.94 & 40.98 & 87.58 & 60.37 & 83.64\\
& LINe & 53.31 & 91.22 & 55.15 & 89.72 & 55.55 & 89.41 & 54.01 & 87.74 & 54.50 & 89.52\\
\cline{2-12}
& CSI & 54.25 & 87.76 & 65.13 & 82.73 & 64.68 & 84.67 & 40.27 & 90.76 & 56.08 & 86.48\\
& SSD+ & 39.06 & 93.76 & 47.30 & 91.90 & 56.46 & 88.21 & \textbf{7.97} & \textbf{97.44} & 37.70 & 92.83\\
& VOS & 54.68 & 89.74 & 41.67 & 91.54 & 63.71 & 88.97 & 67.92 & 84.34 & 56.99 & 88.65 \\
& LogitNorm & 37.59 & \underline{93.56}& \underline{29.66} & \underline{94.64} & \underline{35.93}& \underline{93.05}&41.65 &90.51 &36.21 &\underline{92.94} \\
& NPOS & 43.51 & 91.41 & \textbf{16.28} & \textbf{95.84} & 37.84 & 92.77 & 46.24 & 89.97 & \underline{35.97} & 92.50 \\
& CIDER & 72.42 & 85.52 & 48.13 & 90.76 & 57.38 & 88.68 & 23.33 & 95.51 & 50.31 & 90.12\\
& \cellcolor[HTML]{EFEFEF}\textbf{FodFoM (Ours)} &\cellcolor[HTML]{EFEFEF}\underline{34.79}& \cellcolor[HTML]{EFEFEF}\textbf{94.73} & \cellcolor[HTML]{EFEFEF}35.74 & \cellcolor[HTML]{EFEFEF}93.70 & \cellcolor[HTML]{EFEFEF}\textbf{31.97} &\cellcolor[HTML]{EFEFEF}\textbf{93.75}& \cellcolor[HTML]{EFEFEF}34.22 &\cellcolor[HTML]{EFEFEF}92.72& \cellcolor[HTML]{EFEFEF}\textbf{34.18} & \cellcolor[HTML]{EFEFEF}\textbf{93.73} \\ \hline
\end{tabular}%
}
\end{table*}

For the CIFAR10 and CIFAR100 benchmarks, we used the following OOD datasets:\\
\textbf{SVHN}: The SVHN (Street View House Numbers) dataset contains house numbers extracted from images taken from Google Street View. In our study, we use the full test set of SVHN (consisting of 26,032 images) as out-of-distribution (OOD) examples.\\
\textbf{LSUN}: LSUN is a dataset focusing on scene understanding, which mainly consists of images depicting various scenes such as bedroom, house, living room, classroom, etc. LSUN-C and LSUN-R are image datasets obtained by cropping and resizing the original images, respectively. For our purpose, we randomly selected 10,000 images from each of LSUN-C and LSUN-R as out-of-distribution examples.\\
\textbf{Places365}: Places365 is a dataset that collects a large number of photos with scenes categorized into 365 different scene categories. The test set of this dataset consists of 900 images from each category. For evaluation, we randomly select 100 images from each category. These images serve as a representative sample to evaluate the performance of an algorithm or technique on the various scene categories in the Places365 dataset.\\
\textbf{Textures}: The Describable Textures Dataset (DTD) is a collection of texture and abstract pattern images containing 5,640 images categorized into 47 classes based on human perception. Since there are no categories overlapping with CIFAR, we used the entire Textures dataset.\\
\textbf{iSUN}: iSUN is an extensive eye-tracking dataset containing 20,608 images of natural scenes from the SUN database. We carefully extracted 8,925 images from iSUN to ensure that they had no conceptual overlap with CIFAR to serve as out-of-distribution examples. These selected images provide unique visual content for our analysis, allowing us to evaluate performance beyond the scope of CIFAR.\\
\subsection{ImageNet100 benchmark}
ImageNet100, used as the ID dataset, contains 100 classes randomly selected from ImageNet-1k. The categories are as follows:\\
n01986214, n04200800, n03680355, n03208938, n02963159, n03874293,\\
n02058221, n04612504, n02841315, n02099712, n02093754, n03649909,\\ n02114712, n03733281, n02319095, n01978455, n04127249, n07614500,\\ n03595614, n04542943, n02391049, n04540053, n03483316, n03146219,\\ n02091134, n02870880, n04479046, n03347037, n02090379, n10148035,\\
n07717556, n04487081, n04192698, n02268853, n02883205, n02002556,\\ n04273569, n02443114, n03544143, n03697007, n04557648, n02510455,\\ n03633091, n02174001, n02077923, n03085013, n03888605, n02279972,\\ n04311174, n01748264, n02837789, n07613480, n02113712, n02137549,\\
n02111129, n01689811, n02099601, n02085620, n03786901, n04476259,\\
n12998815, n04371774, n02814533, n02009229, n02500267, n04592741,\\ n02119789, n02090622, n02132136, n02797295, n01740131, n02951358,\\ n04141975, n02169497, n01774750, n02128757, n02097298, n02085782,\\
n03476684, n03095699, n04326547, n02107142, n02641379, n04081281,\\
n06596364, n03444034, n07745940, n03876231, n09421951, n02672831,\\
n03467068, n01530575, n03388043, n03991062, n02777292, n03710193,\\
n09256479, n02443484, n01728572, n03903868.\\
For the large-scale setting, we use the following OOD test datasets for the ImageNet100 benchmark, whose selected categories are disjoint from those of ImageNet100.\\
\textbf{iNaturalist} is a dataset containing images of the natural world. It consists of 13 super-categories and 5,089 sub-categories covering plants, insects, birds, mammals, and more. In particular, we used a subset from iNaturalist, which consists of 110 plant categories, to ensure that there is no overlap with the categories in ImageNet100. For evaluation, we randomly selected 10,000 images from a library of images independent of ImageNet100. \\
\textbf{SUN} contains 899 categories covering a wide range of indoor, urban, and natural places with and without human presence, of which 397 carefully selected categories are used for evaluating the performance of scene recognition algorithms. For evaluation, 10,000 images were randomly selected from a database independent of ImageNet100. \\
\textbf{Places} is an extensive dataset of scene photos. The dataset consists of labeled photographs belonging to different scene semantic categories in three macroclasses: indoor, nature and urban. For evaluation, we randomly selected 10,000 images from an ImageNet100-independent image library.\\
\textbf{Textures} is a collection of texture and abstract pattern images containing 5,640 images categorized into 47 classes based on human perception. Since there are no classes overlapping with ImageNet100, we used the entire Textures dataset.\\
\section{More Experiment Results}
For CIFAR benchmarks, to demonstrate the robustness of our method in model selection, we report the average performance on the ResNet34 architecture in the main paper, and our performance on the six OOD datasets on the ResNet34 architecture is shown in Table~\ref{tab:cifar10_res34}.

For large-scale datasets, we utilized the ImageNet100 benchmark, and the main paper reported our average performance on ResNet50 and ResNet101. Table~\ref{tab:imagenet1k} reports the OOD detection performance of different strong baselines and ours over the four OOD datasets, which shows that our approach significantly outperforms strong methods such as CIDER, NPOS, and SSD+ on three of the four OOD datasets, and LINe, VOS, and LogitNorm on all OOD datasets, and achieves state-of-the-art average performance.





\section{Exploration of ($\textbf{C+1}$)-th Logit}

In our work, the OOD detection score is calculated based on the first $C$ logits of the classifier head ($h$). Note that the ($C$+1)-th logit represents the OOD information, and to better utilize the OOD information, we take the MSP  score, the most basic score, and discuss it. 
MSP takes the maximum probability value of the logits after the softmax operation as the confidence score that a sample belongs to the ID classes. An overconfident model can produce large maximum probability values for OOD data as well, making OOD data hard to distinguish from ID data. As indicated by MSP in Figure~\ref{fig:MSP_discuss}, the performance of the original MSP is not outstanding. When the model is trained with our generated fake outlier data, the overconfidence problem is effectively mitigated and state-of-the-art performance is achieved on all benchmarks, as indicated by MSP (Pre-C) in Figure~\ref{fig:MSP_discuss}, which denotes the MSP score computed from the first $C$ logits. Since the ($C$+1)-th logit represents the OOD information, its softmax probability will be smaller for ID data and larger for OOD data. Therefore, dividing the MSP computed from the first $C$ logits by the probability of the ($C$+1)-th logit further widens the score gap between ID and OOD data. This is confirmed by MSP (Pre-C/C+1) in Figure~\ref{fig:MSP_discuss}, which outperforms MSP (Pre-C) and achieves state-of-the-art performance on all benchmarks.

\begin{figure}[!tbh]
    \centering
    \includegraphics[width=0.49\linewidth, height=0.38\linewidth]{Pictures/msp_more_auroc.pdf}
    \includegraphics[width=0.49\linewidth, height=0.38\linewidth]{Pictures/msp_more_fpr95.pdf}
    \caption{Comparison of MSP, MSP (Pre-C) and MSP (Pre-C/C+1).
    % where MSP means the baseline MSP, MSP (Pre-C) means that MSP score calculated by taking the pervious C logits in our framework. MSP (Pre-C/C+1) means that the score is computed by dividing the MSP computation of the previous C logits by the probability of the ($C$+1)-th dimension in our framework. 
    Three benchmarks include CIFAR10 (ID) and CIFAR100 (ID) with ResNet18 backbone and ImageNet100 (ID) with ResNet50 backbone. All values are average percentages over several OOD datasets.
    }
    \label{fig:MSP_discuss}
\end{figure}

\section{Generalization of Our Framework}
To study how well our framework generalizes to different metrics for selecting OOD embeddings among the clusters of text embeddings, we also compared our cosine similarity metric with two other metrics, Euclidean distance and Mahalanobis distance, in Table~\ref{tab:ablation_metric}. The Euclidean distance calculates the difference between the corresponding elements of two vectors, while the Mahalanobis distance utilizes the covariance to measure the similarity of two vectors. It can be found that each metric is able to achieve state-of-the-art performance, which demonstrates the robustness of our method to the choice of similarity metric.


\begin{table}[!bht]
\centering
\caption{Generalization study of the metric for selecting text OOD embeddings. Results are averaged across six OOD datasets.} 
\label{tab:ablation_metric}
\resizebox{\linewidth}{!}{%
\begin{tabular}{ccccc} 
\toprule
 \multirow{4}{*}{Metric Ablation}& \multicolumn{4}{c}{ResNet18}\\\cline{2-3} \cline{4-5}
 &\multicolumn{2}{c}{CIFAR10} &\multicolumn{2}{c}{CIFAR100} \\ 
& \multicolumn{2}{c}{Average} & \multicolumn{2}{c}{Average}\\
& FPR95$\downarrow$ & AUROC$\uparrow$& FPR95$\downarrow$ & AUROC$\uparrow$  \\ \hline
Euclidean distance& 9.80 & 98.16 & \textbf{32.96} & 93.65 \\
Mahalanobis distance& 9.23 & 98.22 & 38.50 & 92.87 \\
\textbf{Cosine similarity}& \textbf{8.43} & \textbf{98.33} & 33.17 & \textbf{93.78} \\
\bottomrule
\end{tabular}%
}
\end{table}
Regarding the choice of the image captioning model, Table~\ref{tab:ablation_caption} additionally reports results on the larger-scale ImageNet100 benchmark when image captions are generated with the GIT model. Its performance is comparable to that of BLIP-2, which verifies the robustness of our method to the choice of image captioning model.
\begin{table}[!tbh]
\centering
\caption{Generalization study of different image captioning models used to generate captions for ID images.} 
\label{tab:ablation_caption}
\resizebox{\linewidth}{!}{%
\begin{tabular}{ccccc} 
\toprule
 \multirow{4}{*}{Image Captioning Model}& \multicolumn{4}{c}{ImageNet100}\\\cline{2-3} \cline{4-5}
 &\multicolumn{2}{c}{ResNet50} &\multicolumn{2}{c}{ResNet101} \\ 
& \multicolumn{2}{c}{Average} & \multicolumn{2}{c}{Average}\\
& FPR95$\downarrow$ & AUROC$\uparrow$& FPR95$\downarrow$ & AUROC$\uparrow$  \\ \hline
GIT & 34.36 & 93.60 & \textbf{31.92} & \textbf{93.76} \\
\textbf{BLIP-2}& \textbf{33.44} & \textbf{93.79} & 34.18 & 93.73 \\
\bottomrule
\end{tabular}%
}
\end{table}
\subsection{Comparison with Dream-OOD}
The recently proposed Dream-OOD also utilizes a diffusion model to generate OOD data for OOD detection. However, Dream-OOD needs to train a text-conditional space based on single-word class names (and thus fails for class names consisting of multiple words), % and too little semantic information for a single word), 
and then constructs fake embeddings based on learned visual representations. % utilizing the same methodology as NPOS~\cite{NPOS} sampling (secondary filtering of features required) to allow the diffusion model to generate outlier images. 
In contrast, our method utilizes BLIP-2 (more semantically informative, and not limited to single-word class names), 
CLIP's Text Encoder (no extra training required, and more effective construction of fake OOD text embeddings), %text latent space pretrained-free and fake OOD embeddings generation without further filtering), 
Stable Diffusion (generated fake OOD images semantically closer to ID images), and GroundingDINO (constructing background images as fake OOD data), % by considering the current background factors affecting OOD detection), 
to jointly generate more challenging fake OOD images to guide model learning. To further validate the superiority of our method, we adopt the setup of Dream-OOD for empirical comparison, including its selected ImageNet100 dataset (ImageNet100-Dream) as ID data and ResNet34 as the backbone. As shown in Table~\ref{tab:dream-ood}, our method achieves state-of-the-art average performance, even with fewer fake OOD images for model training (ours 78,000 vs.\ Dream-OOD 100,000).
\begin{table}[!tb]
\centering
\caption{Comparison between different methods in OOD detection on the ImageNet100-Dream benchmark with a ResNet34 backbone. All values are percentages.}
\label{tab:dream-ood}
\resizebox{\linewidth}{!}{%
\begin{tabular}{cccccccc}
\toprule
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}} 
 OOD Dataset\end{tabular}} & \multirow{2}{*}{Metrics} & \multicolumn{6}{c}{Methods} \\ \cline{3-8} 
 &  & ReAct & FeatureNorm & VOS & NPOS & Dream-OOD & FodFoM (Ours)  \\ \hline
\multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}iNaturalist\end{tabular}}} & \multicolumn{1}{c|}{FPR95$\downarrow$} & 31.01 & 62.36 & 43.00 & 53.84 & \textbf{24.10} & 35.75  \\
\multicolumn{1}{c|}{} & \multicolumn{1}{c|}{AUROC$\uparrow$} & 94.62 & 84.03 & 93.77 & 86.52 & \textbf{96.10} & 93.13 \\\hline
\multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}SUN\end{tabular}}} & \multicolumn{1}{c|}{FPR95$\downarrow$} & 43.16 & 74.49 & 39.40 & 53.54 & 36.88 & \textbf{33.91}  \\
\multicolumn{1}{c|}{} & \multicolumn{1}{c|}{AUROC$\uparrow$} & 91.45 & 79.37 & 93.17 & 87.99 & 93.31 & \textbf{93.62} \\\hline
\multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Places\end{tabular}}} & \multicolumn{1}{c|}{FPR95$\downarrow$} & 44.90 & 65.13 & 47.60 & 59.66 & 39.87 & \textbf{39.23}  \\
\multicolumn{1}{c|}{} & \multicolumn{1}{c|}{AUROC$\uparrow$} & 90.70 & 83.78 & 91.77 & 83.50 & \textbf{93.11} & 92.02 \\\hline
\multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Textures\end{tabular}}} & \multicolumn{1}{c|}{FPR95$\downarrow$} & 43.79 & 40.46 & 66.10 & \textbf{8.98} & 53.99 & 42.13  \\
\multicolumn{1}{c|}{} & \multicolumn{1}{c|}{AUROC$\uparrow$} & 90.53 & 89.31 & 81.42 & \textbf{98.13} & 85.56 & 91.31 \\\hline
\multicolumn{1}{c|}{\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}Average\end{tabular}}} & \multicolumn{1}{c|}{FPR95$\downarrow$} & 40.72 & 61.33 & 49.02 & 44.00 & 38.76 & \textbf{37.75}  \\
\multicolumn{1}{c|}{} & \multicolumn{1}{c|}{AUROC$\uparrow$} & 91.83 & 84.12 & 90.03 & 89.04 & 92.02 & \textbf{92.52} \\
\bottomrule
\end{tabular}%
}
\end{table}
\begin{figure}[!tbh]
\centering
\includegraphics[width=0.93\linewidth]{Pictures/GroundingDINO.pdf}
\caption{Visualizations of background images generated by GroundingDINO and blurring of foreground regions. ID images are from ImageNet100.}
\label{fig:image_back}
\end{figure}



\section{Visualizations of Background Images as Fake OOD Images}
In Figure~\ref{fig:image_back}, we visualize ID images from ImageNet100 together with the corresponding background images obtained by GroundingDINO detection followed by a blurring operation. With the powerful detection capability of GroundingDINO, ID objects in images can be detected, and the detected regions are then blurred to destroy the ID semantic information, so that the retained background information serves as fake outlier images. Introducing these background images in model regularization alleviates the model's overconfidence on OOD images whose backgrounds are similar to those of ID images.





\end{document}
\endinput
%%
%% End of file `sample-sigconf.tex'.
