\section*{ Appendix }
\section{Additional definitions, assumptions, and theorems for the SCE module}

\subsection{Sample reweighting}
 Assumption~\ref{ass1} states that, across distributions $P'$ sampled from $\mathcal{P}$, the correlation between $\mathrm{FM}_{\mathrm{rob}}$ and $g(\mathrm{FM}_{\mathrm{rob}})$ varies less than that between $\mathrm{FM}_{\mathrm{fal}}$ and $g(\mathrm{FM}_{\mathrm{rob}})$.
\begin{assumption}
\label{ass1}
For all $P'$ uniformly sampled from $\mathcal{P}$, the following condition holds: $\mathrm{Var}[\mathbb{E}_{P'}(\mathrm{FM}_{\mathrm{rob}}\,g(\mathrm{FM}_{\mathrm{rob}}))]<\mathrm{Var}[\mathbb{E}_{P'}(\mathrm{FM}_{\mathrm{fal}}\,g(\mathrm{FM}_{\mathrm{rob}}))]$.
\end{assumption}

Theorem~\ref{the1}~\citep{he2023covariate} demonstrates that, even when sample weighting is applied randomly, the estimate of $\beta_{\mathrm{FM}_{\mathrm{rob}}}$ is more robust and exhibits less variation than that of $\beta_{\mathrm{FM}_{\mathrm{fal}}}$.
\begin{theorem}
\label{the1}
Let $\tilde{\beta}_{\mathrm{FM}_{\mathrm{rob}}}$ and $\tilde{\beta}_{\mathrm{FM}_{\mathrm{fal}}}$ be the components of $\tilde{\beta}$ corresponding to $\mathrm{FM}_{\mathrm{rob}}$ and $\mathrm{FM}_{\mathrm{fal}}$, respectively. Under Assumption~\ref{ass1}, we have $\mathrm{Var}(\tilde{\beta}_{\mathrm{FM}_{\mathrm{rob}}}) < \mathrm{Var}(\tilde{\beta}_{\mathrm{FM}_{\mathrm{fal}}})$.
\end{theorem}


We now define the minimal robust feature set.


\begin{definition}
Minimal robust feature set: A minimal robust feature set for predicting $Y$ under the training distribution $P^\text{train}$ is any subset $\mathrm{FM}_{\mathrm{rob}}$ of $\mathrm{FM}$ that satisfies the following equation while no proper subset of $\mathrm{FM}_{\mathrm{rob}}$ satisfies it:
\begin{equation}
\mathbb{E}_{P^\text{train}}[Y|\mathrm{FM}_{\mathrm{rob}}] = \mathbb{E}_{P^\text{train}}[Y|\mathrm{FM}]. \label{eq5}
\end{equation}
It can also be formulated as:
\[ Y \perp \mathrm{FM}_{\mathrm{fal}} \mid \mathrm{FM}_{\mathrm{rob}}, \]
where $Y \perp \mathrm{FM}_{\mathrm{fal}} \mid \mathrm{FM}_{\mathrm{rob}}$ means that $Y$ and $\mathrm{FM}_{\mathrm{fal}}$ are conditionally independent given $\mathrm{FM}_{\mathrm{rob}}$.
\end{definition}




It has been proven that $\mathrm{FM}_{\mathrm{rob}}$ is the minimal and optimal predictor if and only if $\mathrm{FM}_{\mathrm{rob}}$ is a minimal robust feature set~\citep{xu2022theoretical}. Therefore, we aim to capture the minimal robust feature set $\mathrm{FM}_{\mathrm{rob}}$ for robust prediction. For simplicity, we refer to the minimal robust feature set as the robust feature set.

\begin{definition}
    Sample weighting function: Let $\mathcal{W}$ be the set of sample weighting functions that satisfy:
    \begin{equation}
    \mathcal{W} = \{w\colon \mathrm{FM}\rightarrow \mathbb{R}^+ \mid \mathbb{E}_{P^\text{train}(\mathrm{FM})}[w(\mathrm{FM})] = 1\} .\label{eq6}
    \end{equation}
\end{definition}



Then, for all $w \in \mathcal{W}$, the corresponding weighted distribution is $\widetilde P_w(\mathrm{FM},Y) = w(\mathrm{FM})P^\text{train}(\mathrm{FM},Y)$, and $\widetilde P_{w}$ is well defined with the same support as $P^\text{train}$.
What we actually want is $\mathcal{W}_{\bot}$: the subset of $\mathcal{W}$ under which the features in $\mathrm{FM}$ are mutually independent.


\begin{lemma}
\label{lemma}
Existence of a ``debiased'' weighting function{\normalfont~\citep{zhou2022model}}.
Consider $\mathrm{FM}=[\mathrm{FM}_{\mathrm{rob}}, \mathrm{FM}_{\mathrm{fal}}]$. We want to fit a linear model $\theta^T\mathrm{FM}$ to predict $y$.
Given infinite data in the training dataset $D_\text{train}$, there exists a weight function $w \in \mathcal W $, i.e.,
\begin{equation}
w(\mathrm{FM},y) = \frac{\mathbb{P}(\mathrm{FM}_{\mathrm{rob}},y)\mathbb{P}(\mathrm{FM}_{\mathrm{fal}})}{\mathbb{P}(\mathrm{FM}_{\mathrm{rob}},\mathrm{FM}_{\mathrm{fal}},y)}, \label{eq7}
\end{equation}
such that the solution satisfies that
\begin{equation}
\theta^*(w) = \overline\theta = [\overline\theta_{\mathrm{FM}_{\mathrm{rob}}};0], \label{eq8}
\end{equation}
where $\overline\theta_{\mathrm{FM}_{\mathrm{rob}}}$ is the optimal model that merely uses $\mathrm{FM}_{\mathrm{rob}}$, i.e.,
\begin{equation}
\overline\theta_{\mathrm{FM}_{\mathrm{rob}}} := \mathop{\mathrm{arg\,min}}\limits_{\theta \in \mathbb{R}^{d_{\mathrm{FM}_{\mathrm{rob}}}}} \mathbb{E}[(y-\theta^T\mathrm{FM}_{\mathrm{rob}})^2] \label{eq9}.
\end{equation}
\end{lemma}

\begin{theorem}
\label{theorem}
No matter whether data generation is linear or nonlinear, robust features can be almost perfectly selected if conducting weighted least squares (WLS) using the weighting function in $\mathcal{W}_{\bot}${\normalfont~\citep{2021Why}}.
\end{theorem}







\subsection{SCE}
Here, we use $X$ and $Y$ to denote random variables, and the corresponding reproducing kernel Hilbert spaces (RKHSs) are denoted by $\mathcal{H}_X$ and $\mathcal{H}_Y$.
We define the cross-covariance operator $\Sigma_{XY}$ from $\mathcal{H}_Y$ to $\mathcal{H}_X$:
\begin{align}
    &\langle h_X,\Sigma_{XY} h_Y
    \rangle\nonumber  \\
    &=\mathbb{E}_{XY}[h_X(X)h_Y(Y)]-\mathbb{E}_X[h_X(X)]\mathbb{E}_{Y}[h_Y(Y)]
\end{align}
for all $h_X\in \mathcal{H}_X$ and $h_Y\in \mathcal{H}_Y$. Then, the independence of $X$ and $Y$
can be determined by the following proposition~\citep{fukumizu2007kernel}:
\begin{proposition}
If the product $k_Xk_Y$ of the kernels of $\mathcal{H}_X$ and $\mathcal{H}_Y$ is characteristic, $\mathbb{E}[k_X(X,X)]<\infty$ and $\mathbb{E}[k_Y(Y,Y)]<\infty$, we have that $\Sigma_{XY}=0$ if and only if $X\perp Y$.
\end{proposition}
The Hilbert--Schmidt independence criterion (HSIC)~\citep{gretton2008kernel}, which requires that the squared Hilbert--Schmidt norm of $\Sigma_{XY}$ be equal to zero, can be utilized as a criterion for supervising the elimination of spurious correlations~\citep{bahng2020learning}.











































\begin{figure}[t]
    \centering
    \includegraphics[scale=0.415]{figure/var8.pdf}
    \caption{Variance comparison of different SOTA models.}
    \label{2}
\end{figure}







\section{ Additional experiment results}


\noindent\textbf{Analysis of the performance variance.}
We analyze the performance variance of our model in comparison to that of three representative SOTA models: FinalMLP~\citep{mao2023finalmlp}, FINAL~\citep{zhu2023final}, and GDCN~\citep{wang2023towards}. We carried out 10 experiments for each model on each dataset. As depicted in Figure~\ref{2}, our model consistently outperforms the other methods and exhibits the greatest stability across different datasets.




\input{img/fourier}









\noindent\textbf{Different numbers of random Fourier spaces (RFSs).}
We analyze the influence of using different numbers of RFSs. In Figure~\ref{Fourier2}, the light blue columns show the AUCs obtained using different fixed numbers of RFSs (1--8) on MovieLens and Avazu. Through extensive experiments, we further find that, compared to utilizing a fixed number of RFSs, using a decreasing number of RFSs in each epoch yields further improved performance. The dark blue columns show the performance achieved with decreasing numbers of RFSs, and the horizontal axis represents the maximum value (the number of RFSs uniformly decreases from the maximum value to 1 in each epoch).







\noindent\textbf{Feature correlation visualizations of the S-SR and D-SR streams.}
We randomly chose 50 feature variables from both the Avazu and Frappe datasets. The feature correlation maps of the S-SR and D-SR streams are depicted in Figure~\ref{visual}. As illustrated in the figure, noticeable differences exist between the two feature correlation maps across both datasets.

\begin{figure}[htb]
  \centering
  \begin{subfigure}{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figure/stable3.png}
    \caption{Feature correlation of S-SR on the Avazu dataset.}
    \label{fig:subfig12}
  \end{subfigure}
  \hfill
  \begin{subfigure}{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figure/stbel5.png}
    \caption{Feature correlation of D-SR on the Avazu dataset.}
    \label{fig:subfig22}
  \end{subfigure}
  \vspace{0.5cm}
  \begin{subfigure}{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figure/F_312.png}
    \caption{Feature correlation of S-SR on the Frappe dataset.}
    \label{fig:subfig32}
  \end{subfigure}
  \hfill
  \begin{subfigure}{0.24\textwidth}
    \centering
    \includegraphics[width=\linewidth]{figure/F_322.png}
    \caption{Feature correlation of D-SR on the Frappe dataset.}
    \label{fig:subfig42}
  \end{subfigure}
  \caption{Feature visualization of D-SR and S-SR.}
  \label{visual}
\end{figure}


























\section{Related methods}
We classify the related CTR prediction methods into four types: 
\begin{enumerate}
    \item \textbf{First-Order}: LR~\citep{richardson2007predicting}. It models only first-order features, without feature interactions.
    \item \textbf{Second-Order}: FM~\citep{rendle2010factorization}, AFM~\citep{xiao2017attentional}. They model both first-order and second-order interactions.
    \item \textbf{High-Order}: HOFM~\citep{blondel2016higher}, NFM~\citep{he2017neural}, OPNN~\citep{qu2016product}, CIN~\citep{xu2021core}, AutoInt~\citep{song2019autoint}, 
    AFN~\citep{cheng2020adaptive},
    SAM~\citep{cheng2021looking}. They can model interactions higher than second-order.
    \item \textbf{Ensemble}: DCN~\citep{wang2017deep}, 
    DeepFM~\citep{guo2017deepfm}, xDeepFM~\citep{lian2018xdeepfm}, 
    DRIN~\citep{2022DRIN},
    MaskNet~\citep{wang2021masknet},
    DCN-V2~\citep{wang2021dcn},
    LightDIL~\citep{zhang2023reformulating},
    FINAL~\citep{zhu2023final},
    FinalMLP~\citep{mao2023finalmlp}, DELTA~\citep{zhu2024delta}, GDCN~\citep{wang2023towards}. These models adopt parallel or stacked structures to integrate different feature interaction methods. 
\end{enumerate}


