\section{Experiment Details}

% \subsection{Few-shot View Synthesis} We conduct our experiments in the few-shot setting across 3 datasets: the blender dataset~\cite{mildenhall2020nerf}, the DTU dataset~\cite{jensen2014large} and the LLFF dataset~\cite{mildenhall2019local}. There are many works focusing on the few-shot setting in different designed benchmarks, and it is hard to compare all of them in the same benchmark. To make a fair and comprehensive comparison, we adopt the setting from FreeNeRF~\cite{yang2023freenerf}.
% \textbf{Blender Dataset:} The Blender dataset~\cite{mildenhall2020nerf} has 8 synthetic scenes in total. We follow the data split used in DietNeRF~\cite{jain2021putting} to simulate a few-shot neural rendering scenario.
%  For each scene, the training images with IDs (counting from
% “0”) 26, 86, 2, 55, 75, 93, 16, 73, and 8 are used as the
% input views and 25 images are sampled evenly from the
% testing images for evaluation.
% \textbf{DTU dataset:} The DTU dataset~\cite{jensen2014large} is a large-scale multiview dataset that consists of 124 different scenes. PixelNeRF~\cite{yu2021pixelnerf} uses a split of 88 training scenes and 15 test
% scenes to study the pre-training or per-scene fine-tuning
% setting in a few-shot neural rendering scenario. We do not require pre-training as FreeNeRF. We
% follow~\cite{niemeyer2022regnerf} to optimize NeRF models directly on the 15 test
% scenes. The test scan IDs are: 8, 21, 30, 31, 34, 38, 40, 41,
% 45, 55, 63, 82, 103, 110, and 114. In each scan, the images
% with the following IDs (counting from “0”) are used as the
% input views: 25, 22, 28. The images with IDs in [1, 2, 9, 10,
% 11, 12, 14, 15, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 41,
% 42, 43, 45, 46, 47] serve as the novel views for evaluation.
% \textbf{LLFF dataset:}  The LLFF dataset~\cite{mildenhall2019local} is a forward-facing
% dataset that contains 8 scenes in total. Adhere to~\cite{mildenhall2020nerf,niemeyer2022regnerf},
% we use every 8-th image as the novel views for evaluation
% and evenly sample the input views across the remaining
% views.
\subsection{Sparse View Sampling}

\begin{table}[ht]
\centering
\begin{tabular}{c|cccccccc|c}
\toprule
% \textbf{Method} &\textbf{Object  }&\textbf{Object }&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\
% \textbf{Method} &\textbf{ PSNR $\uparrow$}&\textbf{Object SSIM $\uparrow$}&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\

% \textbf{Method} &\textbf{ Object PSNR $\uparrow$}&\textbf{Object SSIM $\uparrow$}&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\

\textbf{PSNR$\uparrow$}
&\textbf{hotdog}
&\textbf{lego}
&\textbf{chair}
&\textbf{drums}
&\textbf{ficus}
&\textbf{materials}
&\textbf{mic}
&\textbf{ship}
&\textbf{Avg.}
\\

\hline
NeRF + Rand &
22.19 &
19.85 &
19.99 &
10.93 &
18.13 &
8.73 &
17.85 &
15.31 &
16.62
\\
NeRF + FVS &
23.87 &
17.83 &
20.06 &
15.38 &
17.91 &
13.76 &
17.91 &
15.94 &
17.83 

\\
ActiveNeRF &
17.87 &
18.96 &
20.20 &
14.82 &
\textbf{22.55} &
\textbf{18.19} &
17.92 &
\textbf{19.34} &
18.73 

\\
Ours~(S$\rightarrow$P) &
\textbf{24.01} &
20.48 &
\textbf{26.21} &
16.78 &
18.49 &
13.95 &
17.57 &
13.95 &
18.93 
\\
Ours~(P$\rightarrow$S) &
23.14 &
\textbf{22.90} &
20.08 &
\textbf{17.96} &
20.99 &
15.16 &
\textbf{24.01} &
16.50 &
\textbf{20.09} \\
\hline
\hline
\textbf{SSIM$\uparrow$}
&\textbf{hotdog}
&\textbf{lego}
&\textbf{chair}
&\textbf{drums}
&\textbf{ficus}
&\textbf{materials}
&\textbf{mic}
&\textbf{ship}
&\textbf{Avg.}
\\

\hline
NeRF + Rand &
0.919 &
0.838 &
0.848 &
0.793 &
0.845 &
0.762 &
0.881 &
0.689 &
0.822 


\\
NeRF + FVS &
\textbf{0.922} &
0.798 &
0.853 &
0.776 &
0.838 &
0.776 &
0.879 &
0.706 &
0.819 



\\
ActiveNeRF &
0.860 &
0.829 &
0.858 &
0.768 &
\textbf{0.886} &
\textbf{0.813} &
0.876 &
0.716 &
0.826  

\\
Ours~(S$\rightarrow$P) &
0.918 &
\textbf{0.852} &
\textbf{0.898} &
0.793 &
0.848 &
0.789 &
0.883 &
\textbf{0.789} &
\textbf{0.846 }
 

\\
Ours~(P$\rightarrow$S) &
0.916 &
0.851 &
0.849 &
\textbf{0.814} &
0.859 &
0.812 &
\textbf{0.924} &
0.704 &
0.841 
\\

\hline
\hline
\textbf{LPIPS$\downarrow$}
&\textbf{hotdog}
&\textbf{lego}
&\textbf{chair}
&\textbf{drums}
&\textbf{ficus}
&\textbf{materials}
&\textbf{mic}
&\textbf{ship}
&\textbf{Avg.}
\\

\hline
NeRF + Rand &
0.089 &
0.152 &
0.165 &
0.231 &
0.152 &
0.241 &
0.138 &
0.317 &
0.186 
\\
NeRF + FVS &
\textbf{0.082} &
0.197 &
0.158 &
0.239 &
0.167 &
0.205 &
0.140 &
0.304 &
0.186 

\\
ActiveNeRF &
0.172 &
0.150 &
0.149 &
0.253 &
\textbf{0.116} &
\textbf{0.145} &
0.142 &
0.319 &
0.181 
 

\\
Ours~(S$\rightarrow$P) &
0.089 &
\textbf{0.135} &
\textbf{0.109} &
0.218 &
0.152 &
0.177 &
0.139 &
\textbf{0.177} &
\textbf{0.149 }

 

\\
Ours~(P$\rightarrow$S) &
0.099 &
0.153 &
0.165 &
\textbf{0.183} &
0.136 &
0.159 &
\textbf{0.093} &
0.306 &
0.162 
\\
\bottomrule
\end{tabular}
% \vspace{1mm}
\caption{\textbf{Quantitative comparison on Blender in Setting \uppercase\expandafter{\romannumeral1}.} We list the metric values for each object on Blender, corresponding to the results in Table~\ref{tab:active} of the manuscript.}
\label{tab:detail}
\end{table}

We conduct experiments in the active learning setting using the ActiveNeRF~\citep{pan2022activenerf} codebase. In the original NeRF~\citep{mildenhall2020nerf}, the model outputs a volume density $\sigma$ and a color value $c = (r, g, b)$ for a given position and direction. ActiveNeRF instead outputs both a mean and a variance, following a Gaussian distribution. For simplicity, we adopt the ActiveNeRF version and apply its pipeline to our baseline methods (\textit{NeRF+Random} and \textit{NeRF+FVS}) as well as to our proposed strategy. The primary modification we make is in the evaluation step, which is central to this active learning setting.

The original ActiveNeRF codebase only provides training configuration files for a portion of the LLFF dataset and the Blender dataset. We observe that, for the Blender dataset, the codebase uses a fixed number (20) of initial training samples, so the initial training set size cannot be chosen. We therefore modify it to allow the selection of the initial training set size, with the remaining images serving as a holdout set. For instance, in Setting \uppercase\expandafter{\romannumeral1}, for each object in the Blender dataset with 100 ordered images, we choose the first 4 images as the initial set and use the remaining 96 images as the holdout set. Due to excessive memory requirements, training on the LLFF dataset is not feasible even on a 48GB A40 GPU, so we do not conduct experiments on it for now. However, we believe that the results on the Blender dataset sufficiently validate our claims.

Due to the randomness of the strategy and potential variations in the training process, we run each experiment three times and report the average result. In Table~\ref{tab:detail}, we provide a detailed breakdown of the specific results for each object on Blender in Setting \uppercase\expandafter{\romannumeral1}.

\subsection{Few-shot View Synthesis}

\subsubsection{Dataset}

We conduct our experiments in the few-shot setting across three datasets: the Blender dataset~\citep{mildenhall2020nerf}, the DTU dataset~\citep{jensen2014large}, and the LLFF dataset~\citep{mildenhall2019local}. Many works focus on the few-shot setting using different benchmarks, making it challenging to compare all of them uniformly. To ensure a fair and comprehensive comparison, we adopt the settings from FreeNeRF~\citep{yang2023freenerf}. We conduct the experiments on a 48GB A40 GPU.

\textbf{Blender Dataset:} The Blender dataset~\citep{mildenhall2020nerf} comprises eight synthetic scenes. We follow the data split used in DietNeRF~\citep{jain2021putting} to simulate a few-shot neural rendering scenario. For each scene, the training images with IDs (counting from ``0'') 26, 86, 2, 55, 75, 93, 16, 73, and 8 are used as the input views, and 25 images are sampled evenly from the testing images for evaluation.

\textbf{DTU Dataset:} The DTU dataset~\citep{jensen2014large} is a large-scale multiview dataset consisting of 124 different scenes. PixelNeRF~\citep{yu2021pixelnerf} uses a split of 88 training scenes and 15 test scenes to study the pre-training or per-scene fine-tuning setting in a few-shot neural rendering scenario. Like FreeNeRF, we do not require pre-training. We follow RegNeRF~\citep{niemeyer2022regnerf} and optimize NeRF models directly on the 15 test scenes. The test scan IDs are 8, 21, 30, 31, 34, 38, 40, 41, 45, 55, 63, 82, 103, 110, and 114. In each scan, the images with the following IDs (counting from ``0'') are used as the input views: 25, 22, 28. The images with IDs in [1, 2, 9, 10, 11, 12, 14, 15, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 41, 42, 43, 45, 46, 47] serve as the novel views for evaluation.
According to FreeNeRF~\citep{yang2023freenerf}, the masks of the DTU dataset do not always help improve PSNR and SSIM, and the PSNR score on a specific scene sometimes drops considerably. For a fair comparison, we train a single model per scene and use it to produce the results in both the object and the full-image settings.

\textbf{LLFF Dataset:} The LLFF dataset~\citep{mildenhall2019local} is a forward-facing dataset containing eight scenes. Following prior work~\citep{mildenhall2020nerf, niemeyer2022regnerf}, we use every 8th image as a novel view for evaluation and evenly sample the input views from the remaining views.

\subsubsection{Experiment Results}
Figures~\ref{fig:dtu_all} and~\ref{fig:llff_all} present qualitative results on the DTU and LLFF datasets, respectively, corresponding to the quantitative results in Table~\ref{tab:blender}.


In our experiments, $L_{\text{micro}}$ represents the variance of the mean color value between training images and randomly rendered images, ensuring that the color difference is constrained within a certain range. This is based on Lemma~\ref{lem:rgb}, where we emphasize the color difference between images. $L_{\text{micro}}$ is not limited to this form and can be instantiated with other measures, such as the KL-divergence in color, which can also achieve similar performance.

Similarly, $L_{\text{macro}}$ is not restricted to using CLIP. Other models such as DINO~\citep{caron2021emerging} or BLIP~\citep{li2022blip} can also extract semantic features for our framework.

Our framework is flexible and can incorporate various forms of regularization terms related to semantic space distance or pixel space distance, allowing for broad applicability and adaptability.

\begin{figure}[h]
    \centering
    \includegraphics[width=1\linewidth]{Figures/dtu_all.pdf}

    \caption{\textbf{Example of our results with 3 input views on the DTU dataset.}}
    \label{fig:dtu_all}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=1\linewidth]{Figures/llff_all.pdf}

    \caption{\textbf{Example of our results with 3 input views on the LLFF dataset.}}
    \label{fig:llff_all}
\end{figure}

\subsubsection{Limitations on Baselines}
FreeNeRF is a strong baseline that achieves state-of-the-art performance across many datasets when compared to methods using priors from diffusion models. We draw this conclusion from the experimental results reported in ReconFusion~\citep{wu2023reconfusion}.
Therefore,
it is worthwhile to extend our comparison to diffusion-based methods such as SparseFusion~\citep{zhou2022sparsefusion} or ReconFusion~\citep{wu2023reconfusion}.

SparseFusion's evaluation is currently limited to the CO3D dataset~\citep{reizenstein2021common}, and it reports no results on the three popular and classical datasets that we use to stay consistent with FreeNeRF: the Blender dataset, the DTU dataset, and the LLFF dataset. Fair evaluations of SparseFusion on these datasets are absent, and addressing this gap would require significant additional time, which would divert from our primary research focus. Nonetheless, the datasets we employ are robust and widely accepted in NeRF research, and they provide sufficient support for our experiments, with numerous baseline results available for reference.

Additionally, the lack of open-source code for ReconFusion limits our ability to apply custom regularization terms or conduct meaningful comparisons. Future work should aim to incorporate additional baseline methods and explore further variations within our framework.

% \section{Guiding Sampling under a Limited Budget}


% \section{Regularization Term for Few-shot View Synthesis on DTU} \label{sec:dtu}


% \begin{figure}[h]
%     \centering
%     \includegraphics[width=1\linewidth]{Figures/unseen.png}

%     \caption{\textbf{Example of our results with 3 input views on the llff dataset.}}
%     \label{fig:dtu_all}
% \end{figure}


% \begin{table}[ht!]
% \centering
% \begin{tabular}{c|c c|c c}
% \toprule
% % \textbf{Method} &\textbf{Object  }&\textbf{Object }&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\
% % \textbf{Method} &\textbf{ PSNR $\uparrow$}&\textbf{Object SSIM $\uparrow$}&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\

% % \textbf{Method} &\textbf{ Object PSNR $\uparrow$}&\textbf{Object SSIM $\uparrow$}&\textbf{Full-image PSNR $\uparrow$}&\textbf{Full-image SSIM $\uparrow$}\\

%  & \multicolumn{2}{c|}{\textbf{Object} } & \multicolumn{2}{c}{\textbf{Full-image} }\\

% \textbf{Method} &\textbf{  PSNR $\uparrow$}&\textbf{ SSIM $\uparrow$}&\textbf{ PSNR $\uparrow$}&\textbf{ SSIM $\uparrow$}\\

% \hline
% \hline
% SRF \cite{chibane2021stereo}
% & 15.32    
% & 0.671 
% & 15.84
% & 0.532
% \\

% PixelNeRF \cite{yu2021pixelnerf}
% & 16.82      
% & 0.695
% & \textbf{18.74}
% & 0.618\\
% MVSNeRF \cite{chen2021mvsnerf}
% &  18.63     
% & 0.769  
% & 16.33
% & 0.602\\
% \hline
% \hline
% SRF ft \cite{chibane2021stereo}
% & 15.68 
% & 0.698 
% & 16.06
% & 0.550 \\

% PixelNeRF ft \cite{yu2021pixelnerf}
% & 18.95    
% &0.710 
% &17.38
% & 0.548\\
% MVSNeRF ft \cite{chen2021mvsnerf}
% & 18.54       
% &0.769   
% &16.26
% &0.601\\
% \hline
% \hline
% Mip-NeRF \cite{barron2021mip}
% & 8.68   
% &0.571  
% &7.64
% &0.227\\

% DietNeRF \cite{jain2021putting}
% & 11.85     
% &0.633  
% &10.01
% &0.354\\
% RegNeRF \cite{niemeyer2022regnerf}
% & 18.89      
% & 0.745   
% & 15.33 
% & 0.621\\
% \hline
% \hline
% mip-NeRF concat. (repro.) 
% & 9.10     
% & 0.578  
% & 7.94
% & 0.235\\

% RegNeRF concat. (repro.)
% & 18.50    
% &0.744 
% &15.00
% &0.606\\

% FreeNeRF\cite{yang2023freenerf} 
% & 19.92     
% &\textbf{ 0.787 }
% & 18.02
% & 0.680\\
% \hline
% \hline
% \textbf{FreeNeRF + Ours} 
% & \textbf{20.13(+0.21)}    
% & \textbf{0.787}
% &\textbf{ 18.35(+0.33)}
% &\textbf{ 0.677(-0.003)}
% \\
% \bottomrule
% \end{tabular}
% \caption{\textbf{Quantitative comparison on DTU.} We follow the experiment setting in FreeNeRF and present the PSNR and SSIM scores of foreground objects and full images. Compared with FreeNeRF and other baselines, We can observe that ours based on FreeNeRF can better synthesize foreground objects and full images, especially in PSNR.} 
% \label{tab:dtu_supplement}
% \end{table}

% In the 3-view setting, we also conduct additional experiments on the DTU dataset\cite{jensen2014large} following the setting of FreeNeRF. It contains 124 scenes and we follow \cite{niemeyer2022regnerf} to optimize NeRF models directly on the 15 test
% scenes. The test scan IDs are: 8, 21, 30, 31, 34, 38, 40, 41,
% 45, 55, 63, 82, 103, 110, and 114. In each scan, the images
% with the following IDs (counting from “0”) are used as the
% input views: 25, 22, 28, 40, 44, 48, 0, 8, 13. We use the first 3 images as the input views in a 3-view setting. The 25 images with IDs in [1, 2, 9, 10,11, 12, 14, 15, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 41,
% 42, 43, 45, 46, 47] serve as the novel views for evaluation. We follow  \cite{niemeyer2022regnerf,yu2021pixelnerf} to use a 4× downsampled resolution,
% resulting in 300 × 400 pixels for each image.

% Table \ref{tab:dtu_supplement} and Figure \ref{fig:dtu_all} show quantitative and qualitative results on the DTU dataset.  
% We find that masks of the DTU dataset do not always help improve PSNR and SSIM and sometimes the PSNR score in a specific scene drops a lot. For a fair comparison, we train one model for one scene to produce the results in the object and full-image setting at the same time.
% Transfer learning-based methods \cite{chibane2021stereo, yu2021pixelnerf, chen2021mvsnerf} that require
% expensive pre-training underperform ours in almost all settings, except the full-image PSNR score of \cite{yu2021pixelnerf}. This may
% be due to the bias introduced by the white table and black
% background present in many scenes in the DTU dataset. Compared with FreeNeRF, our method can get better performance in the full-image setting.



% \section{Additional discussions on loss functions}\label{theory}
% Notice that instead of analyzing the original loss function $\mathcal{L}_{}(\bm{\theta})$, we propose the new framework to analyze the new loss function $\mathbb{E}_{\bm{t}\sim P^F} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), Y_i) +   \mathbb{E}_{\bm{t}\sim P^{CF}} \mathcal{L}(f_{\bm{\theta}^{}}(\bm{d}_i), Y_i)$. Since we have $\mathcal{L}_{} (\bm{\theta})
% \leq K_1 [\mathbb{E}_{\bm{t}\sim P^F} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i) , Y_i) +   \mathbb{E}_{\bm{t}\sim P^{CF}} \mathcal{L}(f_{\bm{\theta}^{}}(\bm{d}_i) , Y_i)]-K_2
% \label{two_loss}$, where $K_1, K_2$ are two bounded constants. On this basis, we just need to control the upper bound in order to control the real ideal bound. The advantage is that the original ideal bound is difficult to train directly in theory, but when we make appropriate decomposition, we can observe three key terms (see the main paper) that affect the loss and extract insights from them to guide our sampling and training.


% For $\mathbb{E}_{\bm{t}\sim P^F} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i) , Y_i) +   \mathbb{E}_{\bm{t}\sim P^{CF}} \mathcal{L}(f_{\bm{\theta}^{}}(\bm{d}_i), Y_i)$, it denotes the sum of factual loss and conterfactual loss. For the inequality, we refer readers to its prototype to get some insights as in~\cite{shalit2017estimating}. In brief, we can observe that

% \begin{equation}
% \begin{aligned}
% \mathbb{E}_{\bm{t}\sim P^F} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i) , Y_i)  = \sum\limits_{\bm{t} \sim P^F } \sum\limits_{i} [\mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), c_i(\bm{d}_i)) \mathbb{I}(t_i = 1) + \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), f_{\bm{\theta_t^s}}(\bm{d}_i)) \mathbb{I}(t_i = 0)].\\
% \mathbb{E}_{\bm{t}\sim P^{CF}} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i) , Y_i)  = \sum\limits_{\bm{t} \sim P^F } \sum\limits_{i} [\mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), c_i(\bm{d}_i)) \mathbb{I}(t_i = 0) + \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), f_{\bm{\theta_{\bm{1} - \bm{t}}^s}}(\bm{d}_i)) \mathbb{I}(t_i = 1)].
% \end{aligned}
% \end{equation}
%  Each equation contains two terms. Notice that the second term is different from the first term via substituting $c_i(\bm{d}_i)$ to $_i = 0) + \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i), f_{\bm{\theta_{\bm{1} - \bm{t}}^s}}(\bm{d}_i)$. These two items are similar and their loss can be apriori bounded in real-world experiments.
 
 
%  $\mathcal{L}_{ideal} (\bm{\theta})
% \leq K_1 [\mathbb{E}_{\bm{t}\sim P^F} \mathcal{L}(f_{\bm{\theta}}(\bm{d}_i) , Y_i) +   \mathbb{E}_{\bm{t}\sim P^{CF}} \mathcal{L}(f_{\bm{\theta}^{}}(\bm{d}_i) , Y_i)]-K_2
% \label{two_loss}$, where $K_1, K_2$ are two bounded constants. 

